summary refs log tree commit diff
path: root/crawler/WebRequest.php
diff options
context:
space:
mode:
Diffstat (limited to 'crawler/WebRequest.php')
-rw-r--r--  crawler/WebRequest.php  74
1 file changed, 58 insertions(+), 16 deletions(-)
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
index f25f31d..6053bae 100644
--- a/crawler/WebRequest.php
+++ b/crawler/WebRequest.php
@@ -1,29 +1,71 @@
<?php
+header('Content-type: text/plain; charset=utf-8');
+
/**
* User: Marvin Borner
* Date: 16/09/2018
* Time: 21:53
*/
-
class WebRequest
{
- public static function getContent($url): array
+ private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
+
+ public static function getContent($url)
{
- $curl = curl_init($url);
- curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
- curl_setopt($curl, CURLOPT_ENCODING, '');
- curl_setopt($curl, CURLOPT_TIMEOUT, 5);
- curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
- curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
- $content = curl_exec($curl);
- $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
- $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
- if (preg_match('~Location: (.*)~i', $content, $match)) {
- $updatedUrl = trim($match[1]); // update url on 301/302
+ if (self::checkRobotsTxt($url)) {
+ $curl = curl_init($url);
+ curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
+ curl_setopt($curl, CURLOPT_ENCODING, '');
+ curl_setopt($curl, CURLOPT_TIMEOUT, 5);
+ curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
+ curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
+ $content = curl_exec($curl);
+ $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
+ $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
+ $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
+ curl_close($curl);
+
+ return [$content, $responseCode, $downloadSize, $updatedUrl];
}
- curl_close($curl);
- return [$content, $responseCode, $downloadSize, $updatedUrl ?? $url];
+ return false;
+ }
+
+ public static function checkRobotsTxt($url): bool
+ {
+ $userAgent = self::$userAgent;
+ $parsed = parse_url($url);
+ $agents = array(preg_quote('*', NULL));
+ if ($userAgent) {
+ $agents[] = preg_quote($userAgent, NULL);
+ }
+ $agents = implode('|', $agents);
+ $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
+ if (empty($robotsTxt)) {
+ return true;
+ }
+ $rules = array();
+ $ruleApplies = false;
+ foreach ($robotsTxt as $line) {
+ if (!$line = trim($line)) {
+ continue;
+ }
+ if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
+ $ruleApplies = preg_match("/($agents)/i", $match[1]);
+ }
+ if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
+ if (!$regs[1]) {
+ return true;
+ }
+ $rules[] = preg_quote(trim($regs[1]), '/');
+ }
+ }
+ foreach ($rules as $rule) {
+ if (preg_match("/^$rule/", $parsed['path'])) {
+ return false;
+ }
+ }
+ return true;
}
} \ No newline at end of file