Diffstat (limited to 'crawler/WebRequest.php')
-rw-r--r-- | crawler/WebRequest.php | 74
1 files changed, 58 insertions, 16 deletions
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
index f25f31d..6053bae 100644
--- a/crawler/WebRequest.php
+++ b/crawler/WebRequest.php
@@ -1,29 +1,71 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
+
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
  * Time: 21:53
  */
-
 class WebRequest
 {
-    public static function getContent($url): array
+    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
+
+    public static function getContent($url)
     {
-        $curl = curl_init($url);
-        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
-        curl_setopt($curl, CURLOPT_ENCODING, '');
-        curl_setopt($curl, CURLOPT_TIMEOUT, 5);
-        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
-        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
-        curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
-        $content = curl_exec($curl);
-        $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
-        $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
-        if (preg_match('~Location: (.*)~i', $content, $match)) {
-            $updatedUrl = trim($match[1]); // update url on 301/302
+        if (self::checkRobotsTxt($url)) {
+            $curl = curl_init($url);
+            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
+            curl_setopt($curl, CURLOPT_ENCODING, '');
+            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
+            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
+            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
+            curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
+            $content = curl_exec($curl);
+            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
+            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
+            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
+            curl_close($curl);
+
+            return [$content, $responseCode, $downloadSize, $updatedUrl];
         }
-        curl_close($curl);
 
-        return [$content, $responseCode, $downloadSize, $updatedUrl ?? $url];
+        return false;
+    }
+
+    public static function checkRobotsTxt($url): bool
+    {
+        $userAgent = self::$userAgent;
+        $parsed = parse_url($url);
+        $agents = array(preg_quote('*', NULL));
+        if ($userAgent) {
+            $agents[] = preg_quote($userAgent, NULL);
+        }
+        $agents = implode('|', $agents);
+        $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
+        if (empty($robotsTxt)) {
+            return true;
+        }
+        $rules = array();
+        $ruleApplies = false;
+        foreach ($robotsTxt as $line) {
+            if (!$line = trim($line)) {
+                continue;
+            }
+            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
+                $ruleApplies = preg_match("/($agents)/i", $match[1]);
+            }
+            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
+                if (!$regs[1]) {
+                    return true;
+                }
+                $rules[] = preg_quote(trim($regs[1]), '/');
+            }
+        }
+        foreach ($rules as $rule) {
+            if (preg_match("/^$rule/", $parsed['path'])) {
+                return false;
+            }
+        }
+        return true;
     }
 }
\ No newline at end of file
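
For context, a minimal usage sketch of the class after this change (not part of the commit; the require path and the example URL are assumptions). getContent() now returns false when robots.txt disallows the URL, and otherwise an array whose last element is the effective URL reported by cURL after redirects.

<?php
// Hypothetical caller, e.g. a crawl loop elsewhere in the crawler/ directory.
require_once __DIR__ . '/WebRequest.php'; // assumed include path

$result = WebRequest::getContent('https://example.com/'); // example URL

if ($result === false) {
    // checkRobotsTxt() returned false, so the page was never requested
    echo "Skipped: disallowed by robots.txt\n";
} else {
    list($content, $responseCode, $downloadSize, $updatedUrl) = $result;
    echo "HTTP $responseCode, $downloadSize";          // $downloadSize already ends in "KB\n"
    echo "Effective URL after redirects: $updatedUrl\n"; // from CURLINFO_EFFECTIVE_URL
    echo strlen($content) . " bytes of body fetched\n";
}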