author | Marvin Borner | 2018-09-19 17:43:33 +0200
committer | Marvin Borner | 2018-09-19 17:43:33 +0200
commit | 4f751868837a0f78423c927c6cd4aeb24bf37c00 (patch)
tree | 25b3462724a03bf0081fef410f2484d35bf43fcf
parent | 4a57c1fda6d50d655eb48ffb712bd43729ea5c10 (diff)
Finally fixed almost all bugs :zap: :fire:
-rw-r--r-- | crawler/Algorithms.php | 79
-rw-r--r-- | crawler/CrawlController.php | 57
-rw-r--r-- | crawler/Database.php | 3
-rw-r--r-- | crawler/WebRequest.php | 74
-rw-r--r-- | crawler/crawler.php | 1
5 files changed, 148 insertions, 66 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
index 73d2ecc..6c8d513 100644
--- a/crawler/Algorithms.php
+++ b/crawler/Algorithms.php
@@ -1,10 +1,14 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
+
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
  * Time: 21:51
  */
 
+require_once 'CrawlController.php';
+
 class Algorithms
 {
     public static function getUrlInfo($path): array
@@ -23,14 +27,14 @@ class Algorithms
         if (!isset($urlInfo['description'])) {
             $urlInfo['description'] = '';
             foreach ($path->query('//p') as $text) {
-                if (strlen($urlInfo['description']) < 350) {
+                if (mb_strlen($urlInfo['description']) < 350) {
                     $urlInfo['description'] .= $text->textContent . ' ';
                 }
             }
         }
         if (empty($urlInfo['title'])) {
             $urlInfo['title'] = '';
-            if (strlen($urlInfo['title']) < 350) {
+            if (mb_strlen($urlInfo['title']) < 350) {
                 $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
             }
         }
@@ -46,10 +50,8 @@ class Algorithms
         foreach ($path->query('//a') as $link) {
             $linkHref = $link->getAttribute('href');
-            if ($linkHref !== 'javascript:void(0)') {
-                $href = self::cleanUrl($linkHref);
-                $allLinks[] = $href;
-            }
+            $href = self::cleanUrl($linkHref);
+            $allLinks[] = $href;
         }
 
         return array_unique($allLinks);
@@ -66,25 +68,25 @@ class Algorithms
     public static function cleanUrl($url): string
     {
-        global $currentlyCrawled;
-
-        $newUrl = ltrim($url); // trim whitespaces
+        $newUrl = self::fixEncoding(ltrim($url)); // trim whitespaces
 
         // normally only for links/href
-        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || (strpos($newUrl, 'http') !== 0)) {
-            if (strpos($newUrl, 'www') === 0) {
+        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || mb_strpos($newUrl, 'http') !== 0) {
+            if (mb_strpos($newUrl, 'www') === 0) {
                 $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
-            } else if (strpos($newUrl, 'javascript:') === 0) {
-                $newUrl = ''; // fixes javascript void links
-            } else if (strpos($newUrl, '../') === 0) {
-                $parsedUrl = parse_url($currentlyCrawled);
-                $backCount = substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
-                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
-            } else if (strpos($newUrl, '/') === 0) {
-                $parsedUrl = parse_url($currentlyCrawled);
+            } else if (mb_strpos($newUrl, 'javascript:') === 0 || mb_strpos($newUrl, 'mailto') === 0) {
+                $newUrl = CrawlController::$currentlyCrawled; // fixes javascript void links
+            } else if (mb_strpos($newUrl, '../') === 0) {
+                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
+                $backCount = mb_substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
+                if ($backCount >= 1) {
+                    $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
+                }
+            } else if (mb_strpos($newUrl, '/') === 0) {
+                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
                 $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
             } else {
-                $newUrl = $currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
+                $newUrl = '/' . CrawlController::$currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
             }
         }
@@ -95,8 +97,12 @@ class Algorithms
         // strip some things
         $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
-        $newUrl = strtok($newUrl, '?'); // parameters
-        $newUrl = strtok($newUrl, '#'); // hash fragments
+        $newUrl = self::mb_strtok($newUrl, '?'); // parameters
+        $newUrl = self::mb_strtok($newUrl, '#'); // hash fragments
+
+        if (mb_strpos($newUrl, '/') === 0) {
+            $newUrl = mb_substr($newUrl, 1); // remove first slash from domain, which could have been added
+        }
 
         if ($url !== $newUrl) {
             print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
@@ -104,4 +110,33 @@ class Algorithms
 
         return $newUrl;
     }
+
+    private static function fixEncoding($text): string
+    {
+        return iconv(mb_detect_encoding($text, mb_detect_order(), true), 'UTF-8', $text);
+    }
+
+    private static function mb_strtok($str, $delimiters)
+    {
+        $pos = 0;
+        $string = $str;
+
+        $token = '';
+
+        while ($pos < mb_strlen($string)) {
+            $char = mb_substr($string, $pos, 1);
+            $pos++;
+            if (mb_strpos($delimiters, $char) === FALSE) {
+                $token .= $char;
+            } else if ($token !== '') {
+                return $token;
+            }
+        }
+
+        if ($token !== '') {
+            return $token;
+        }
+
+        return false;
+    }
 }
\ No newline at end of file
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
index 97edf25..53d5aac 100644
--- a/crawler/CrawlController.php
+++ b/crawler/CrawlController.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 /**
  * User: Marvin Borner
  * Date: 14/09/2018
@@ -12,48 +13,50 @@ require_once 'Algorithms.php';
 
 class CrawlController
 {
-    private static $currentlyCrawled;
+    public static $currentlyCrawled;
 
     public static function start($url = '')
     {
         set_time_limit(3600000);
-        error_reporting(E_ERROR | E_PARSE);
+
+        self::$currentlyCrawled = $url;
 
         while (true) {
-            self::$currentlyCrawled = $url;
-            self::crawl(self::$currentlyCrawled);
+            self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
         }
     }
 
     private static function crawl($url)
     {
-        if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
+        if ($url !== '' && Database::alreadyCrawled($url)) {
             Database::removeFromQueue(self::$currentlyCrawled);
             self::$currentlyCrawled = Database::getFromQueue('DESC');
         } else {
             $requestResponse = WebRequest::getContent($url);
-            self::$currentlyCrawled = $requestResponse[3];
-            if (preg_match('/2\d\d/', $requestResponse[1])) { // success
-                print 'Download Size: ' . $requestResponse[2];
-
-                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
-
-                $urlInfo = Algorithms::getUrlInfo($htmlPath);
-                Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
-
-                $allLinks = Algorithms::getLinks($htmlPath);
-                Database::insertIntoQueue($allLinks);
-
-                Database::removeFromQueue(self::$currentlyCrawled);
-                self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
-                print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
-            } else {
-                print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
-
-                Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
-                Database::removeFromQueue(self::$currentlyCrawled);
-                self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
-                print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+            if ($requestResponse) {
+                self::$currentlyCrawled = $requestResponse[3];
+                if (preg_match('/2\d\d/', $requestResponse[1])) { // success
+                    print 'Download Size: ' . $requestResponse[2];
+
+                    $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+
+                    $urlInfo = Algorithms::getUrlInfo($htmlPath);
+                    Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
+
+                    $allLinks = Algorithms::getLinks($htmlPath);
+                    Database::insertIntoQueue($allLinks);
+
+                    Database::removeFromQueue(self::$currentlyCrawled);
+                    self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+                    print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
+                } else {
+                    print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
+
+                    Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
+                    Database::removeFromQueue(self::$currentlyCrawled);
+                    self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+                    print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+                }
             }
         }
     }
diff --git a/crawler/Database.php b/crawler/Database.php
index 0d500ad..f27803e 100644
--- a/crawler/Database.php
+++ b/crawler/Database.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
@@ -41,7 +42,7 @@ class Database
 
     public static function alreadyCrawled($url): bool
     {
-        print "\t\e[96mChecking if url already has been crawled " . $url . "\n";
+        print "\t\e[96mChecking if url has already been crawled " . $url . "\n";
         $hash = md5($url);
         $conn = self::initDbConnection();
         $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
index f25f31d..6053bae 100644
--- a/crawler/WebRequest.php
+++ b/crawler/WebRequest.php
@@ -1,29 +1,71 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
+
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
  * Time: 21:53
  */
-
 class WebRequest
 {
-    public static function getContent($url): array
+    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
+
+    public static function getContent($url)
     {
-        $curl = curl_init($url);
-        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
-        curl_setopt($curl, CURLOPT_ENCODING, '');
-        curl_setopt($curl, CURLOPT_TIMEOUT, 5);
-        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
-        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
-        curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
-        $content = curl_exec($curl);
-        $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
-        $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
-        if (preg_match('~Location: (.*)~i', $content, $match)) {
-            $updatedUrl = trim($match[1]); // update url on 301/302
+        if (self::checkRobotsTxt($url)) {
+            $curl = curl_init($url);
+            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
+            curl_setopt($curl, CURLOPT_ENCODING, '');
+            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
+            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
+            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
+            curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
+            $content = curl_exec($curl);
+            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
+            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
+            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
+            curl_close($curl);
+
+            return [$content, $responseCode, $downloadSize, $updatedUrl];
         }
-        curl_close($curl);
-        return [$content, $responseCode, $downloadSize, $updatedUrl ?? $url];
+
+        return false;
+    }
+
+    public static function checkRobotsTxt($url): bool
+    {
+        $userAgent = self::$userAgent;
+        $parsed = parse_url($url);
+        $agents = array(preg_quote('*', NULL));
+        if ($userAgent) {
+            $agents[] = preg_quote($userAgent, NULL);
+        }
+        $agents = implode('|', $agents);
+        $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
+        if (empty($robotsTxt)) {
+            return true;
+        }
+        $rules = array();
+        $ruleApplies = false;
+        foreach ($robotsTxt as $line) {
+            if (!$line = trim($line)) {
+                continue;
+            }
+            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
+                $ruleApplies = preg_match("/($agents)/i", $match[1]);
+            }
+            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
+                if (!$regs[1]) {
+                    return true;
+                }
+                $rules[] = preg_quote(trim($regs[1]), '/');
+            }
+        }
+        foreach ($rules as $rule) {
+            if (preg_match("/^$rule/", $parsed['path'])) {
+                return false;
+            }
+        }
+        return true;
     }
 }
\ No newline at end of file
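WebRequest now consults the host's robots.txt before downloading a page and takes the final redirect target from CURLINFO_EFFECTIVE_URL instead of grepping Location headers out of the response body. A rough sketch of the same Disallow prefix-matching idea, applied to an in-memory robots.txt rather than a live fetch (not the commit's code; the helper name, rules and paths are invented for illustration):

<?php
// Illustrative sketch only: decide whether a path may be crawled by
// honouring the Disallow rules of the wildcard (*) user-agent group.
function isPathAllowed(string $path, array $robotsTxtLines): bool
{
    $disallowed = [];
    $groupApplies = false;
    foreach ($robotsTxtLines as $line) {
        $line = trim($line);
        if (preg_match('/^User-agent:\s*(.*)/i', $line, $m)) {
            $groupApplies = ($m[1] === '*'); // this sketch only honours the wildcard group
        }
        if ($groupApplies && preg_match('/^Disallow:\s*(.+)/i', $line, $m)) {
            $disallowed[] = trim($m[1]);
        }
    }
    foreach ($disallowed as $prefix) {
        if (strpos($path, $prefix) === 0) { // Disallow rules match by path prefix
            return false;
        }
    }
    return true;
}

$robots = ['User-agent: *', 'Disallow: /private/'];
var_dump(isPathAllowed('/private/page.html', $robots)); // bool(false)
var_dump(isPathAllowed('/index.html', $robots));        // bool(true)

The commit's checkRobotsTxt additionally matches the crawler's own user-agent string against each group and treats a missing or empty robots.txt as "allow everything".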
diff --git a/crawler/crawler.php b/crawler/crawler.php
index b5df1dc..1e121e4 100644
--- a/crawler/crawler.php
+++ b/crawler/crawler.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 require_once 'CrawlController.php';
 
 CrawlController::start($argv[1]);
\ No newline at end of file
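The entry point itself only gains the Content-type header, so the crawler is still started from the command line with a seed URL as its first argument, for example (run from the repository root; the URL is hypothetical):

php crawler/crawler.php 'https://www.example.com/'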