Diffstat (limited to 'crawler')
-rw-r--r--  crawler/Algorithms.php       | 142
-rw-r--r--  crawler/CrawlController.php  |  63
-rw-r--r--  crawler/WebRequest.php       |  71
-rw-r--r--  crawler/crawler.php          |   5
4 files changed, 0 insertions, 281 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
deleted file mode 100644
index 6c8d513..0000000
--- a/crawler/Algorithms.php
+++ /dev/null
@@ -1,142 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-
-/**
- * User: Marvin Borner
- * Date: 16/09/2018
- * Time: 21:51
- */
-
-require_once 'CrawlController.php';
-
-class Algorithms
-{
-    public static function getUrlInfo($path): array
-    {
-        $urlInfo = [];
-
-        $urlInfo['title'] = strip_tags($path->query('//title')[0]->textContent);
-        foreach ($path->query('//html') as $language) {
-            $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
-        }
-        foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
-            $urlInfo['description'] = strip_tags($description->getAttribute('content'));
-        }
-
-        // Fix empty information
-        if (!isset($urlInfo['description'])) {
-            $urlInfo['description'] = '';
-            foreach ($path->query('//p') as $text) {
-                if (mb_strlen($urlInfo['description']) < 350) {
-                    $urlInfo['description'] .= $text->textContent . ' ';
-                }
-            }
-        }
-        if (empty($urlInfo['title'])) {
-            $urlInfo['title'] = '';
-            if (mb_strlen($urlInfo['title']) < 350) {
-                $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
-            }
-        }
-
-        print "\t\e[92mFound data: " . $urlInfo['title'] . "\n";
-
-        return $urlInfo;
-    }
-
-    public static function getLinks($path): array
-    {
-        $allLinks = [];
-
-        foreach ($path->query('//a') as $link) {
-            $linkHref = $link->getAttribute('href');
-            $href = self::cleanUrl($linkHref);
-            $allLinks[] = $href;
-        }
-
-        return array_unique($allLinks);
-    }
-
-    public static function createPathFromHtml($content): \DOMXPath
-    {
-        $dom = new DOMDocument();
-        libxml_use_internal_errors(true);
-        $dom->loadHTML($content);
-        libxml_use_internal_errors(false);
-        return new DOMXPath($dom);
-    }
-
-    public static function cleanUrl($url): string
-    {
-        $newUrl = self::fixEncoding(ltrim($url)); // trim whitespaces
-
-        // normally only for links/href
-        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || mb_strpos($newUrl, 'http') !== 0) {
-            if (mb_strpos($newUrl, 'www') === 0) {
-                $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
-            } else if (mb_strpos($newUrl, 'javascript:') === 0 || mb_strpos($newUrl, 'mailto') === 0) {
-                $newUrl = CrawlController::$currentlyCrawled; // fixes javascript void links
-            } else if (mb_strpos($newUrl, '../') === 0) {
-                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
-                $backCount = mb_substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
-                if ($backCount >= 1) {
-                    $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
-                }
-            } else if (mb_strpos($newUrl, '/') === 0) {
-                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
-                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
-            } else {
-                $newUrl = '/' . CrawlController::$currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
-            }
-        }
-
-        // if it's pure domain without slash (prevents duplicate domains because of slash)
-        if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
-            $newUrl .= '/';
-        }
-
-        // strip some things
-        $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
-        $newUrl = self::mb_strtok($newUrl, '?'); // parameters
-        $newUrl = self::mb_strtok($newUrl, '#'); // hash fragments
-
-        if (mb_strpos($newUrl, '/') === 0) {
-            $newUrl = mb_substr($newUrl, 1); // remove first slash from domain, which could have been added
-        }
-
-        if ($url !== $newUrl) {
-            print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
-        }
-
-        return $newUrl;
-    }
-
-    private static function fixEncoding($text): string
-    {
-        return iconv(mb_detect_encoding($text, mb_detect_order(), true), 'UTF-8', $text);
-    }
-
-    private static function mb_strtok($str, $delimiters)
-    {
-        $pos = 0;
-        $string = $str;
-
-        $token = '';
-
-        while ($pos < mb_strlen($string)) {
-            $char = mb_substr($string, $pos, 1);
-            $pos++;
-            if (mb_strpos($delimiters, $char) === FALSE) {
-                $token .= $char;
-            } else if ($token !== '') {
-                return $token;
-            }
-        }
-
-        if ($token !== '') {
-            return $token;
-        }
-
-        return false;
-    }
-}
\ No newline at end of file
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
deleted file mode 100644
index 5b20b75..0000000
--- a/crawler/CrawlController.php
+++ /dev/null
@@ -1,63 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-/**
- * User: Marvin Borner
- * Date: 14/09/2018
- * Time: 23:48
- */
-
-require_once '../database/mysqlConf.inc';
-require_once '../database/Database.php';
-require_once 'WebRequest.php';
-require_once 'Algorithms.php';
-
-class CrawlController
-{
-    public static $currentlyCrawled;
-
-    public static function start($url = '')
-    {
-        set_time_limit(3600000);
-
-        self::$currentlyCrawled = $url;
-
-        while (true) {
-            self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
-        }
-    }
-
-    private static function crawl($url)
-    {
-        if ($url !== '' && Database::alreadyCrawled($url)) {
-            Database::removeFromQueue(self::$currentlyCrawled);
-            self::$currentlyCrawled = Database::getFromQueue('DESC');
-        } else {
-            $requestResponse = WebRequest::getContent($url);
-            if ($requestResponse) {
-                self::$currentlyCrawled = $requestResponse[3];
-                if (preg_match('/2\d\d/', $requestResponse[1])) { // success
-                    print 'Download Size: ' . $requestResponse[2];
-
-                    $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
-
-                    $urlInfo = Algorithms::getUrlInfo($htmlPath);
-                    Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
-
-                    $allLinks = Algorithms::getLinks($htmlPath);
-                    Database::insertIntoQueue($allLinks);
-
-                    Database::removeFromQueue(self::$currentlyCrawled);
-                    self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
-                    print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
-                } else {
-                    print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
-
-                    Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
-                    Database::removeFromQueue(self::$currentlyCrawled);
-                    self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
-                    print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
-                }
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
deleted file mode 100644
index 6053bae..0000000
--- a/crawler/WebRequest.php
+++ /dev/null
@@ -1,71 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-
-/**
- * User: Marvin Borner
- * Date: 16/09/2018
- * Time: 21:53
- */
-class WebRequest
-{
-    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
-
-    public static function getContent($url)
-    {
-        if (self::checkRobotsTxt($url)) {
-            $curl = curl_init($url);
-            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
-            curl_setopt($curl, CURLOPT_ENCODING, '');
-            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
-            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
-            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
-            curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
-            $content = curl_exec($curl);
-            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
-            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
-            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
-            curl_close($curl);
-
-            return [$content, $responseCode, $downloadSize, $updatedUrl];
-        }
-
-        return false;
-    }
-
-    public static function checkRobotsTxt($url): bool
-    {
-        $userAgent = self::$userAgent;
-        $parsed = parse_url($url);
-        $agents = array(preg_quote('*', NULL));
-        if ($userAgent) {
-            $agents[] = preg_quote($userAgent, NULL);
-        }
-        $agents = implode('|', $agents);
-        $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
-        if (empty($robotsTxt)) {
-            return true;
-        }
-        $rules = array();
-        $ruleApplies = false;
-        foreach ($robotsTxt as $line) {
-            if (!$line = trim($line)) {
-                continue;
-            }
-            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
-                $ruleApplies = preg_match("/($agents)/i", $match[1]);
-            }
-            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
-                if (!$regs[1]) {
-                    return true;
-                }
-                $rules[] = preg_quote(trim($regs[1]), '/');
-            }
-        }
-        foreach ($rules as $rule) {
-            if (preg_match("/^$rule/", $parsed['path'])) {
-                return false;
-            }
-        }
-        return true;
-    }
-}
\ No newline at end of file
diff --git a/crawler/crawler.php b/crawler/crawler.php
deleted file mode 100644
index 1e121e4..0000000
--- a/crawler/crawler.php
+++ /dev/null
@@ -1,5 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-require_once 'CrawlController.php';
-
-CrawlController::start($argv[1]);
\ No newline at end of file