diff options
Diffstat (limited to 'crawler/CrawlController.php')
-rw-r--r-- | crawler/CrawlController.php | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php new file mode 100644 index 0000000..e5a270b --- /dev/null +++ b/crawler/CrawlController.php @@ -0,0 +1,74 @@ +<?php +/** + * User: Marvin Borner + * Date: 14/09/2018 + * Time: 23:48 + */ + +include 'mysql_conf.inc'; + +class CrawlController +{ + public function __construct() + { + set_time_limit(3600000); + error_reporting(E_ERROR | E_PARSE); + + $currentlyCrawled = $argv[1] ?? ''; + + while (true) { + crawl($currentlyCrawled); + } + } + + public function crawl($url) + { + global $currentlyCrawled; + + if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) { + print "\t\e[91mUrl already crawled " . $url . "\n"; + + Database::removeFromQueue($currentlyCrawled); + $currentlyCrawled = $this->getFromQueue('DESC'); + } else { + $requestResponse = getContent($url); + $currentlyCrawled = $requestResponse[3]; + if (preg_match('/2\d\d/', $requestResponse[1])) { // success + print 'Download Size: ' . $requestResponse[2]; + + $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]); + $urlInfo = Algorithms::getUrlInfo($htmlPath); + $allLinks = Algorithms::getLinks($htmlPath); + + Database::writeToQueue($allLinks); + $this->saveData($urlInfo, $currentlyCrawled); + + Database::removeFromQueue($currentlyCrawled); + $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start + } else { + print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n"; + + Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url + Database::removeFromQueue($currentlyCrawled); + $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end + } + } + } + + public function saveData($urlInfo, $url) + { + if ($url !== '') { + print "\e[96mFinished previous url - crawling: " . $url . "\n"; + + $title = $urlInfo['title'] ?? ''; + $description = $urlInfo['description'] ?? ''; + $language = $urlInfo['language'] ?? 'en'; + $hash = md5($url); + $data = [$title, $description, $language, $hash]; + + Database::saveUrlData($data); + } + } + + +}
\ No newline at end of file |