From 9e9442a222666c5aadb6db32140dadddcb807933 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Sun, 16 Sep 2018 22:07:50 +0200
Subject: Code cleanup :zap: :construction:

---
 crawler/CrawlController.php | 74 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 crawler/CrawlController.php

(limited to 'crawler/CrawlController.php')

diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
new file mode 100644
index 0000000..e5a270b
--- /dev/null
+++ b/crawler/CrawlController.php
@@ -0,0 +1,74 @@
+getFromQueue('DESC'); // NOTE(review): hunk starts mid-statement; the PHP open tag, class header and the start of this method are missing from this copy of the patch
+        } else {
+            // $requestResponse is read positionally below: [0] is handed to
+            // Algorithms::createPathFromHtml(), [1] is matched against /2\d\d/,
+            // [2] is printed as the download size and [3] becomes the URL being
+            // processed. Presumably getContent() returns them in that order -
+            // TODO confirm against its definition (not visible here).
+            $requestResponse = getContent($url);
+            $currentlyCrawled = $requestResponse[3];
+            // NOTE(review): the pattern is unanchored, so it matches any value
+            // that contains a "2" followed by two digits.
+            if (preg_match('/2\d\d/', $requestResponse[1])) { // success
+                print 'Download Size: ' . $requestResponse[2];
+
+                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+                $urlInfo = Algorithms::getUrlInfo($htmlPath);
+                $allLinks = Algorithms::getLinks($htmlPath);
+
+                // Queue the extracted links first, then store this page's data.
+                Database::writeToQueue($allLinks);
+                $this->saveData($urlInfo, $currentlyCrawled);
+
+                Database::removeFromQueue($currentlyCrawled);
+                $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+            } else {
+                print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";
+
+                Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
+                Database::removeFromQueue($currentlyCrawled);
+                $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+            }
+        }
+    }
+
+    /**
+     * Prints a status line for the given URL and hands its metadata to
+     * Database::saveUrlData().
+     *
+     * Does nothing when $url is exactly the empty string.
+     *
+     * @param mixed $urlInfo Read with the keys 'title', 'description' and
+     *                       'language'; a missing or null entry falls back to
+     *                       '', '' and 'en' respectively. Presumably an array
+     *                       produced by Algorithms::getUrlInfo() - confirm.
+     * @param mixed $url     Compared strictly against '' and passed to md5(),
+     *                       so presumably a string - confirm against callers.
+     */
+    public function saveData($urlInfo, $url)
+    {
+        if ($url !== '') {
+            print "\e[96mFinished previous url - crawling: " . $url . "\n";
+
+            $title = $urlInfo['title'] ?? '';
+            $description = $urlInfo['description'] ?? '';
+            $language = $urlInfo['language'] ?? 'en';
+            $hash = md5($url);
+            // Positional record: [title, description, language, md5 of the URL].
+            $data = [$title, $description, $language, $hash];
+
+            Database::saveUrlData($data);
+        }
+    }
+
+
+}
\ No newline at end of file
--
cgit v1.2.3