diff options
author | Marvin Borner | 2018-11-07 18:02:36 +0100 |
---|---|---|
committer | Marvin Borner | 2018-11-07 18:02:36 +0100 |
commit | 824a2d9f587ca017fc71b84d835e72f54f9c87c4 (patch) | |
tree | 765267ea4686f752aad1f69930cfee5680cc494a /crawler/CrawlController.php | |
parent | fe75612e86b493a4e66c4e104e22658679cc014f (diff) |
Began rewrite
Diffstat (limited to 'crawler/CrawlController.php')
-rw-r--r-- | crawler/CrawlController.php | 63 |
1 files changed, 0 insertions, 63 deletions
<?php

declare(strict_types=1);

/**
 * Crawl-loop controller: endlessly pulls URLs from the crawl queue,
 * downloads each page, stores its metadata and outgoing links, and feeds
 * the discovered links back into the queue.
 *
 * User: Marvin Borner
 * Date: 14/09/2018
 * Time: 23:48
 */

header('Content-type: text/plain; charset=utf-8');

require_once '../database/mysqlConf.inc';
require_once '../database/Database.php';
require_once 'WebRequest.php';
require_once 'Algorithms.php';

class CrawlController
{
    // URL currently being processed; shared state advanced by crawl().
    // NOTE(review): presumably a string (queue entries) — left untyped because
    // Database::getFromQueue()'s return type is not visible here.
    public static $currentlyCrawled;

    /**
     * Starts the crawl loop, optionally seeded with a URL.
     *
     * Never returns under normal operation (infinite loop).
     *
     * @param string $url optional seed URL; when empty, crawl() falls back to the queue
     */
    public static function start(string $url = ''): void
    {
        // The loop is meant to run "forever"; the original limit of
        // 3 600 000 s (~41 days) is kept for behavioral compatibility.
        set_time_limit(3600000);

        self::$currentlyCrawled = $url;

        while (true) {
            self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
        }
    }

    /**
     * Processes a single URL: skips already-crawled entries, otherwise
     * downloads the page, persists its metadata and links, and advances
     * self::$currentlyCrawled to the next queue entry.
     *
     * @param string $url cleaned URL to crawl ('' means "nothing seeded yet")
     */
    private static function crawl(string $url): void
    {
        if ($url !== '' && Database::alreadyCrawled($url)) {
            // Duplicate: drop it and take the next entry from the queue front.
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('DESC');
            return;
        }

        $requestResponse = WebRequest::getContent($url);
        if (!$requestResponse) {
            // Request failed outright; same URL is retried on the next loop
            // iteration (matches the original behavior).
            return;
        }

        // Index 3 is the effective URL (after redirects); 1 the HTTP status;
        // 2 the download size; 0 the body — TODO confirm against WebRequest.
        self::$currentlyCrawled = $requestResponse[3];

        // FIX: the original pattern /2\d\d/ was unanchored, so any status
        // string merely *containing* a 2xx substring (e.g. "1200") counted
        // as success. Anchor it so only an exact 2xx status passes.
        if (preg_match('/^2\d\d$/', (string) $requestResponse[1])) { // success
            print 'Download Size: ' . $requestResponse[2];

            $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);

            $urlInfo = Algorithms::getUrlInfo($htmlPath);
            Database::saveUrlData(self::$currentlyCrawled, $urlInfo);

            $allLinks = Algorithms::getLinks($htmlPath);
            Database::insertIntoQueue($allLinks);

            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
            print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
        } else {
            print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";

            Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
            print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
        }
    }
}
\ No newline at end of file |