diff options
Diffstat (limited to 'crawler/CrawlController.php')
-rw-r--r-- | crawler/CrawlController.php | 66 |
1 files changed, 26 insertions, 40 deletions
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php index e5a270b..97edf25 100644 --- a/crawler/CrawlController.php +++ b/crawler/CrawlController.php @@ -5,70 +5,56 @@ * Time: 23:48 */ -include 'mysql_conf.inc'; +require_once 'mysql_conf.inc'; +require_once 'WebRequest.php'; +require_once 'Database.php'; +require_once 'Algorithms.php'; class CrawlController { - public function __construct() + private static $currentlyCrawled; + + public static function start($url = '') { set_time_limit(3600000); error_reporting(E_ERROR | E_PARSE); - $currentlyCrawled = $argv[1] ?? ''; - while (true) { - crawl($currentlyCrawled); + self::$currentlyCrawled = $url; + self::crawl(self::$currentlyCrawled); } } - public function crawl($url) + private static function crawl($url) { - global $currentlyCrawled; - if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) { - print "\t\e[91mUrl already crawled " . $url . "\n"; - - Database::removeFromQueue($currentlyCrawled); - $currentlyCrawled = $this->getFromQueue('DESC'); + Database::removeFromQueue(self::$currentlyCrawled); + self::$currentlyCrawled = Database::getFromQueue('DESC'); } else { - $requestResponse = getContent($url); - $currentlyCrawled = $requestResponse[3]; + $requestResponse = WebRequest::getContent($url); + self::$currentlyCrawled = $requestResponse[3]; if (preg_match('/2\d\d/', $requestResponse[1])) { // success print 'Download Size: ' . $requestResponse[2]; $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]); + $urlInfo = Algorithms::getUrlInfo($htmlPath); - $allLinks = Algorithms::getLinks($htmlPath); + Database::saveUrlData(self::$currentlyCrawled, $urlInfo); - Database::writeToQueue($allLinks); - $this->saveData($urlInfo, $currentlyCrawled); + $allLinks = Algorithms::getLinks($htmlPath); + Database::insertIntoQueue($allLinks); - Database::removeFromQueue($currentlyCrawled); - $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start + Database::removeFromQueue(self::$currentlyCrawled); + self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start + print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n"; } else { - print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n"; + print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n"; - Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url - Database::removeFromQueue($currentlyCrawled); - $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end + Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url + Database::removeFromQueue(self::$currentlyCrawled); + self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end + print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n"; } } } - - public function saveData($urlInfo, $url) - { - if ($url !== '') { - print "\e[96mFinished previous url - crawling: " . $url . "\n"; - - $title = $urlInfo['title'] ?? ''; - $description = $urlInfo['description'] ?? ''; - $language = $urlInfo['language'] ?? 'en'; - $hash = md5($url); - $data = [$title, $description, $language, $hash]; - - Database::saveUrlData($data); - } - } - - }
\ No newline at end of file |