summaryrefslogtreecommitdiff
path: root/crawler/CrawlController.php
diff options
context:
space:
mode:
Diffstat (limited to 'crawler/CrawlController.php')
-rw-r--r--  crawler/CrawlController.php | 57
1 file changed, 30 insertions(+), 27 deletions(-)
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
index 97edf25..53d5aac 100644
--- a/crawler/CrawlController.php
+++ b/crawler/CrawlController.php
@@ -1,4 +1,5 @@
<?php
+header('Content-type: text/plain; charset=utf-8');
/**
* User: Marvin Borner
* Date: 14/09/2018
@@ -12,48 +13,50 @@ require_once 'Algorithms.php';
class CrawlController
{
- private static $currentlyCrawled;
+ public static $currentlyCrawled;
public static function start($url = '')
{
set_time_limit(3600000);
- error_reporting(E_ERROR | E_PARSE);
+
+ self::$currentlyCrawled = $url;
while (true) {
- self::$currentlyCrawled = $url;
- self::crawl(self::$currentlyCrawled);
+ self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
}
}
private static function crawl($url)
{
- if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
+ if ($url !== '' && Database::alreadyCrawled($url)) {
Database::removeFromQueue(self::$currentlyCrawled);
self::$currentlyCrawled = Database::getFromQueue('DESC');
} else {
$requestResponse = WebRequest::getContent($url);
- self::$currentlyCrawled = $requestResponse[3];
- if (preg_match('/2\d\d/', $requestResponse[1])) { // success
- print 'Download Size: ' . $requestResponse[2];
-
- $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
-
- $urlInfo = Algorithms::getUrlInfo($htmlPath);
- Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
-
- $allLinks = Algorithms::getLinks($htmlPath);
- Database::insertIntoQueue($allLinks);
-
- Database::removeFromQueue(self::$currentlyCrawled);
- self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
- print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
- } else {
- print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
-
- Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
- Database::removeFromQueue(self::$currentlyCrawled);
- self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
- print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+ if ($requestResponse) {
+ self::$currentlyCrawled = $requestResponse[3];
+ if (preg_match('/2\d\d/', $requestResponse[1])) { // success
+ print 'Download Size: ' . $requestResponse[2];
+
+ $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+
+ $urlInfo = Algorithms::getUrlInfo($htmlPath);
+ Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
+
+ $allLinks = Algorithms::getLinks($htmlPath);
+ Database::insertIntoQueue($allLinks);
+
+ Database::removeFromQueue(self::$currentlyCrawled);
+ self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+ print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
+ } else {
+ print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
+
+ Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
+ Database::removeFromQueue(self::$currentlyCrawled);
+ self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+ print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+ }
}
}
}