summary refs log tree commit diff
path: root/crawler/CrawlController.php
diff options
context:
space:
mode:
Diffstat (limited to 'crawler/CrawlController.php')
-rw-r--r-- crawler/CrawlController.php 66
1 files changed, 26 insertions, 40 deletions
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
index e5a270b..97edf25 100644
--- a/crawler/CrawlController.php
+++ b/crawler/CrawlController.php
@@ -5,70 +5,56 @@
* Time: 23:48
*/
-include 'mysql_conf.inc';
+require_once 'mysql_conf.inc';
+require_once 'WebRequest.php';
+require_once 'Database.php';
+require_once 'Algorithms.php';
class CrawlController
{
- public function __construct()
+ private static $currentlyCrawled;
+
+ public static function start($url = '')
{
set_time_limit(3600000);
error_reporting(E_ERROR | E_PARSE);
- $currentlyCrawled = $argv[1] ?? '';
-
while (true) {
- crawl($currentlyCrawled);
+ self::$currentlyCrawled = $url;
+ self::crawl(self::$currentlyCrawled);
}
}
- public function crawl($url)
+ private static function crawl($url)
{
- global $currentlyCrawled;
-
if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
- print "\t\e[91mUrl already crawled " . $url . "\n";
-
- Database::removeFromQueue($currentlyCrawled);
- $currentlyCrawled = $this->getFromQueue('DESC');
+ Database::removeFromQueue(self::$currentlyCrawled);
+ self::$currentlyCrawled = Database::getFromQueue('DESC');
} else {
- $requestResponse = getContent($url);
- $currentlyCrawled = $requestResponse[3];
+ $requestResponse = WebRequest::getContent($url);
+ self::$currentlyCrawled = $requestResponse[3];
if (preg_match('/2\d\d/', $requestResponse[1])) { // success
print 'Download Size: ' . $requestResponse[2];
$htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+
$urlInfo = Algorithms::getUrlInfo($htmlPath);
- $allLinks = Algorithms::getLinks($htmlPath);
+ Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
- Database::writeToQueue($allLinks);
- $this->saveData($urlInfo, $currentlyCrawled);
+ $allLinks = Algorithms::getLinks($htmlPath);
+ Database::insertIntoQueue($allLinks);
- Database::removeFromQueue($currentlyCrawled);
- $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+ Database::removeFromQueue(self::$currentlyCrawled);
+ self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+ print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
} else {
- print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";
+ print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
- Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
- Database::removeFromQueue($currentlyCrawled);
- $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+ Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
+ Database::removeFromQueue(self::$currentlyCrawled);
+ self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+ print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
}
}
}
-
- public function saveData($urlInfo, $url)
- {
- if ($url !== '') {
- print "\e[96mFinished previous url - crawling: " . $url . "\n";
-
- $title = $urlInfo['title'] ?? '';
- $description = $urlInfo['description'] ?? '';
- $language = $urlInfo['language'] ?? 'en';
- $hash = md5($url);
- $data = [$title, $description, $language, $hash];
-
- Database::saveUrlData($data);
- }
- }
-
-
}
\ No newline at end of file