blob: 53d5aac048684f34d664e24aafda8e9ea10adb75 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
<?php
// Emit plain-text output (crawler progress is printed to the response/console).
header('Content-type: text/plain; charset=utf-8');
/**
* User: Marvin Borner
* Date: 14/09/2018
* Time: 23:48
*/
// Project-local dependencies: DB credentials, HTTP fetching, persistence, parsing.
require_once 'mysql_conf.inc';
require_once 'WebRequest.php';
require_once 'Database.php';
require_once 'Algorithms.php';
class CrawlController
{
    /** @var string URL currently being processed; advanced as the queue is consumed. */
    public static $currentlyCrawled;

    /**
     * Entry point: crawl indefinitely, seeded with $url.
     *
     * Runs forever by design (daemon-style crawler); each iteration processes
     * one URL and advances self::$currentlyCrawled from the database queue.
     *
     * @param string $url seed URL; '' resumes purely from the queue
     */
    public static function start($url = '')
    {
        set_time_limit(3600000); // long-running process: effectively disable the execution limit
        self::$currentlyCrawled = $url;
        while (true) {
            self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
        }
    }

    /**
     * Crawl a single URL: fetch it, persist extracted data and discovered
     * links, then advance the queue. On any failure the URL is marked as
     * errored and removed so it is never retried.
     *
     * @param string $url cleaned URL to fetch
     */
    private static function crawl($url)
    {
        // Skip URLs that were already crawled and advance to the newest queue entry.
        if ($url !== '' && Database::alreadyCrawled($url)) {
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('DESC');
            return;
        }

        $requestResponse = WebRequest::getContent($url);
        if (!$requestResponse) {
            // BUG FIX: previously a failed fetch left the queue untouched, so
            // start()'s infinite loop retried the same URL forever. Treat it
            // like an error response: mark, dequeue, and take the next URL.
            print "\t\e[91mRequest failed " . self::$currentlyCrawled . "\n";
            Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
            return;
        }

        // $requestResponse layout: [0] body, [1] status code, [2] download size,
        // [3] effective URL — presumably after redirects; TODO confirm in WebRequest.
        self::$currentlyCrawled = $requestResponse[3];

        // Anchored 2xx check — the original unanchored /2\d\d/ also matched any
        // status string merely containing a 2xx substring (e.g. "1200").
        if (preg_match('/^2\d\d$/', (string) $requestResponse[1])) { // success
            print 'Download Size: ' . $requestResponse[2];
            $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
            $urlInfo = Algorithms::getUrlInfo($htmlPath);
            Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
            $allLinks = Algorithms::getLinks($htmlPath);
            Database::insertIntoQueue($allLinks);
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
            print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
        } else {
            print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
            Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
            Database::removeFromQueue(self::$currentlyCrawled);
            self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
            print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
        }
    }
}
|