blob: e5a270b40ef38d219c9a51f5e4fbd3a2d419d5cd (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
<?php
/**
* User: Marvin Borner
* Date: 14/09/2018
* Time: 23:48
*/
include 'mysql_conf.inc';
/**
 * Drives the crawl loop: checks whether a URL was already crawled, fetches it,
 * hands the result to Algorithms/Database, and selects the next queued URL.
 *
 * The URL currently being processed is shared through the global variable
 * $currentlyCrawled, which crawl() reads and overwrites on every call.
 */
class CrawlController
{
    /**
     * Configures the runtime and crawls in an endless loop.
     *
     * The first command line argument (if any) is used as the start URL;
     * otherwise the loop starts with an empty string. This constructor never
     * returns because the crawl loop has no exit condition.
     */
    public function __construct()
    {
        // crawl() publishes the next URL through this global, so the loop has
        // to use the very same variable to see the update on each iteration.
        global $currentlyCrawled;

        set_time_limit(3600000);
        error_reporting(E_ERROR | E_PARSE);

        // $argv only exists in the global scope; inside a method the CLI
        // arguments have to be read from $_SERVER['argv'].
        $currentlyCrawled = $_SERVER['argv'][1] ?? '';

        while (true) {
            $this->crawl($currentlyCrawled);
        }
    }

    /**
     * Processes a single URL and stores the next URL to crawl in the global
     * $currentlyCrawled.
     *
     * - Already crawled: the current URL is removed from the queue and the
     *   next one is taken with order 'DESC'.
     * - 2xx response: links are queued, the page data is saved, the URL is
     *   removed from the queue and the next one is taken with order 'DESC'.
     * - Any other response: the URL is flagged as erroneous, removed from the
     *   queue and the next one is taken with order 'ASC'.
     *
     * @param string $url URL to crawl
     */
    public function crawl($url)
    {
        global $currentlyCrawled;

        if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
            print "\t\e[91mUrl already crawled " . $url . "\n";
            Database::removeFromQueue($currentlyCrawled);
            // The queue lives in Database; this class has no getFromQueue() of its own.
            $currentlyCrawled = Database::getFromQueue('DESC');
            return;
        }

        // NOTE(review): getContent() is not defined in this file - presumably a
        // global helper provided by an included file; confirm it is loaded.
        // Indices as used below: [0] is passed to createPathFromHtml(), [1] is
        // matched as the status code, [2] is printed as the download size and
        // [3] becomes the current URL (presumably the final URL after
        // redirects - TODO confirm).
        $requestResponse = getContent($url);
        $currentlyCrawled = $requestResponse[3];

        if (preg_match('/2\d\d/', $requestResponse[1])) { // success
            print 'Download Size: ' . $requestResponse[2];

            $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
            $urlInfo = Algorithms::getUrlInfo($htmlPath);
            $allLinks = Algorithms::getLinks($htmlPath);

            Database::writeToQueue($allLinks);
            $this->saveData($urlInfo, $currentlyCrawled);

            Database::removeFromQueue($currentlyCrawled);
            $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
        } else {
            print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";

            Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
            Database::removeFromQueue($currentlyCrawled);
            $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
        }
    }

    /**
     * Persists the metadata of a crawled page via Database::saveUrlData().
     *
     * Nothing is printed or saved when $url is the empty string. Missing
     * 'title' and 'description' entries default to '', a missing 'language'
     * defaults to 'en'. The stored record is [title, description, language,
     * md5 of the URL].
     *
     * @param array  $urlInfo page metadata; the keys 'title', 'description' and 'language' are read
     * @param string $url     URL the metadata belongs to
     */
    public function saveData($urlInfo, $url)
    {
        if ($url !== '') {
            print "\e[96mFinished previous url - crawling: " . $url . "\n";

            $title = $urlInfo['title'] ?? '';
            $description = $urlInfo['description'] ?? '';
            $language = $urlInfo['language'] ?? 'en';
            $hash = md5($url);

            $data = [$title, $description, $language, $hash];
            Database::saveUrlData($data);
        }
    }
}
|