summaryrefslogtreecommitdiff
path: root/crawler/CrawlController.php
blob: e5a270b40ef38d219c9a51f5e4fbd3a2d419d5cd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
<?php
/**
 * User: Marvin Borner
 * Date: 14/09/2018
 * Time: 23:48
 */

include 'mysql_conf.inc';

class CrawlController
{
    /**
     * URL currently being processed; drives the crawl loop.
     *
     * Replaces the broken `global $currentlyCrawled` of the original: the
     * constructor only ever assigned a *local* variable of that name, so the
     * global read inside crawl() was never initialised.
     */
    private $currentlyCrawled;

    public function __construct()
    {
        set_time_limit(3600000); // allow extremely long crawl runs
        error_reporting(E_ERROR | E_PARSE);

        // BUG FIX: $argv is not in scope inside a method — the original
        // $argv[1] was always unset. Read it from $GLOBALS instead
        // (available under the CLI SAPI / register_argc_argv).
        $this->currentlyCrawled = $GLOBALS['argv'][1] ?? '';

        while (true) {
            // BUG FIX: the original called the undefined global function
            // crawl(); the method must be invoked on $this.
            $this->crawl($this->currentlyCrawled);
        }
    }

    /**
     * Crawls a single URL: skips it when already crawled, otherwise downloads
     * it, extracts metadata and links, stores them, and advances the queue.
     *
     * @param string $url the URL to crawl
     */
    public function crawl($url)
    {
        if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
            print "\t\e[91mUrl already crawled " . $url . "\n";

            Database::removeFromQueue($this->currentlyCrawled);
            // BUG FIX: getFromQueue() is not defined on this class; it is a
            // Database static method, as every other branch already uses.
            $this->currentlyCrawled = Database::getFromQueue('DESC');
        } else {
            $requestResponse = getContent($url);
            // NOTE(review): index 3 presumably carries the effective URL
            // after redirects — confirm against getContent().
            $this->currentlyCrawled = $requestResponse[3];
            // BUG FIX: anchored the pattern — the unanchored /2\d\d/ also
            // accepted codes that merely contain a 2xx substring (e.g. 1200).
            if (preg_match('/^2\d\d$/', (string) $requestResponse[1])) { // success
                print 'Download Size: ' . $requestResponse[2];

                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
                $urlInfo = Algorithms::getUrlInfo($htmlPath);
                $allLinks = Algorithms::getLinks($htmlPath);

                Database::writeToQueue($allLinks);
                $this->saveData($urlInfo, $this->currentlyCrawled);

                Database::removeFromQueue($this->currentlyCrawled);
                $this->currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
            } else {
                print "\t\e[91mError " . $requestResponse[1] . ' ' . $this->currentlyCrawled . "\n";

                Database::urlHasError($this->currentlyCrawled); // prevents re-crawling of error url
                Database::removeFromQueue($this->currentlyCrawled);
                $this->currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
            }
        }
    }

    /**
     * Persists the extracted metadata of a crawled URL.
     *
     * @param array  $urlInfo metadata map (title/description/language) as
     *                        produced by Algorithms::getUrlInfo()
     * @param string $url     the crawled URL; nothing is saved when empty
     */
    public function saveData($urlInfo, $url)
    {
        if ($url !== '') {
            print "\e[96mFinished previous url - crawling: " . $url . "\n";

            $title = $urlInfo['title'] ?? '';
            $description = $urlInfo['description'] ?? '';
            $language = $urlInfo['language'] ?? 'en';
            $hash = md5($url); // URL fingerprint used as the row identifier — TODO confirm schema
            $data = [$title, $description, $language, $hash];

            Database::saveUrlData($data);
        }
    }
}