summaryrefslogtreecommitdiff
path: root/crawler.php
blob: 1e4f401b6a8370a8f3461d43561689f34c335be8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
<?php
/**
 * User: Marvin Borner
 * Date: 14/09/2018
 * Time: 23:48
 */

include "mysql_conf.inc";

// The URL to crawl is the first command-line argument; without it there is
// nothing to fetch, so stop with a usage hint instead of crawling a null URL.
if (!isset($argv[1]) || trim($argv[1]) === "") {
    print "Usage: php crawler.php <url>\n";
    exit(1);
}

// Shared with the functions below through `global $currentUrl`.
$currentUrl = $argv[1];
crawlLoop();

/**
 * Runs one crawl step for the URL held in the global $currentUrl:
 * download the page, parse it, queue the links it contains and store
 * the page's own metadata.
 */
function crawlLoop()
{
    global $currentUrl;

    // Download the page and wrap the parsed markup in an XPath handle.
    $xpath = createPathFromHtml(getContent($currentUrl));

    // Extract everything first, then persist: links go to the queue,
    // page metadata goes to the url_data table.
    $pageInfo = getUrlInfo($xpath);
    $foundLinks = getLinks($xpath);

    writeToQueue($foundLinks);
    saveData($pageInfo);
}


/**
 * Downloads a URL with cURL, following redirects.
 *
 * Prints the download size on success, or the cURL error message on failure,
 * to standard output.
 *
 * @param string $url address to fetch
 * @return string|false the response body, or false when the transfer failed
 */
function getContent($url)
{
    $curl = curl_init($url);
    if ($curl === false) {
        print "Download failed: could not initialise cURL for " . $url . "\n";
        return false;
    }

    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    // Return the body from curl_exec() instead of echoing it. The raw bytes are
    // always returned in this mode, so the obsolete CURLOPT_BINARYTRANSFER
    // option (a no-op whose constant is deprecated in recent PHP) is not set.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    $content = curl_exec($curl);

    if ($content === false) {
        // Report the real reason rather than a misleading "0KB" download.
        print "Download failed: " . curl_error($curl) . "\n";
    } else {
        print "Download Size: " . curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
    }
    curl_close($curl);

    return $content;
}

/**
 * Extracts page metadata from a parsed HTML document.
 *
 * The result maps:
 *  - "language"    => the non-empty lang attribute of <html> (key absent when
 *                     the page declares none, so callers can apply a default)
 *  - <meta name>   => its content attribute, for every <meta> that has a name
 *  - <link rel>    => its href attribute, keyed by the full rel value and, if
 *                     not already taken, by each lower-cased rel token
 *                     (so rel="shortcut icon" is also reachable as "icon")
 *  - "title"       => text of the first <title> element, "" when there is none
 *
 * @param DOMXPath $path XPath handle of the document to inspect
 * @return array metadata keyed as described above
 */
function getUrlInfo($path)
{
    $urlInfo = [];

    foreach ($path->query("//html") as $html) {
        $lang = trim($html->getAttribute("lang"));
        if ($lang !== "") $urlInfo["language"] = $lang;
    }

    foreach ($path->query("//meta") as $meta) {
        // <meta charset>, <meta property> etc. carry no name; skip them
        // instead of collecting them all under the "" key.
        $name = $meta->getAttribute("name");
        if ($name !== "") $urlInfo[$name] = $meta->getAttribute("content");
    }

    foreach ($path->query("//link") as $link) {
        $rel = $link->getAttribute("rel");
        if (trim($rel) === "") continue;

        $href = $link->getAttribute("href");
        $urlInfo[$rel] = $href;

        // rel is a space-separated, case-insensitive token list. Register the
        // individual tokens too, without overriding an entry that already exists.
        foreach (preg_split('/\s+/', strtolower(trim($rel))) as $token) {
            if (!isset($urlInfo[$token])) $urlInfo[$token] = $href;
        }
    }

    // Set last so that no <meta name="title"> can replace the real title.
    $title = $path->query("//title")->item(0);
    $urlInfo["title"] = $title === null ? "" : $title->textContent;

    return $urlInfo;
}

/**
 * Collects the crawlable targets of every <a> element in the document.
 *
 * Each href is resolved against the global $currentUrl (the page being
 * crawled). Links that cannot be fetched over HTTP (mailto:, javascript:,
 * fragment-only, empty) are dropped, fragments are removed and duplicates
 * are collapsed.
 *
 * @param DOMXPath $path XPath handle of the document to inspect
 * @return array list of unique absolute URLs in document order
 */
function getLinks($path)
{
    global $currentUrl;
    $allLinks = [];

    foreach ($path->query("//a") as $anchor) {
        $href = resolveHref($currentUrl, $anchor->getAttribute("href"));
        if ($href !== null) $allLinks[] = $href;
    }

    return array_values(array_unique($allLinks));
}

/**
 * Turns one href into an absolute URL, or null when it is not crawlable.
 *
 * @param string $base URL of the page the href was found on
 * @param string $href raw href attribute value
 * @return string|null absolute URL without fragment, or null to skip the link
 */
function resolveHref($base, $href)
{
    // A fragment only addresses a position inside a page, so "#top" and
    // "page#top" do not name additional documents.
    $href = trim($href);
    $hashPos = strpos($href, "#");
    if ($hashPos !== false) $href = substr($href, 0, $hashPos);
    if ($href === "") return null;

    // Scheme-less "www.example.com" links are treated as plain http hosts.
    if (stripos($href, "www.") === 0) $href = "http://" . $href;

    if (preg_match('~^https?://~i', $href)) {
        $url = $href;
    } else if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        // Any other scheme (mailto:, javascript:, tel:, ftp:, ...) is not crawled.
        return null;
    } else {
        // A seed URL given without scheme is fetched as http, so resolve against that.
        if (!preg_match('~^[a-z][a-z0-9+.\-]*://~i', $base)) $base = "http://" . $base;

        $parts = parse_url($base);
        if ($parts === false || empty($parts["host"])) return null;

        $scheme = $parts["scheme"];
        $origin = $scheme . "://" . $parts["host"] . (isset($parts["port"]) ? ":" . $parts["port"] : "");
        $basePath = (isset($parts["path"]) && $parts["path"] !== "" && $parts["path"][0] === "/") ? $parts["path"] : "/";

        if (substr($href, 0, 2) === "//") {
            // Protocol-relative: inherits only the scheme.
            $url = $scheme . ":" . $href;
        } else if ($href[0] === "?") {
            // Query-only: same document, different query string.
            $url = $origin . $basePath . $href;
        } else {
            $queryPos = strpos($href, "?");
            $query = $queryPos === false ? "" : substr($href, $queryPos);
            $relPath = $queryPos === false ? $href : substr($href, 0, $queryPos);

            // Path-relative links start from the directory of the current page;
            // root-relative links ("/...") replace the whole path.
            if ($relPath[0] !== "/") {
                $relPath = substr($basePath, 0, strrpos($basePath, "/") + 1) . $relPath;
            }
            $url = $origin . removeDotSegments($relPath) . $query;
        }
    }

    // A bare host gets its root slash so "http://a.com" and "http://a.com/"
    // are not queued as two different pages.
    if (preg_match('~^([a-z][a-z0-9+.\-]*://[^/?]+)(\?.*)?$~i', $url, $m)) {
        $url = $m[1] . "/" . (isset($m[2]) ? $m[2] : "");
    }

    return $url;
}

/**
 * Collapses "." and ".." segments of an absolute URL path.
 *
 * @param string $path path starting with "/"
 * @return string the same path without dot segments
 */
function removeDotSegments($path)
{
    $segments = explode("/", $path);
    $last = end($segments);
    $out = [];

    foreach ($segments as $segment) {
        if ($segment === ".") continue;
        if ($segment === "..") {
            // Never pop the leading empty segment: ".." cannot climb above the root.
            if (count($out) > 1) array_pop($out);
            continue;
        }
        $out[] = $segment;
    }

    // A path ending in a dot segment names a directory; keep its trailing slash.
    if ($last === "." || $last === "..") $out[] = "";

    return implode("/", $out);
}

/**
 * Parses HTML markup and returns an XPath handle for querying it.
 *
 * Malformed markup is tolerated: libxml parse errors are collected silently
 * and discarded. When there is nothing to parse (a failed download yields
 * false, or the body is empty) the handle wraps an empty document, so every
 * query simply returns no nodes.
 *
 * @param string|false $content HTML source, or false/"" when unavailable
 * @return DOMXPath XPath handle over the parsed (possibly empty) document
 */
function createPathFromHtml($content)
{
    $dom = new DOMDocument();

    // loadHTML() rejects an empty source (it throws on PHP 8), so only parse real markup.
    if (is_string($content) && trim($content) !== "") {
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($content);
        // Drop the collected parse errors so they do not pile up in memory,
        // then restore whatever error mode the caller had configured.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return new DOMXPath($dom);
}

/**
 * Adds URLs to the crawl queue, skipping those already present in url_data.
 *
 * A URL is identified by the MD5 hash of its text. Duplicates within the
 * queue itself are left to the database (INSERT IGNORE).
 *
 * @param array $urls URLs to enqueue
 * @return void
 * @throws PDOException when the connection or a statement fails
 */
function writeToQueue($urls)
{
    // Nothing to enqueue: do not open a database connection at all.
    if (empty($urls)) return;

    $conn = initDbConnection();

    // Prepare once and re-execute per URL instead of re-preparing in the loop.
    $checkStmt = $conn->prepare('SELECT hash FROM url_data where hash = :hash');
    $insertStmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');

    foreach ($urls as $url) {
        $hash = md5($url);

        $checkStmt->execute([':hash' => $hash]);
        // Fetch a row rather than trusting rowCount(), which is not guaranteed
        // to report the number of rows for SELECT statements.
        $alreadyStored = $checkStmt->fetchColumn() !== false;
        $checkStmt->closeCursor();

        if (!$alreadyStored) {
            $insertStmt->execute([':url' => $url, ':hash' => $hash]);
        }
    }
}

/**
 * Stores the metadata of the page at the global $currentUrl in url_data.
 *
 * Prints the URL first. Missing metadata falls back to an empty string,
 * except the language, which falls back to "en". Database errors are
 * printed, not rethrown.
 *
 * @param array $urlInfo page metadata; the keys "title", "description",
 *                       "icon" and "language" are read
 * @return void
 */
function saveData($urlInfo)
{
    global $currentUrl;

    print $currentUrl . "\n";

    // placeholder => [key looked up in $urlInfo, value used when it is not set]
    $fields = [
        ':title' => ["title", ""],
        ':description' => ["description", ""],
        ':icon' => ["icon", ""],
        ':lang' => ["language", "en"],
    ];

    $params = [':url' => $currentUrl];
    foreach ($fields as $placeholder => $spec) {
        list($key, $fallback) = $spec;
        $params[$placeholder] = isset($urlInfo[$key]) ? $urlInfo[$key] : $fallback;
    }
    $params[':hash'] = md5($currentUrl);

    try {
        $db = initDbConnection();
        $insert = $db->prepare('INSERT IGNORE INTO url_data (url, title, description, icon, lang, hash) VALUES (:url, :title, :description, :icon, :lang, :hash)');
        $insert->execute($params);
    } catch (PDOException $error) {
        print $error->getMessage();
    }
}

/**
 * Opens a new PDO connection to the MySQL database.
 *
 * Reads host, database name and credentials from the globals $servername,
 * $dbname, $username and $password (presumably set by mysql_conf.inc; not
 * visible in this file). The connection reports errors as exceptions.
 *
 * @return PDO the open connection
 * @throws PDOException when the connection cannot be established
 */
function initDbConnection()
{
    global $servername, $dbname, $username, $password;

    // Hand the error mode to the constructor rather than setting it afterwards.
    $options = [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION];

    return new PDO("mysql:host=$servername;dbname=$dbname", $username, $password, $options);
}