path: root/crawler.js
blob: ceacc5201c899124af377ac0b32d0edf64aaa9ce
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");
const { URL } = require("url"); // WHATWG URL parser, used to resolve links found on each page

const crawler = new crawlService({
    userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    rateLimit: 100, // TODO: adjust the rate limit dynamically based on errors (see the backoff sketch at the end of this file)
    maxConnections: 1, // set to 10 (and remove rateLimit above) for faster crawling, at a higher risk of 429 (rate limiting) responses
    callback: (error, res, done) => {
        if (error || res.statusCode !== 200) {
            console.log("Error: " + error);
            console.log("Code: " + (res ? res.statusCode : "no response"));
        } else {
            const $ = res.$;
            const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64"); // hash the URL for a stable document id
            database.exists("crawled", "site", urlHash).then(exists => {
                // Index the page if it has not been seen before (or if this is the seed page, i.e. the queue is empty)
                if (crawler.queueSize === 0 || !exists) {
                    console.log("Queue size: " + crawler.queueSize);
                    console.log("\nCrawling: " + res.request.uri.href);
                    database.index("crawled", "site", [
                        {
                            "id": urlHash,
                            "url": res.request.uri.href,
                            "title": $("title").text() || res.request.uri.href,
                            "description": $("meta[name=description]").attr("content") || "",
                            "lang": $("html").attr("lang")
                                || $("meta[http-equiv=content-language]").attr("content")
                                || $("meta[http-equiv=language]").attr("content")
                                || $("meta[name=language]").attr("content")
                                || "en",
                            "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : []
                        }
                    ]);

                    $("a").map((i, tag) => {
                        let parsed;
                        try {
                            parsed = new URL($(tag).attr("href"));
                        } catch (e) { // invalid url -> probably a path
                            try {
                                parsed = new URL($(tag).attr("href"), res.request.uri.href);
                            } catch (e) {
                                parsed = null;
                            }
                        }
                        if (parsed !== null && parsed.origin !== "null") {
                            //console.log("Queueing: " + parsed.origin + parsed.pathname);
                            crawler.queue(parsed.origin + parsed.pathname);
                        }
                    });
                }
            });
        }
        done(); // signal task completion (note: the database lookup above may still be in flight at this point)
    }
});

crawler.queue("http://wikipedia.com"); // seed URL to start the crawl
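
// --- Sketch for the rate-limit TODO above (not part of the original file) ---
// A minimal, hedged take on "dynamic rate limit depending on errors": rather
// than relying on any limiter API of the crawler library, it keeps its own
// backoff delay, doubles it whenever a request comes back as HTTP 429, and
// re-queues the throttled URL after that delay. The names `backoffMs` and
// `retryAfterBackoff` are illustrative, not an existing API.
let backoffMs = 1000;

function retryAfterBackoff(uri, statusCode) {
    if (statusCode === 429) {
        backoffMs = Math.min(backoffMs * 2, 60000); // exponential backoff, capped at 60 seconds
        setTimeout(() => crawler.queue(uri), backoffMs);
    } else {
        backoffMs = Math.max(1000, backoffMs / 2); // ease off once requests succeed again
    }
}
// Possible usage from the callback's error branch:
//   if (res && res.statusCode === 429) retryAfterBackoff(res.request.uri.href, res.statusCode);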