summaryrefslogtreecommitdiff
path: root/crawler.js
blob: dee3ce1b299c3e65114c05a2055cb142456cd5c8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");
const url = require("url");

/**
 * Resolve an anchor's href against the page it was found on.
 *
 * @param {string|undefined} href - Raw value of the anchor's href attribute.
 * @param {string} baseHref - Absolute URL of the page containing the anchor.
 * @returns {string|null} origin + pathname of the target (query string and
 *     fragment are dropped), or null when the link should not be queued:
 *     missing/empty href, unparsable URL, or a scheme with an opaque origin
 *     (e.g. mailto:, javascript:).
 */
function resolveLink(href, baseHref) {
    // Anchors without an href would otherwise resolve to "<page>/undefined".
    if (!href) {
        return null;
    }

    let parsed;
    try {
        // Absolute hrefs ignore the base; relative ones are resolved against it.
        parsed = new URL(href, baseHref);
    } catch (e) {
        // Not a valid URL even relative to the page -> nothing to crawl.
        return null;
    }

    // URL#origin serialises opaque origins as the string "null".
    if (parsed.origin === "null") {
        return null;
    }
    return parsed.origin + parsed.pathname;
}

/**
 * Crawler instance. For every fetched page the callback indexes the page's
 * metadata through `database.index` and queues each link found on the page.
 */
const crawler = new crawlService({
    maxConnections: 10,
    /**
     * @param {Error|null} error - Set when the request failed.
     * @param {object} res - Response object; `res.$` is the page selector and
     *     `res.request.uri.href` the fetched URL.
     * @param {Function} done - Must be called exactly once to release the
     *     connection slot, on success and on failure alike.
     */
    callback: (error, res, done) => {
        // Debug dump of the whole response object.
        console.log(res);

        try {
            if (error) {
                console.log(error);
                return;
            }

            const $ = res.$;
            // Assumes `$` can be absent (e.g. non-HTML responses) -- TODO confirm
            // against the crawler library; without it there is nothing to parse.
            if (typeof $ !== "function") {
                return;
            }

            const pageHref = res.request.uri.href;
            const urlHash = crypto.createHash("sha256").update(pageHref).digest("base64");

            // NOTE(review): this reads as "index only when the page is already
            // recorded", which looks inverted. `database.exists` is defined in
            // ./database (not visible here) and may be asynchronous; confirm its
            // contract before changing the condition.
            if (database.exists("crawled", "site", urlHash)) {
                console.log("\nCrawling: " + pageHref);

                const keywords = $("meta[name=keywords]").attr("content");
                database.index('crawled', 'site', [
                    {
                        "id": urlHash,
                        "url": pageHref,
                        "title": $("title").text() || pageHref,
                        "description": $("meta[name=description]").attr("content") || "",
                        "keywords": keywords ? keywords.split(", ") : ""
                    }
                ]);

                // Iterating for side effects only, so `each` rather than `map`.
                $("a").each((i, tag) => {
                    const target = resolveLink($(tag).attr("href"), pageHref);
                    if (target !== null) {
                        console.log("Queueing: " + target);
                        crawler.queue(target);
                    }
                });
            }
        } finally {
            // Always release the slot; the original skipped this on errors,
            // which would eventually exhaust maxConnections and stall the crawl.
            done();
        }
    }
});

// Queue the initial URL; further URLs are queued from the crawler callback.
const seedUrl = "http://stackoverflow.com";
crawler.queue(seedUrl);