diff options
author | Marvin Borner | 2018-11-08 21:21:54 +0100 |
---|---|---|
committer | Marvin Borner | 2018-11-08 21:21:54 +0100 |
commit | 5d479ed430bd420c73ff1c2df01f7cf2665e5033 (patch) | |
tree | e13c69059d88e5a1bd6b2700432db4137213efc0 | |
parent | d243f31f7d245efce4bd857b32dc758205c28758 (diff) |
Added indexing and improved crawling algorithm
-rw-r--r-- | crawler.js | 42 | ||||
-rw-r--r-- | database.js | 7 |
2 files changed, 38 insertions, 11 deletions
```diff
@@ -1,26 +1,46 @@
 const crawlService = require("crawler");
 const crypto = require("crypto");
 const database = require("./database");
+const url = require("url");
 
 const crawler = new crawlService({
     maxConnections: 10,
     callback: (error, res, done) => {
+        console.log(res);
+
         if (error) {
             console.log(error);
         } else {
             const $ = res.$;
-            database.index('crawled', 'site', [
-                {
-                    "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
-                    "url": res.request.uri.href,
-                    "title": $("title").text(),
-                    "description": $("meta[name=description]").attr("content"),
-                    "keywords": $("meta[name=keywords]").attr("content").split(", ")
-                }
-            ]);
+            const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
+            if (database.exists("crawled", "site", urlHash)) {
+                console.log("\nCrawling: " + res.request.uri.href);
+                database.index('crawled', 'site', [
+                    {
+                        "id": urlHash,
+                        "url": res.request.uri.href,
+                        "title": $("title").text() || res.request.uri.href,
+                        "description": $("meta[name=description]").attr("content") || "",
+                        "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : ""
+                    }
+                ]);
+
+                $("a").map((i, tag) => {
+                    let parsed;
+                    try {
+                        parsed = new URL($(tag).attr("href"));
+                    } catch (e) { // invalid url -> probably a path
+                        parsed = new URL($(tag).attr("href"), res.request.uri.href);
+                    }
+                    if (parsed.origin !== "null") {
+                        console.log("Queueing: " + parsed.origin + parsed.pathname);
+                        crawler.queue(parsed.origin + parsed.pathname);
+                    }
+                });
+            }
+            done();
         }
-        done();
     }
 });
 
-crawler.queue('http://www.amazon.com');
+crawler.queue('http://stackoverflow.com');
diff --git a/database.js b/database.js
index f48f87e..32365e7 100644
--- a/database.js
+++ b/database.js
@@ -32,6 +32,13 @@ module.exports = {
             })
             .catch(console.err);
     },
+    exists: (index, type, id, callback) => {
+        return esClient.exists({
+            index: index,
+            type: type,
+            id: id
+        })
+    },
     search: (index, body) => {
         return esClient.search({index: index, body: body});
     }
```