From 5d479ed430bd420c73ff1c2df01f7cf2665e5033 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Thu, 8 Nov 2018 21:21:54 +0100
Subject: Added indexing and improved crawling algorithm

---
 crawler.js  | 61 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 database.js |  9 +++++++++
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/crawler.js b/crawler.js
index 6b2d5b6..dee3ce1 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1,26 +1,59 @@
 const crawlService = require("crawler");
 const crypto = require("crypto");
 const database = require("./database");
+const url = require("url");
 
 const crawler = new crawlService({
     maxConnections: 10,
     callback: (error, res, done) => {
         if (error) {
             console.log(error);
-        } else {
-            const $ = res.$;
-            database.index('crawled', 'site', [
-                {
-                    "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
-                    "url": res.request.uri.href,
-                    "title": $("title").text(),
-                    "description": $("meta[name=description]").attr("content"),
-                    "keywords": $("meta[name=keywords]").attr("content").split(", ")
-                }
-            ]);
-        }
-        done();
+            done();
+            return;
+        }
+
+        const $ = res.$;
+        const href = res.request.uri.href;
+        const urlHash = crypto.createHash("sha256").update(href).digest("base64");
+
+        // Skip pages that are already indexed. esClient.exists() returns a
+        // Promise, so the result must be awaited -- a bare truthiness check
+        // on the Promise object would always pass.
+        database.exists("crawled", "site", urlHash)
+            .then((alreadyIndexed) => {
+                if (alreadyIndexed) return;
+
+                console.log("\nCrawling: " + href);
+                database.index('crawled', 'site', [
+                    {
+                        "id": urlHash,
+                        "url": href,
+                        "title": $("title").text() || href,
+                        "description": $("meta[name=description]").attr("content") || "",
+                        "keywords": $("meta[name=keywords]").attr("content")
+                            ? $("meta[name=keywords]").attr("content").split(", ")
+                            : []
+                    }
+                ]);
+
+                // Queue every outgoing link; relative hrefs are resolved
+                // against the current page URL.
+                $("a").each((i, tag) => {
+                    let parsed;
+                    try {
+                        parsed = new URL($(tag).attr("href"));
+                    } catch (e) { // invalid url -> probably a path
+                        parsed = new URL($(tag).attr("href"), href);
+                    }
+                    if (parsed.origin !== "null") {
+                        console.log("Queueing: " + parsed.origin + parsed.pathname);
+                        crawler.queue(parsed.origin + parsed.pathname);
+                    }
+                });
+            })
+            .catch(console.error)
+            .then(done); // always release the crawler slot
     }
 });
 
-crawler.queue('http://www.amazon.com');
+crawler.queue('http://stackoverflow.com');
diff --git a/database.js b/database.js
index f48f87e..32365e7 100644
--- a/database.js
+++ b/database.js
@@ -32,6 +32,15 @@ module.exports = {
         })
         .catch(console.err);
     },
+    // Resolve to a boolean: true when a document with this id is already
+    // indexed. Returns a Promise -- callers must await/then the result.
+    exists: (index, type, id) => {
+        return esClient.exists({
+            index: index,
+            type: type,
+            id: id
+        });
+    },
     search: (index, body) => {
         return esClient.search({index: index, body: body});
     }
-- 
cgit v1.2.3