diff options
author | Marvin Borner | 2018-11-09 20:30:13 +0100 |
---|---|---|
committer | Marvin Borner | 2018-11-09 20:30:13 +0100 |
commit | 2aba2904523fec49f9cb922ef3f2933aefa3b673 (patch) | |
tree | 8d924babcecfd436336d478fb2e93cdc27462ff8 | |
parent | 4d853c04be28f434c570390130de7a6e8836d5f6 (diff) |
Fixed endless database existing check
-rw-r--r-- | crawler.js | 58 | ||||
-rw-r--r-- | database.js | 4 |
2 files changed, 35 insertions, 27 deletions
@@ -5,7 +5,7 @@ const url = require("url"); const crawler = new crawlService({ userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - rateLimit: 100, + rateLimit: 100, // TODO: Dynamic rate limit setting depending on errors maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429) callback: (error, res, done) => { if (error || res.statusCode !== 200) { @@ -14,31 +14,39 @@ const crawler = new crawlService({ } else { const $ = res.$; const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64"); - if (database.exists("crawled", "site", urlHash)) { - console.log("\nCrawling: " + res.request.uri.href); - database.index('crawled', 'site', [ - { - "id": urlHash, - "url": res.request.uri.href, - "title": $("title").text() || res.request.uri.href, - "description": $("meta[name=description]").attr("content") || "", - "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : "" - } - ]); + database.exists("crawled", "site", urlHash).then(exists => { + if (crawler.queueSize === 0 || !exists) { + console.log(crawler.queue()); + console.log("\nCrawling: " + res.request.uri.href); + database.index('crawled', 'site', [ + { + "id": urlHash, + "url": res.request.uri.href, + "title": $("title").text() || res.request.uri.href, + "description": $("meta[name=description]").attr("content") || "", + "lang": $("html").attr("lang") || $("meta[http-equiv=content-language]").attr("content") || $("meta[http-equiv=language]").attr("content") || $("meta[name=language]").attr("content") || "en", + "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : "" + } + ]); - $("a").map((i, tag) => { - let parsed; - try { - parsed = new URL($(tag).attr("href")); - } catch (e) { // invalid url -> probably a path - parsed = new URL($(tag).attr("href"), res.request.uri.href); - } - if (parsed.origin !== "null") { - console.log("Queueing: " + parsed.origin + parsed.pathname); - crawler.queue(parsed.origin + parsed.pathname); - } - }); - } + $("a").map((i, tag) => { + let parsed; + try { + parsed = new URL($(tag).attr("href")); + } catch (e) { // invalid url -> probably a path + try { + parsed = new URL($(tag).attr("href"), res.request.uri.href); + } catch (e) { + parsed = null; + } + } + if (parsed !== null && parsed.origin !== "null") { + //console.log("Queueing: " + parsed.origin + parsed.pathname); + crawler.queue(parsed.origin + parsed.pathname); + } + }); + } + }); } done(); } diff --git a/database.js b/database.js index 32365e7..de7083c 100644 --- a/database.js +++ b/database.js @@ -32,8 +32,8 @@ module.exports = { }) .catch(console.err); }, - exists: (index, type, id, callback) => { - return esClient.exists({ + exists: async (index, type, id) => { + return await esClient.exists({ index: index, type: type, id: id |