summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarvin Borner2018-11-09 20:30:13 +0100
committerMarvin Borner2018-11-09 20:30:13 +0100
commit2aba2904523fec49f9cb922ef3f2933aefa3b673 (patch)
tree8d924babcecfd436336d478fb2e93cdc27462ff8
parent4d853c04be28f434c570390130de7a6e8836d5f6 (diff)
Fixed endless database existing check
-rw-r--r--crawler.js58
-rw-r--r--database.js4
2 files changed, 35 insertions, 27 deletions
diff --git a/crawler.js b/crawler.js
index 607342e..ceacc52 100644
--- a/crawler.js
+++ b/crawler.js
@@ -5,7 +5,7 @@ const url = require("url");
const crawler = new crawlService({
userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
- rateLimit: 100,
+ rateLimit: 100, // TODO: Dynamic rate limit setting depending on errors
maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429)
callback: (error, res, done) => {
if (error || res.statusCode !== 200) {
@@ -14,31 +14,39 @@ const crawler = new crawlService({
} else {
const $ = res.$;
const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
- if (database.exists("crawled", "site", urlHash)) {
- console.log("\nCrawling: " + res.request.uri.href);
- database.index('crawled', 'site', [
- {
- "id": urlHash,
- "url": res.request.uri.href,
- "title": $("title").text() || res.request.uri.href,
- "description": $("meta[name=description]").attr("content") || "",
- "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : ""
- }
- ]);
+ database.exists("crawled", "site", urlHash).then(exists => {
+ if (crawler.queueSize === 0 || !exists) {
+ console.log(crawler.queue());
+ console.log("\nCrawling: " + res.request.uri.href);
+ database.index('crawled', 'site', [
+ {
+ "id": urlHash,
+ "url": res.request.uri.href,
+ "title": $("title").text() || res.request.uri.href,
+ "description": $("meta[name=description]").attr("content") || "",
+ "lang": $("html").attr("lang") || $("meta[http-equiv=content-language]").attr("content") || $("meta[http-equiv=language]").attr("content") || $("meta[name=language]").attr("content") || "en",
+ "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : ""
+ }
+ ]);
- $("a").map((i, tag) => {
- let parsed;
- try {
- parsed = new URL($(tag).attr("href"));
- } catch (e) { // invalid url -> probably a path
- parsed = new URL($(tag).attr("href"), res.request.uri.href);
- }
- if (parsed.origin !== "null") {
- console.log("Queueing: " + parsed.origin + parsed.pathname);
- crawler.queue(parsed.origin + parsed.pathname);
- }
- });
- }
+ $("a").map((i, tag) => {
+ let parsed;
+ try {
+ parsed = new URL($(tag).attr("href"));
+ } catch (e) { // invalid url -> probably a path
+ try {
+ parsed = new URL($(tag).attr("href"), res.request.uri.href);
+ } catch (e) {
+ parsed = null;
+ }
+ }
+ if (parsed !== null && parsed.origin !== "null") {
+ //console.log("Queueing: " + parsed.origin + parsed.pathname);
+ crawler.queue(parsed.origin + parsed.pathname);
+ }
+ });
+ }
+ });
}
done();
}
diff --git a/database.js b/database.js
index 32365e7..de7083c 100644
--- a/database.js
+++ b/database.js
@@ -32,8 +32,8 @@ module.exports = {
})
.catch(console.err);
},
- exists: (index, type, id, callback) => {
- return esClient.exists({
+ exists: async (index, type, id) => {
+ return await esClient.exists({
index: index,
type: type,
id: id