summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marvin Borner, 2018-11-10 01:48:38 +0100
committer: Marvin Borner, 2018-11-10 01:48:38 +0100
commit: ef5411a6e14c9ae6cbd1488122462f12ae197ce0 (patch)
tree: 00dcd148c0fb7648253bdc248858923bc228ca38
parent: 2aba2904523fec49f9cb922ef3f2933aefa3b673 (diff)
Improved performance
-rw-r--r-- crawler.js 3
1 file changed, 1 insertion, 2 deletions
diff --git a/crawler.js b/crawler.js
index ceacc52..0f09c4a 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1,9 +1,9 @@
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");
-const url = require("url");
const crawler = new crawlService({
+ skipDuplicates: true,
userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
rateLimit: 100, // TODO: Dynamic rate limit setting depending on errors
maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429)
@@ -16,7 +16,6 @@ const crawler = new crawlService({
const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
database.exists("crawled", "site", urlHash).then(exists => {
if (crawler.queueSize === 0 || !exists) {
- console.log(crawler.queue());
console.log("\nCrawling: " + res.request.uri.href);
database.index('crawled', 'site', [
{