diff options
author | Marvin Borner | 2018-11-10 01:48:38 +0100 |
---|---|---|
committer | Marvin Borner | 2018-11-10 01:48:38 +0100 |
commit | ef5411a6e14c9ae6cbd1488122462f12ae197ce0 (patch) | |
tree | 00dcd148c0fb7648253bdc248858923bc228ca38 | |
parent | 2aba2904523fec49f9cb922ef3f2933aefa3b673 (diff) |
Improved performance
-rw-r--r-- | crawler.js | 3 |
1 file changed, 1 insertion, 2 deletions
@@ -1,9 +1,9 @@ const crawlService = require("crawler"); const crypto = require("crypto"); const database = require("./database"); -const url = require("url"); const crawler = new crawlService({ + skipDuplicates: true, userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", rateLimit: 100, // TODO: Dynamic rate limit setting depending on errors maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429) @@ -16,7 +16,6 @@ const crawler = new crawlService({ const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64"); database.exists("crawled", "site", urlHash).then(exists => { if (crawler.queueSize === 0 || !exists) { - console.log(crawler.queue()); console.log("\nCrawling: " + res.request.uri.href); database.index('crawled', 'site', [ { |