From 4d853c04be28f434c570390130de7a6e8836d5f6 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Thu, 8 Nov 2018 21:53:24 +0100
Subject: Improved rate limit handling (probably need several proxies)

---
 crawler.js | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/crawler.js b/crawler.js
index dee3ce1..607342e 100644
--- a/crawler.js
+++ b/crawler.js
@@ -4,12 +4,13 @@ const database = require("./database");
 const url = require("url");
 
 const crawler = new crawlService({
-    maxConnections: 10,
+    userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+    rateLimit: 100,
+    maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429)
     callback: (error, res, done) => {
-        console.log(res);
-
-        if (error) {
-            console.log(error);
+        if (error || res.statusCode !== 200) {
+            console.log("Error: " + error);
+            console.log("Code: " + res.statusCode);
         } else {
             const $ = res.$;
             const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
@@ -38,9 +39,9 @@ const crawler = new crawlService({
                     }
                 });
             }
-            done();
         }
+        done();
     }
 });
 
-crawler.queue('http://stackoverflow.com');
+crawler.queue('http://wikipedia.com');
--
cgit v1.2.3
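
Note on the "probably need several proxies" remark in the subject line: rateLimit and maxConnections only throttle a single IP, so a large crawl can still run into 429 responses. A minimal sketch of spreading requests over a proxy pool with node-crawler follows; the proxy URLs, the queueWithProxy helper, and the assumption that a per-task "proxy" option is forwarded to the underlying HTTP request are illustrative, not part of this commit.

const crawlService = require("crawler");

// Hypothetical proxy pool; replace with real endpoints.
const proxies = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080"
];
let next = 0;

const crawler = new crawlService({
    rateLimit: 100,    // delay between requests, as in this commit
    maxConnections: 1,
    callback: (error, res, done) => {
        if (error || res.statusCode !== 200) {
            console.log("Error: " + error);
            console.log("Code: " + (res && res.statusCode));
        }
        done(); // always release the slot, even after a 429
    }
});

// Round-robin assignment: each queued task carries its own proxy option.
function queueWithProxy(uri) {
    crawler.queue({ uri: uri, proxy: proxies[next++ % proxies.length] });
}

queueWithProxy("http://wikipedia.com");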