summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marvin Borner, 2018-11-08 21:53:24 +0100
committer: Marvin Borner, 2018-11-08 21:53:24 +0100
commit: 4d853c04be28f434c570390130de7a6e8836d5f6 (patch)
tree: 82f703a81e86c5de4ca2f72e275d517cc7b22594
parent: 5d479ed430bd420c73ff1c2df01f7cf2665e5033 (diff)
Improved rate limit handling (probably need several proxies)
-rw-r--r-- crawler.js 15
1 files changed, 8 insertions, 7 deletions
diff --git a/crawler.js b/crawler.js
index dee3ce1..607342e 100644
--- a/crawler.js
+++ b/crawler.js
@@ -4,12 +4,13 @@ const database = require("./database");
const url = require("url");
const crawler = new crawlService({
- maxConnections: 10,
+ userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+ rateLimit: 100,
+ maxConnections: 1, // set to 10 (and remove the line above) for faster crawling but higher probability of rate limiting (429)
callback: (error, res, done) => {
- console.log(res);
-
- if (error) {
- console.log(error);
+ if (error || res.statusCode !== 200) {
+ console.log("Error: " + error);
+ console.log("Code: " + res.statusCode);
} else {
const $ = res.$;
const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
@@ -38,9 +39,9 @@ const crawler = new crawlService({
}
});
}
- done();
}
+ done();
}
});
-crawler.queue('http://stackoverflow.com');
+crawler.queue('http://wikipedia.com');