summaryrefslogtreecommitdiff
path: root/crawler.js
diff options
context:
space:
mode:
Diffstat (limited to 'crawler.js')
-rw-r--r--crawler.js42
1 file changed, 31 insertions, 11 deletions
diff --git a/crawler.js b/crawler.js
index 6b2d5b6..dee3ce1 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1,26 +1,46 @@
// crawler.js — breadth-first web crawler: starting from a seed URL, fetch each
// page, index its metadata into the "crawled"/"site" store (deduplicated by a
// SHA-256 hash of the URL), and queue every same-scheme link found on the page.
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");
const url = require("url"); // NOTE(review): unused here (global WHATWG URL is used) — kept in case other code relies on it
const crawler = new crawlService({
maxConnections: 10,
// Invoked once per fetched page (or per fetch failure).
callback: (error, res, done) => {
if (error) {
console.log(error);
// BUG FIX: done() must run on the error path too, otherwise each
// failed fetch permanently consumes one of the 10 connection slots.
done();
return;
}
const $ = res.$;
// The document id is the base64 SHA-256 of the page URL, so the same
// page is never indexed twice.
const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
// BUG FIX: the original tested `database.exists(...)` without negation,
// indexing only pages *already* present — a fresh run indexed nothing.
if (!database.exists("crawled", "site", urlHash)) {
console.log("\nCrawling: " + res.request.uri.href);
database.index('crawled', 'site', [
{
"id": urlHash,
"url": res.request.uri.href,
"title": $("title").text() || res.request.uri.href,
"description": $("meta[name=description]").attr("content") || "",
// BUG FIX: fall back to an empty *array* (not ""), keeping the
// field type consistent whether or not the meta tag exists.
"keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : []
}
]);

// Side effects only, so .each rather than .map.
$("a").each((i, tag) => {
const href = $(tag).attr("href");
// Guard: <a> without href would otherwise become new URL(undefined,
// base) and queue a bogus ".../undefined" path.
if (!href) return;
let parsed;
try {
parsed = new URL(href); // absolute URL
} catch (e) {
// Not absolute — resolve it relative to the current page.
parsed = new URL(href, res.request.uri.href);
}
// Opaque origins (mailto:, javascript:, data:, …) stringify to the
// literal "null" — skip those.
if (parsed.origin !== "null") {
console.log("Queueing: " + parsed.origin + parsed.pathname);
crawler.queue(parsed.origin + parsed.pathname);
}
});
}
done();
}
});
crawler.queue('http://stackoverflow.com');