author     Marvin Borner    2018-11-08 21:21:54 +0100
committer  Marvin Borner    2018-11-08 21:21:54 +0100
commit     5d479ed430bd420c73ff1c2df01f7cf2665e5033 (patch)
tree       e13c69059d88e5a1bd6b2700432db4137213efc0
parent     d243f31f7d245efce4bd857b32dc758205c28758 (diff)
Added indexing and improved crawling algorithm
-rw-r--r--   crawler.js    42
-rw-r--r--   database.js    7
2 files changed, 38 insertions, 11 deletions
diff --git a/crawler.js b/crawler.js
index 6b2d5b6..dee3ce1 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1,26 +1,46 @@
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");
+const url = require("url");
const crawler = new crawlService({
maxConnections: 10,
callback: (error, res, done) => {
+ console.log(res);
+
if (error) {
console.log(error);
} else {
const $ = res.$;
- database.index('crawled', 'site', [
- {
- "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
- "url": res.request.uri.href,
- "title": $("title").text(),
- "description": $("meta[name=description]").attr("content"),
- "keywords": $("meta[name=keywords]").attr("content").split(", ")
- }
- ]);
+ const urlHash = crypto.createHash("sha256").update(res.request.uri.href).digest("base64");
+ if (database.exists("crawled", "site", urlHash)) {
+ console.log("\nCrawling: " + res.request.uri.href);
+ database.index('crawled', 'site', [
+ {
+ "id": urlHash,
+ "url": res.request.uri.href,
+ "title": $("title").text() || res.request.uri.href,
+ "description": $("meta[name=description]").attr("content") || "",
+ "keywords": $("meta[name=keywords]").attr("content") ? $("meta[name=keywords]").attr("content").split(", ") : ""
+ }
+ ]);
+
+ $("a").map((i, tag) => {
+ let parsed;
+ try {
+ parsed = new URL($(tag).attr("href"));
+ } catch (e) { // invalid url -> probably a path
+ parsed = new URL($(tag).attr("href"), res.request.uri.href);
+ }
+ if (parsed.origin !== "null") {
+ console.log("Queueing: " + parsed.origin + parsed.pathname);
+ crawler.queue(parsed.origin + parsed.pathname);
+ }
+ });
+ }
+ done();
}
- done();
}
});
-crawler.queue('http://www.amazon.com');
+crawler.queue('http://stackoverflow.com');
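
Note on the new guard: `database.exists()` wraps `esClient.exists()`, which returns a Promise, so the truthiness check in the callback always passes, and the condition also appears inverted (indexing presumably should happen when the page is *not* yet stored). Below is a minimal sketch, not part of this commit, of how the callback could await the check instead, assuming the legacy `elasticsearch` client where `exists()` resolves to a boolean when called without a callback:

```js
// Sketch only: await the existence check and skip pages that are already
// indexed. Assumes database.exists() resolves to a boolean.
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");

const crawler = new crawlService({
    maxConnections: 10,
    callback: async (error, res, done) => {
        if (error) {
            console.log(error);
            return done();
        }
        const urlHash = crypto
            .createHash("sha256")
            .update(res.request.uri.href)
            .digest("base64");
        const alreadyIndexed = await database.exists("crawled", "site", urlHash);
        if (!alreadyIndexed) {
            // index the page and queue its outgoing links, as in the diff above
        }
        done();
    }
});
```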
diff --git a/database.js b/database.js
index f48f87e..32365e7 100644
--- a/database.js
+++ b/database.js
@@ -32,6 +32,13 @@ module.exports = {
})
.catch(console.err);
},
+ exists: (index, type, id, callback) => {
+ return esClient.exists({
+ index: index,
+ type: type,
+ id: id
+ })
+ },
search: (index, body) => {
return esClient.search({index: index, body: body});
}
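
The new `exists` helper accepts a `callback` parameter but never passes it on, so callers effectively get the Promise form of `esClient.exists()`, which resolves to `true`/`false` in the legacy `elasticsearch` client. A usage sketch under that assumption follows; the URL is a placeholder, and note that the pre-existing `.catch(console.err)` a few lines up is likely meant to be `console.error`:

```js
// Usage sketch, assuming exists({index, type, id}) resolves to true/false.
// "https://example.com/" is only a placeholder URL for illustration.
const crypto = require("crypto");
const database = require("./database");

const urlHash = crypto
    .createHash("sha256")
    .update("https://example.com/")
    .digest("base64");

database.exists("crawled", "site", urlHash)
    .then((found) => {
        if (!found) {
            // document is not indexed yet; safe to index it here
        }
    })
    .catch(console.error);
```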