summary refs log tree commit diff
diff options
context:
space:
mode:
authorMarvin Borner2018-11-07 22:07:10 +0100
committerMarvin Borner2018-11-07 22:07:10 +0100
commit1e256f6575aabcc745cf7998d70ed8455d01d49b (patch)
tree8846d307e70e97d0d84033b2903835dafcd91ddb
parentbe2f575bca14f910c130012ebf0c3930a97f8a61 (diff)
Implemented web crawling
-rw-r--r--app.js4
-rw-r--r--crawler.js26
2 files changed, 26 insertions, 4 deletions
diff --git a/app.js b/app.js
index 365c6bc..761d682 100644
--- a/app.js
+++ b/app.js
@@ -8,10 +8,6 @@ const indexRouter = require('./routes/index');
const app = express();
-// view engine setup
-app.set('views', path.join(__dirname, 'views'));
-app.set('view engine', 'twig');
-
app.use(logger('dev'));
app.use(express.json());
app.use(express.urlencoded({extended: false}));
diff --git a/crawler.js b/crawler.js
new file mode 100644
index 0000000..6b2d5b6
--- /dev/null
+++ b/crawler.js
@@ -0,0 +1,26 @@
+const crawlService = require("crawler");
+const crypto = require("crypto");
+const database = require("./database");
+
+const crawler = new crawlService({
+ maxConnections: 10,
+ callback: (error, res, done) => {
+ if (error) {
+ console.log(error);
+ } else {
+ const $ = res.$;
+ database.index('crawled', 'site', [
+ {
+ "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
+ "url": res.request.uri.href,
+ "title": $("title").text(),
+ "description": $("meta[name=description]").attr("content"),
+ "keywords": $("meta[name=keywords]").attr("content").split(", ")
+ }
+ ]);
+ }
+ done();
+ }
+});
+
+crawler.queue('http://www.amazon.com');