const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");

/**
 * Web crawler: fetches pages, extracts basic metadata and indexes each
 * page into the 'crawled'/'site' index of the database module.
 *
 * Each indexed record contains:
 *   id          - SHA-256 of the page URL (base64) so re-crawls overwrite
 *                 the same document instead of duplicating it
 *   url         - the fully resolved request URL
 *   title       - contents of the <title> tag ('' when absent)
 *   description - content of <meta name=description>, or undefined
 *   keywords    - <meta name=keywords> split on ", ", or [] when absent
 */
const crawler = new crawlService({
    maxConnections: 10,
    callback: (error, res, done) => {
        if (error) {
            // Use console.error so failures go to stderr, not stdout.
            console.error(error);
        } else {
            const $ = res.$;
            // cheerio's .attr() returns undefined when the element or
            // attribute is missing — guard before calling .split(), which
            // would otherwise throw a TypeError on pages without a
            // keywords meta tag.
            const keywords = $("meta[name=keywords]").attr("content");
            database.index('crawled', 'site', [
                {
                    "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
                    "url": res.request.uri.href,
                    "title": $("title").text(),
                    "description": $("meta[name=description]").attr("content"),
                    "keywords": keywords ? keywords.split(", ") : []
                }
            ]);
        }
        // Always release the connection slot, on success and on error.
        done();
    }
});

crawler.queue('http://www.amazon.com');