diff options
author | Marvin Borner | 2018-11-07 22:07:10 +0100 |
---|---|---|
committer | Marvin Borner | 2018-11-07 22:07:10 +0100 |
commit | 1e256f6575aabcc745cf7998d70ed8455d01d49b (patch) | |
tree | 8846d307e70e97d0d84033b2903835dafcd91ddb | |
parent | be2f575bca14f910c130012ebf0c3930a97f8a61 (diff) |
Implemented web crawling
-rw-r--r-- | app.js | 4 | ||||
-rw-r--r-- | crawler.js | 26 |
2 files changed, 26 insertions, 4 deletions
// crawler.js — crawls pages starting from a seed URL and indexes each page's
// title, meta description, and meta keywords into the 'crawled'/'site' index
// of the project database module. Module-level side effect: constructing the
// crawler and queueing the seed URL starts crawling on require().
const crawlService = require("crawler");
const crypto = require("crypto");
const database = require("./database");

const crawler = new crawlService({
    maxConnections: 10,
    // Invoked once per fetched page (or per fetch error).
    // error: network/parse failure; res: response with cheerio handle res.$;
    // done: must always be called to release the connection slot.
    callback: (error, res, done) => {
        if (error) {
            console.log(error);
        } else {
            const $ = res.$;
            // Meta tags are optional on real pages; cheerio's .attr() returns
            // undefined when the tag/attribute is absent. Guard before using —
            // the original called .split() on undefined and crashed on any
            // page without a keywords meta tag.
            const description = $("meta[name=description]").attr("content") || "";
            const keywordsAttr = $("meta[name=keywords]").attr("content");
            database.index('crawled', 'site', [
                {
                    // Stable document id: base64-encoded SHA-256 of the URL,
                    // so re-crawling the same page updates rather than duplicates.
                    "id": crypto.createHash('sha256').update(res.request.uri.href).digest('base64'),
                    "url": res.request.uri.href,
                    "title": $("title").text(),
                    "description": description,
                    "keywords": keywordsAttr ? keywordsAttr.split(", ") : []
                }
            ]);
        }
        // Always signal completion, even on error, so the pool drains.
        done();
    }
});

// Seed URL — crawling begins here. NOTE(review): presumably a placeholder
// starting point; confirm the intended seed set.
crawler.queue('http://www.amazon.com');