diff options
author | Marvin Borner | 2018-09-15 15:15:25 +0200 |
---|---|---|
committer | Marvin Borner | 2018-09-15 15:15:25 +0200 |
commit | 940b46f419fb25366af675f5ab6da651a5f9c965 (patch) | |
tree | 8db0e198da659760dfa8227832bd7a3b7902341f |
Added basic crawling feature :sparkles: :tada:
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | crawler.php | 127 | ||||
-rw-r--r-- | database.sql | 64 | ||||
-rw-r--r-- | mysql_conf.inc | 5 |
4 files changed, 197 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62c8935 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/
\ No newline at end of file diff --git a/crawler.php b/crawler.php new file mode 100644 index 0000000..1e4f401 --- /dev/null +++ b/crawler.php @@ -0,0 +1,127 @@ +<?php
+/**
+ * User: Marvin Borner
+ * Date: 14/09/2018
+ * Time: 23:48
+ */
+
+include "mysql_conf.inc";
+
+// The start URL is mandatory: without it every later step would operate on null.
+if (!isset($argv[1]) || trim($argv[1]) === "") {
+    print "Usage: php crawler.php <url>\n";
+    exit(1);
+}
+
+// URL of the page being crawled; read by the functions below via `global`.
+$currentUrl = $argv[1];
+crawlLoop();
+
+/**
+ * Runs one crawl pass over the URL held in the global $currentUrl:
+ * downloads the page, parses it, queues every link found on it and
+ * finally stores the page's own metadata.
+ */
+function crawlLoop()
+{
+    global $currentUrl;
+
+    // Download once and parse once; both extractors share the same XPath handle.
+    $xpath = createPathFromHtml(getContent($currentUrl));
+
+    $pageInfo = getUrlInfo($xpath);
+    $foundLinks = getLinks($xpath);
+
+    // Queue the outgoing links first, then persist the current page.
+    writeToQueue($foundLinks);
+    saveData($pageInfo);
+}
+
+
+/**
+ * Downloads the body of $url with cURL, following redirects.
+ *
+ * Prints the transferred size in KB on success, or the cURL error message
+ * when the transfer failed.
+ *
+ * @param string $url address to fetch
+ * @return string|bool the response body, or false when the transfer failed
+ */
+function getContent($url)
+{
+    $curl = curl_init($url);
+    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
+    // With RETURNTRANSFER the raw body is returned as-is; the former
+    // CURLOPT_BINARYTRANSFER flag is a deprecated no-op and is not set anymore.
+    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
+    $content = curl_exec($curl);
+
+    if ($content === false) {
+        // Report the failure instead of announcing a misleading 0KB download.
+        print "Download failed: " . curl_error($curl) . "\n";
+    } else {
+        print "Download Size: " . (curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000) . "KB\n";
+    }
+    curl_close($curl);
+
+    return $content;
+}
+
+/**
+ * Collects page metadata from a parsed document.
+ *
+ * Result keys:
+ *  - "language": the lang attribute of <html>, only when it is non-empty
+ *  - one entry per <meta>, keyed by its name attribute, holding its content
+ *  - one entry per <link>, keyed by its rel attribute, holding its href
+ *  - "title": text of the first <title>, only when the page has one
+ *
+ * @param DOMXPath $path XPath handle of the parsed page
+ * @return array metadata map as described above
+ */
+function getUrlInfo($path)
+{
+    $urlInfo = [];
+
+    foreach ($path->query("//html") as $html) {
+        $lang = $html->getAttribute("lang");
+        // A missing lang attribute reads as ""; leave the key unset so that
+        // consumers checking isset() can apply their own default.
+        if ($lang !== "") $urlInfo["language"] = $lang;
+    }
+    foreach ($path->query("//meta") as $meta) $urlInfo[$meta->getAttribute("name")] = $meta->getAttribute("content");
+    foreach ($path->query("//link") as $link) $urlInfo[$link->getAttribute("rel")] = $link->getAttribute("href");
+
+    // item(0) yields null for a page without <title>; guard before dereferencing.
+    $titleNode = $path->query("//title")->item(0);
+    if ($titleNode !== null) $urlInfo["title"] = $titleNode->textContent;
+
+    return $urlInfo;
+}
+
+/**
+ * Extracts the targets of all <a> elements as absolute URLs.
+ *
+ * Relative hrefs are resolved against the global $currentUrl: root-relative
+ * ones against its scheme/host/port, others against the directory of its path.
+ * Empty hrefs, pure fragments and non-HTTP schemes (mailto:, javascript:, ...)
+ * are skipped. Dot segments ("../") are not normalised.
+ *
+ * @param DOMXPath $path XPath handle of the parsed page
+ * @return array list of unique absolute URLs
+ */
+function getLinks($path)
+{
+    global $currentUrl;
+    $allLinks = [];
+
+    $base = parse_url((string) $currentUrl);
+    if (is_array($base) && isset($base["host"])) {
+        $scheme = isset($base["scheme"]) ? $base["scheme"] : "http";
+        $origin = $scheme . "://" . $base["host"] . (isset($base["port"]) ? ":" . $base["port"] : "");
+        $basePath = isset($base["path"]) ? $base["path"] : "/";
+    } else {
+        // Start URL without a recognisable host (e.g. given without scheme):
+        // fall back to plain concatenation onto the URL as it was passed in.
+        $scheme = "http";
+        $origin = rtrim((string) $currentUrl, "/");
+        $basePath = "/";
+    }
+    // Directory part of the current path, always ending in "/".
+    $baseDir = substr($basePath, 0, strrpos($basePath, "/") + 1);
+
+    foreach ($path->query("//a") as $link) {
+        $href = trim($link->getAttribute("href"));
+
+        // Nothing to crawl: missing href or a jump within the same page.
+        if ($href === "" || $href[0] === "#") continue;
+
+        if (preg_match('~^https?://~i', $href)) {
+            // already absolute
+        } else if (substr($href, 0, 2) === "//") {
+            $href = $scheme . ":" . $href; // protocol-relative
+        } else if (substr($href, 0, 3) === "www") {
+            $href = "http://" . $href;
+        } else if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href)) {
+            continue; // mailto:, javascript:, tel:, ftp: ... are not crawlable
+        } else if ($href[0] === "/") {
+            $href = $origin . $href; // root-relative
+        } else if ($href[0] === "?") {
+            $href = $origin . $basePath . $href; // query on the current page
+        } else {
+            $href = $origin . $baseDir . $href; // relative to the current directory
+        }
+
+        // if it's pure domain without slash (prevents duplicate domains because of slash)
+        if (preg_match('~^https?://[^/?#]+$~i', $href)) $href = $href . "/";
+
+        $allLinks[] = $href;
+    }
+
+    return array_values(array_unique($allLinks));
+}
+
+/**
+ * Parses an HTML string and returns an XPath handle for querying it.
+ *
+ * Parser warnings about malformed markup are suppressed and discarded.
+ * Empty or non-string content (e.g. false from a failed download) yields an
+ * XPath over an empty document, so every query simply returns no nodes.
+ *
+ * @param string|bool $content raw HTML, or false/"" when nothing was downloaded
+ * @return DOMXPath XPath handle bound to the parsed document
+ */
+function createPathFromHtml($content)
+{
+    $dom = new DOMDocument();
+
+    // Remember the caller's setting so it can be restored rather than overwritten.
+    $previous = libxml_use_internal_errors(true);
+    if (is_string($content) && $content !== "") {
+        $dom->loadHTML($content);
+    }
+    // Buffered errors are never inspected; drop them so they do not pile up.
+    libxml_clear_errors();
+    libxml_use_internal_errors($previous);
+
+    return new DOMXPath($dom);
+}
+
+/**
+ * Adds URLs to the `queue` table, skipping those whose hash is already
+ * present in `url_data`. Duplicates within the queue are dropped by the
+ * INSERT IGNORE.
+ *
+ * @param array $urls absolute URLs to enqueue
+ * @throws PDOException when connecting or executing a statement fails
+ */
+function writeToQueue($urls)
+{
+    $conn = initDbConnection();
+
+    // Both statements are loop-invariant: prepare once, execute per URL.
+    $checkStmt = $conn->prepare('SELECT hash FROM url_data where hash = :hash');
+    $insertStmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
+
+    foreach ($urls as $url) {
+        $hash = md5($url);
+
+        $checkStmt->execute([':hash' => $hash]);
+        // Fetch instead of rowCount(): PDO does not guarantee row counts for SELECT.
+        $alreadyCrawled = $checkStmt->fetchColumn() !== false;
+        $checkStmt->closeCursor();
+
+        if (!$alreadyCrawled) {
+            $insertStmt->execute([':url' => $url, ':hash' => $hash]);
+        }
+    }
+}
+
+/**
+ * Stores the metadata of the page at the global $currentUrl in `url_data`.
+ *
+ * Prints the URL first. Missing metadata falls back to an empty string
+ * ("en" for the language). Database errors are printed, not rethrown.
+ *
+ * @param array $urlInfo metadata map; reads "title", "description", "icon" and "language"
+ */
+function saveData($urlInfo)
+{
+    global $currentUrl;
+
+    print $currentUrl . "\n";
+
+    // Metadata key => value used when the page did not provide it.
+    $fallbacks = ["title" => "", "description" => "", "icon" => "", "language" => "en"];
+    $fields = [];
+    foreach ($fallbacks as $key => $fallback) {
+        $fields[$key] = isset($urlInfo[$key]) ? $urlInfo[$key] : $fallback;
+    }
+
+    $params = [
+        ':url' => $currentUrl,
+        ':title' => $fields["title"],
+        ':description' => $fields["description"],
+        ':icon' => $fields["icon"],
+        ':lang' => $fields["language"],
+        ':hash' => md5($currentUrl),
+    ];
+
+    try {
+        $insert = initDbConnection()->prepare('INSERT IGNORE INTO url_data (url, title, description, icon, lang, hash) VALUES (:url, :title, :description, :icon, :lang, :hash)');
+        $insert->execute($params);
+    } catch (PDOException $e) {
+        print $e->getMessage();
+    }
+}
+
+/**
+ * Opens a new PDO connection to the MySQL database named by the globals
+ * $servername, $dbname, $username and $password.
+ *
+ * @return PDO connection configured to throw PDOException on errors
+ * @throws PDOException when the connection cannot be established
+ */
+function initDbConnection()
+{
+    global $servername, $dbname, $username, $password;
+
+    // The tables are declared utf8mb4; pin the connection charset to match so
+    // non-ASCII text does not depend on the server's default.
+    $dsn = "mysql:host=$servername;dbname=$dbname;charset=utf8mb4";
+    $conn = new PDO($dsn, $username, $password);
+    $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+    return $conn;
+}
\ No newline at end of file diff --git a/database.sql b/database.sql new file mode 100644 index 0000000..29970b4 --- /dev/null +++ b/database.sql @@ -0,0 +1,64 @@ +-- MySQL dump 10.16 Distrib 10.1.26-MariaDB, for debian-linux-gnu (x86_64) +-- +-- Host: localhost Database: search_engine +-- ------------------------------------------------------ +-- Server version 10.1.26-MariaDB-0+deb9u1 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `queue` +-- + +DROP TABLE IF EXISTS `queue`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `queue` ( + `id` int(8) NOT NULL AUTO_INCREMENT, + `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL, + `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `queue_id_uindex` (`id`), + UNIQUE KEY `queue_hash_uindex` (`hash`) +) ENGINE=InnoDB AUTO_INCREMENT=557 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `url_data` +-- + +DROP TABLE IF EXISTS `url_data`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `url_data` ( + `id` int(8) NOT NULL AUTO_INCREMENT, + `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL, + `title` varchar(60) COLLATE utf8mb4_unicode_ci NOT NULL, + 
`description` varchar(350) COLLATE utf8mb4_unicode_ci NOT NULL, + `icon` varchar(256) COLLATE utf8mb4_unicode_ci NOT NULL, + `lang` varchar(3) COLLATE utf8mb4_unicode_ci NOT NULL, + `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `url_data_hash_uindex` (`hash`) +) ENGINE=InnoDB AUTO_INCREMENT=15 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2018-09-15 15:12:33 diff --git a/mysql_conf.inc b/mysql_conf.inc new file mode 100644 index 0000000..86d2a8f --- /dev/null +++ b/mysql_conf.inc @@ -0,0 +1,5 @@ +<?php
+// Connection settings for the crawler's MySQL database; initDbConnection()
+// in crawler.php reads these four variables via `global`.
+// NOTE(review): root/root looks like a local development default — confirm
+// these credentials are never used against a shared or production server.
+$servername = "127.0.0.1";
+$username = "root";
+$password = "root";
+$dbname = "search_engine";
\ No newline at end of file |