From 940b46f419fb25366af675f5ab6da651a5f9c965 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Sat, 15 Sep 2018 15:15:25 +0200
Subject: Added basic crawling feature :sparkles: :tada:

---
 .gitignore     |   1 +
 crawler.php    | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 database.sql   |  64 +++++++++++++++++++++++++++++
 mysql_conf.inc |   5 +++
 4 files changed, 197 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 crawler.php
 create mode 100644 database.sql
 create mode 100644 mysql_conf.inc

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..62c8935
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/
\ No newline at end of file
diff --git a/crawler.php b/crawler.php
new file mode 100644
index 0000000..1e4f401
--- /dev/null
+++ b/crawler.php
@@ -0,0 +1,127 @@
+query("//html") as $html) $urlInfo["language"] = $html->getAttribute("lang");
+    foreach ($path->query("//meta") as $meta) $urlInfo[$meta->getAttribute("name")] = $meta->getAttribute("content");
+    foreach ($path->query("//link") as $link) $urlInfo[$link->getAttribute("rel")] = $link->getAttribute("href");
+    $urlInfo["title"] = $path->query("//title")[0]->textContent;
+
+    return $urlInfo;
+}
+
+/**
+ * Collects the crawlable link targets of a parsed page as absolute URLs.
+ *
+ * Links are resolved against the global $currentUrl: root-relative links
+ * ("/about") against its scheme and host, document-relative links
+ * ("next.html") against its directory, query-only links ("?page=2") against
+ * its path and protocol-relative links ("//host/x") against its scheme.
+ * Fragments are dropped; empty links, in-page anchors and non-HTTP schemes
+ * (mailto:, javascript:, tel:, ...) are skipped because they cannot be
+ * fetched. Dot segments ("../") are not normalised.
+ *
+ * @param DOMXPath $path XPath handle of the page, see createPathFromHtml()
+ * @return array unique absolute URLs (keys are not re-indexed)
+ */
+function getLinks($path)
+{
+    global $currentUrl;
+
+    // Split the page URL once; every relative link is resolved against these parts.
+    $base = parse_url((string)$currentUrl);
+    $scheme = isset($base["scheme"]) ? $base["scheme"] : "http";
+    if (isset($base["host"])) {
+        $origin = $scheme . "://" . $base["host"] . (isset($base["port"]) ? ":" . $base["port"] : "");
+        $basePath = isset($base["path"]) ? $base["path"] : "/";
+    } else {
+        // No recognisable host in $currentUrl: fall back to plain concatenation.
+        $origin = rtrim((string)$currentUrl, "/");
+        $basePath = "/";
+    }
+    // Everything up to and including the last slash of the page path.
+    $directory = $origin . substr($basePath, 0, strrpos($basePath, "/") + 1);
+
+    $allLinks = [];
+
+    foreach ($path->query("//a") as $anchor) {
+        $href = trim($anchor->getAttribute("href"));
+
+        // The fragment never changes which document is fetched, so it only creates duplicates.
+        $fragmentAt = strpos($href, "#");
+        if ($fragmentAt !== false) $href = (string)substr($href, 0, $fragmentAt);
+        if ($href === "") continue;
+
+        if (!preg_match('~^https?://~i', $href)) {
+            if (substr($href, 0, 2) === "//") $href = $scheme . ":" . $href;
+            else if (stripos($href, "www.") === 0) $href = "http://" . $href;
+            // any other scheme (mailto:, javascript:, tel:, ...) is not crawlable
+            else if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href)) continue;
+            else if ($href[0] === "/") $href = $origin . $href;
+            else if ($href[0] === "?") $href = $origin . $basePath . $href;
+            else $href = $directory . $href;
+        }
+
+        // Bare host without path or query: add the slash so "http://a.tld" and
+        // "http://a.tld/" are queued only once. File URLs such as ".../page.php" stay untouched.
+        if (preg_match('~^https?://[^/?]+$~i', $href)) $href .= "/";
+
+        $allLinks[] = $href;
+    }
+
+    return array_unique($allLinks);
+}
+
+/**
+ * Parses an HTML string and returns an XPath handle for querying it.
+ *
+ * @param string $content raw HTML of the fetched page
+ * @return DOMXPath XPath over the parsed document (over an empty document when $content is empty)
+ */
+function createPathFromHtml($content)
+{
+    $dom = new DOMDocument();
+
+    // loadHTML() rejects an empty string (warning before PHP 8, ValueError since).
+    if (is_string($content) && $content !== "") {
+        // Real-world markup is rarely valid: buffer the parser errors instead of printing them,
+        // then drop the buffer and restore whatever setting the caller had.
+        $previous = libxml_use_internal_errors(true);
+        $dom->loadHTML($content);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+    }
+
+    return new DOMXPath($dom);
+}
+
+/**
+ * Queues every URL that has not been crawled yet.
+ *
+ * A URL is identified by the md5 hash of its text. URLs whose hash already
+ * exists in url_data are skipped; the unique index on queue.hash together
+ * with INSERT IGNORE keeps an already queued URL from being added twice.
+ *
+ * @param array $urls absolute URLs, e.g. the result of getLinks()
+ * @throws PDOException when connecting or executing a statement fails
+ */
+function writeToQueue($urls)
+{
+    if (empty($urls)) return;
+
+    $conn = initDbConnection();
+
+    // Prepare once, execute per URL.
+    $checkStmt = $conn->prepare('SELECT hash FROM url_data where hash = :hash');
+    $insertStmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
+
+    foreach ($urls as $url) {
+        $hash = md5($url);
+
+        // rowCount() is not reliable for SELECT on every driver; fetch the row instead.
+        $checkStmt->execute([':hash' => $hash]);
+        $alreadyCrawled = $checkStmt->fetchColumn() !== false;
+        $checkStmt->closeCursor();
+
+        if (!$alreadyCrawled) $insertStmt->execute([':url' => $url, ':hash' => $hash]);
+    }
+}
+
+/**
+ * Stores the metadata of the page at the global $currentUrl in url_data.
+ *
+ * Missing values fall back to "" (title, description, icon) or "en"
+ * (language). The URL is printed before saving; database errors are printed
+ * rather than thrown.
+ *
+ * @param array $urlInfo page metadata keyed by name, as built by getUrlInfo()
+ */
+function saveData($urlInfo)
+{
+    global $currentUrl;
+
+    print $currentUrl . "\n";
+
+    $title = isset($urlInfo["title"]) ? $urlInfo["title"] : "";
+    $description = isset($urlInfo["description"]) ? $urlInfo["description"] : "";
+
+    // Favicons are announced as rel="icon" or the legacy rel="shortcut icon".
+    $icon = "";
+    foreach (["icon", "shortcut icon"] as $rel) {
+        if (!empty($urlInfo[$rel])) {
+            $icon = $urlInfo[$rel];
+            break;
+        }
+    }
+
+    // A missing lang attribute arrives as "", which isset() accepts, so test for emptiness.
+    // Only the primary subtag is kept ("en-US" -> "en") because url_data.lang holds 3 characters.
+    $language = isset($urlInfo["language"]) ? strtolower(trim($urlInfo["language"])) : "";
+    $language = (string)substr(preg_split('/[-_]/', $language)[0], 0, 3);
+    if ($language === "") $language = "en";
+
+    $hash = md5($currentUrl);
+
+    try {
+        $conn = initDbConnection();
+        $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, icon, lang, hash) VALUES (:url, :title, :description, :icon, :lang, :hash)');
+        $stmt->execute([':url' => $currentUrl, ':title' => $title, ':description' => $description, ':icon' => $icon, ':lang' => $language, ':hash' => $hash]);
+    } catch (PDOException $e) {
+        print $e->getMessage() . "\n";
+    }
+}
+
+/**
+ * Returns the PDO connection to the crawler database.
+ *
+ * The connection is opened on first use from the globals $servername,
+ * $dbname, $username and $password (presumably set in mysql_conf.inc) and is
+ * reused by later calls, so crawling a page no longer opens several
+ * connections.
+ *
+ * @return PDO connection in exception error mode
+ * @throws PDOException when the connection cannot be established
+ */
+function initDbConnection()
+{
+    global $servername, $dbname, $username, $password;
+    static $conn = null;
+
+    if ($conn === null) {
+        // The tables are utf8mb4; without an explicit charset the connection uses the
+        // server default and non-ASCII titles/descriptions may be stored incorrectly.
+        $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8mb4", $username, $password);
+        $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+    }
+
+    return $conn;
+}
\ No newline at end of file
diff --git a/database.sql b/database.sql
new file mode 100644
index 0000000..29970b4
--- /dev/null
+++ b/database.sql
@@ -0,0 +1,64 @@
+-- MySQL dump 10.16 Distrib 10.1.26-MariaDB, for debian-linux-gnu (x86_64)
+--
+-- Host: localhost Database: search_engine
+-- ------------------------------------------------------
+-- Server version 10.1.26-MariaDB-0+deb9u1
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
+/*!40103 SET TIME_ZONE='+00:00' */;
+/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
+/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
+/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
+/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
+
+--
+-- Table structure for table `queue`
+--
+
+DROP TABLE IF EXISTS `queue`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!40101 SET character_set_client = utf8 */;
+CREATE TABLE `queue` (
+  `id` int(8) NOT NULL AUTO_INCREMENT,
+  `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `queue_id_uindex` (`id`),
+  UNIQUE KEY `queue_hash_uindex` (`hash`)
+) ENGINE=InnoDB AUTO_INCREMENT=557 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Table structure for table `url_data`
+--
+
+DROP TABLE IF EXISTS `url_data`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!40101 SET character_set_client = utf8 */;
+CREATE TABLE `url_data` (
+  `id` int(8) NOT NULL AUTO_INCREMENT,
+  `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `title` varchar(60) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `description` varchar(350) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `icon` varchar(256) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `lang` varchar(3) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `url_data_hash_uindex` (`hash`)
+) ENGINE=InnoDB AUTO_INCREMENT=15 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
+
+/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
+/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
+/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
+/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
+
+-- Dump completed on 2018-09-15 15:12:33
diff --git a/mysql_conf.inc b/mysql_conf.inc
new file mode 100644
index 0000000..86d2a8f
--- /dev/null
+++ b/mysql_conf.inc
@@ -0,0 +1,5 @@
+