From cbe2b6db3b2c439a85cf0286720d7ebb235277a7 Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Wed, 19 Sep 2018 18:39:12 +0200 Subject: Abstracted database :wrench: :package: --- crawler/CrawlController.php | 4 +-- crawler/Database.php | 79 ---------------------------------------- crawler/mysql_conf.inc | 5 --- database/Database.php | 87 +++++++++++++++++++++++++++++++++++++++++++++ database/mysqlConf.inc | 5 +++ 5 files changed, 94 insertions(+), 86 deletions(-) delete mode 100644 crawler/Database.php delete mode 100644 crawler/mysql_conf.inc create mode 100644 database/Database.php create mode 100644 database/mysqlConf.inc diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php index 53d5aac..5b20b75 100644 --- a/crawler/CrawlController.php +++ b/crawler/CrawlController.php @@ -6,9 +6,9 @@ header('Content-type: text/plain; charset=utf-8'); * Time: 23:48 */ -require_once 'mysql_conf.inc'; +require_once '../database/mysqlConf.inc'; +require_once '../database/Database.php'; require_once 'WebRequest.php'; -require_once 'Database.php'; require_once 'Algorithms.php'; class CrawlController diff --git a/crawler/Database.php b/crawler/Database.php deleted file mode 100644 index f27803e..0000000 --- a/crawler/Database.php +++ /dev/null @@ -1,79 +0,0 @@ -<?php -header('Content-type: text/plain; charset=utf-8'); -/** - * User: Marvin Borner - * Date: 16/09/2018 - * Time: 21:34 - */ - -class Database -{ - public static function getFromQueue($sort): string - { - print "\t\e[96mStarting at " . ($sort === 'DESC' ? 'bottom' : 'top') . " of queue\n"; - $conn = self::initDbConnection(); - $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1'); - - return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'] ?? ''; - } - - public static function insertIntoQueue($urls) - { - foreach ($urls as $url) { - if (self::alreadyCrawled($url)) { - print "\t\e[91mUrl already queued " . $url . "\n"; - } else { - print "\t\e[92mQueueing url " . $url . "\n"; - $conn = self::initDbConnection(); - $hash = md5($url); - $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); - $stmt->execute([':url' => $url, 'hash' => $hash]); - } - } - } - - public static function removeFromQueue($url) - { - $hash = md5($url); - $conn = self::initDbConnection(); - $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); - $checkStmt->execute([':hash' => $hash]); - } - - public static function alreadyCrawled($url): bool - { - print "\t\e[96mChecking if url has already been crawled " . $url . "\n"; - $hash = md5($url); - $conn = self::initDbConnection(); - $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)'); - $checkStmt->execute([':hash' => $hash]); - $alreadyCrawled = $checkStmt->rowCount() !== 0; - if ($alreadyCrawled) { - print "\t\e[91mUrl already crawled " . $url . "\n"; - } - return $alreadyCrawled; // return true if already crawled - } - - public static function urlHasError($url) - { - $hash = md5($url); - $conn = self::initDbConnection(); - $checkStmt = $conn->prepare('INSERT IGNORE INTO error_url (url, hash) VALUES (:url, :hash)'); - $checkStmt->execute([':url' => $url, 'hash' => $hash]); - } - - public static function saveUrlData($url, $data) - { - $conn = self::initDbConnection(); - $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)'); - $stmt->execute([':url' => $url, ':title' => $data['title'] ?? '', ':description' => $data['description'] ?? '', ':lang' => $data['lang'] ?? 'en', ':hash' => md5($url)]); - } - - private static function initDbConnection(): PDO - { - global $servername, $dbname, $username, $password; - $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password); - $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); - return $conn; - } -} \ No newline at end of file diff --git a/crawler/mysql_conf.inc b/crawler/mysql_conf.inc deleted file mode 100644 index 5d70dd5..0000000 --- a/crawler/mysql_conf.inc +++ /dev/null @@ -1,5 +0,0 @@ -<?php -$servername = '127.0.0.1'; -$username = 'root'; -$password = 'root'; -$dbname = 'search_engine'; \ No newline at end of file diff --git a/database/Database.php b/database/Database.php new file mode 100644 index 0000000..51e1430 --- /dev/null +++ b/database/Database.php @@ -0,0 +1,87 @@ +<?php +header('Content-type: text/plain; charset=utf-8'); +/** + * User: Marvin Borner + * Date: 16/09/2018 + * Time: 21:34 + */ + +class Database +{ + public static function getFromQueue($sort): string + { + print "\t\e[96mStarting at " . ($sort === 'DESC' ? 'bottom' : 'top') . " of queue\n"; + $conn = self::initDbConnection(); + $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1'); + + return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'] ?? ''; + } + + public static function insertIntoQueue($urls) + { + foreach ($urls as $url) { + if (self::alreadyCrawled($url)) { + print "\t\e[91mUrl already queued " . $url . "\n"; + } else { + print "\t\e[92mQueueing url " . $url . "\n"; + $conn = self::initDbConnection(); + $hash = md5($url); + $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); + $stmt->execute([':url' => $url, 'hash' => $hash]); + } + } + } + + public static function removeFromQueue($url) + { + $hash = md5($url); + $conn = self::initDbConnection(); + $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); + $checkStmt->execute([':hash' => $hash]); + } + + public static function alreadyCrawled($url): bool + { + print "\t\e[96mChecking if url has already been crawled " . $url . "\n"; + $hash = md5($url); + $conn = self::initDbConnection(); + $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)'); + $checkStmt->execute([':hash' => $hash]); + $alreadyCrawled = $checkStmt->rowCount() !== 0; + if ($alreadyCrawled) { + print "\t\e[91mUrl already crawled " . $url . "\n"; + } + return $alreadyCrawled; // return true if already crawled + } + + public static function urlHasError($url) + { + $hash = md5($url); + $conn = self::initDbConnection(); + $checkStmt = $conn->prepare('INSERT IGNORE INTO error_url (url, hash) VALUES (:url, :hash)'); + $checkStmt->execute([':url' => $url, 'hash' => $hash]); + } + + public static function saveUrlData($url, $data) + { + $conn = self::initDbConnection(); + $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)'); + $stmt->execute([':url' => $url, ':title' => $data['title'] ?? '', ':description' => $data['description'] ?? '', ':lang' => $data['lang'] ?? 'en', ':hash' => md5($url)]); + } + + public static function getUrlData($query) + { + $conn = self::initDbConnection(); + $checkStmt = $conn->prepare('SELECT url, title, description, lang FROM url_data WHERE title LIKE :query OR description LIKE :query'); + $checkStmt->execute([':query' => '%' . $query . '%']); + return $checkStmt->fetchAll(PDO::FETCH_ASSOC); + } + + private static function initDbConnection(): PDO + { + global $servername, $dbname, $username, $password; + $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password); + $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + return $conn; + } +} \ No newline at end of file diff --git a/database/mysqlConf.inc b/database/mysqlConf.inc new file mode 100644 index 0000000..5d70dd5 --- /dev/null +++ b/database/mysqlConf.inc @@ -0,0 +1,5 @@ +<?php +$servername = '127.0.0.1'; +$username = 'root'; +$password = 'root'; +$dbname = 'search_engine'; \ No newline at end of file -- cgit v1.2.3