diff options
Diffstat (limited to 'crawler/Database.php')
-rw-r--r-- | crawler/Database.php | 57 |
1 files changed, 33 insertions, 24 deletions
diff --git a/crawler/Database.php b/crawler/Database.php index 6d04009..0d500ad 100644 --- a/crawler/Database.php +++ b/crawler/Database.php @@ -13,46 +13,47 @@ class Database $conn = self::initDbConnection(); $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1'); - return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url']; + return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'] ?? ''; } - private static function initDbConnection(): PDO - { - global $servername, $dbname, $username, $password; - $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password); - $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); - return $conn; - } - - public static function insertIntoQueue($url): bool + public static function insertIntoQueue($urls) { - if (!self::alreadyCrawled($url)) { - $conn = self::initDbConnection(); - $hash = md5($url); - $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); - $stmt->execute([':url' => $url, 'hash' => $hash]); - return $stmt->rowCount() > 0; + foreach ($urls as $url) { + if (self::alreadyCrawled($url)) { + print "\t\e[91mUrl already queued " . $url . "\n"; + } else { + print "\t\e[92mQueueing url " . $url . "\n"; + $conn = self::initDbConnection(); + $hash = md5($url); + $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); + $stmt->execute([':url' => $url, 'hash' => $hash]); + } } } - public static function alreadyCrawled($url): bool + public static function removeFromQueue($url) { $hash = md5($url); $conn = self::initDbConnection(); - $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)'); + $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); $checkStmt->execute([':hash' => $hash]); - return $checkStmt->rowCount() !== 0; // return true if already crawled } - public static function removeFromQueue($url): void + public static function alreadyCrawled($url): bool { + print "\t\e[96mChecking if url already has been crawled " . $url . "\n"; $hash = md5($url); $conn = self::initDbConnection(); - $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); + $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)'); $checkStmt->execute([':hash' => $hash]); + $alreadyCrawled = $checkStmt->rowCount() !== 0; + if ($alreadyCrawled) { + print "\t\e[91mUrl already crawled " . $url . "\n"; + } + return $alreadyCrawled; // return true if already crawled } - public static function urlHasError($url): void + public static function urlHasError($url) { $hash = md5($url); $conn = self::initDbConnection(); @@ -60,10 +61,18 @@ class Database $checkStmt->execute([':url' => $url, 'hash' => $hash]); } - public static function saveUrlData($data): void + public static function saveUrlData($url, $data) { $conn = self::initDbConnection(); $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)'); - $stmt->execute([':url' => $data[0], ':title' => $data[1], ':description' => $data[2], ':lang' => $data[3], ':hash' => $data[4]]); + $stmt->execute([':url' => $url, ':title' => $data['title'] ?? '', ':description' => $data['description'] ?? '', ':lang' => $data['lang'] ?? 'en', ':hash' => md5($url)]); + } + + private static function initDbConnection(): PDO + { + global $servername, $dbname, $username, $password; + $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password); + $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + return $conn; } }
\ No newline at end of file |