From 0e2b7fc11970b9c810c9b66739e0bd9130bf92ee Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sat, 15 Sep 2018 17:47:03 +0200 Subject: Some performance improvements :zap: --- crawler.php | 101 ++++++++++++++++++++++++++++++++++++++++++++++------------- database.sql | 20 +++++++++--- 2 files changed, 94 insertions(+), 27 deletions(-) diff --git a/crawler.php b/crawler.php index 1e4f401..7e3bcfd 100644 --- a/crawler.php +++ b/crawler.php @@ -5,36 +5,54 @@ * Time: 23:48 */ +error_reporting(E_ERROR | E_PARSE); + include "mysql_conf.inc"; $currentUrl = $argv[1]; -crawlLoop(); -function crawlLoop() +while (true) { + crawl($currentUrl); +} + +function crawl($url) { global $currentUrl; - $content = getContent($currentUrl); - $htmlPath = createPathFromHtml($content); - $urlInfo = getUrlInfo($htmlPath); - $allLinks = getLinks($htmlPath); + if (!alreadyCrawled(cleanUrl($url))) { + $requestResponse = getContent($url); + if ($requestResponse[1] != 404) { + print "Download Size: " . $requestResponse[2]; - writeToQueue($allLinks); - saveData($urlInfo); + $htmlPath = createPathFromHtml($requestResponse[0]); + $urlInfo = getUrlInfo($htmlPath); + $allLinks = getLinks($htmlPath); + + writeToQueue($allLinks); + saveData($urlInfo); + } + } + + $currentUrl = getFirstFromQueue(); // set new + removeFromQueue($currentUrl); + + return; } function getContent($url) { $curl = curl_init($url); + curl_setopt($curl, CURLOPT_USERAGENT, "Googlebot/2.1 (+http://www.google.com/bot.html)"); curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); curl_setopt($curl, CURLOPT_BINARYTRANSFER, true); $content = curl_exec($curl); - print "Download Size: " . curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n"; + $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE); + $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n"; curl_close($curl); - return $content; + return [$content, $responseCode, $downloadSize]; } function getUrlInfo($path) @@ -51,25 +69,37 @@ function getUrlInfo($path) function getLinks($path) { - global $currentUrl; $allLinks = []; foreach ($path->query("//a") as $ink) { - $href = ltrim($ink->getAttribute("href")); + $href = cleanUrl($ink->getAttribute("href")); + array_push($allLinks, $href); + } - if (!(substr($href, 0, 4) === "http")) { - if (substr($href, 0, 3) === "www") $href = "http://" . $href; - else if (substr($href, 0, 1) === "/") $href = $currentUrl . $href; - else $href = $currentUrl . $href; - } + return array_unique($allLinks); +} - // if it's pure domain without slash (prevents duplicate domains because of slash) - if (preg_match('/\w+\.\w{2,3}$/', $href)) $href = $href . "/"; +function cleanUrl($url) +{ + global $currentUrl; - array_push($allLinks, $href); + $url = ltrim($url); + + if (!(substr($url, 0, 4) === "http")) { + if (substr($url, 0, 3) === "www") $url = "http://" . $url; + else if (substr($url, 0, 1) === "/") $url = $currentUrl . $url; + else $url = $currentUrl . $url; } - return array_unique($allLinks); + // if it's pure domain without slash (prevents duplicate domains because of slash) + if (preg_match('/\w+\.\w{2,3}$/', $url)) $url = $url . "/"; + + // strip some things + $url = preg_replace('/([^:])(\/{2,})/', '$1/', $url); // double slashes + $url = strtok($url, '?'); // parameters + $url = strtok($url, '#'); // hash fragments + + return $url; } function createPathFromHtml($content) @@ -81,6 +111,15 @@ function createPathFromHtml($content) return new DOMXPath($dom); } +function getFirstFromQueue() +{ + $conn = initDbConnection(); + $checkStmt = $conn->prepare('SELECT url FROM queue LIMIT 1'); + $checkStmt->execute(); + + return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]["url"]; +} + function writeToQueue($urls) { $conn = initDbConnection(); @@ -88,7 +127,7 @@ function writeToQueue($urls) foreach ($urls as $url) { $hash = md5($url); - $checkStmt = $conn->prepare('SELECT hash FROM url_data where hash = :hash'); + $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash'); $checkStmt->execute(['hash' => $hash]); if ($checkStmt->rowCount() === 0) { $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); @@ -97,6 +136,15 @@ function writeToQueue($urls) } } +function removeFromQueue($url) +{ + $hash = md5($url); + + $conn = initDbConnection(); + $checkStmt = $conn->prepare('DELETE FROM queue where hash = :hash'); + $checkStmt->execute(['hash' => $hash]); +} + function saveData($urlInfo) { global $currentUrl; @@ -118,6 +166,15 @@ function saveData($urlInfo) } } +function alreadyCrawled($url) +{ + $hash = md5($url); + $conn = initDbConnection(); + $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash'); + $checkStmt->execute(['hash' => $hash]); + return $checkStmt->rowCount() !== 0; // return true if already crawled +} + function initDbConnection() { global $servername, $dbname, $username, $password; diff --git a/database.sql b/database.sql index 29970b4..54d6a6f 100644 --- a/database.sql +++ b/database.sql @@ -28,8 +28,13 @@ CREATE TABLE `queue` ( `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `queue_id_uindex` (`id`), - UNIQUE KEY `queue_hash_uindex` (`hash`) -) ENGINE=InnoDB AUTO_INCREMENT=557 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + UNIQUE KEY `queue_hash_uindex` (`hash`), + KEY `queue_hash_index` (`hash`) +) + ENGINE = InnoDB + AUTO_INCREMENT = 5504 + DEFAULT CHARSET = utf8mb4 + COLLATE = utf8mb4_unicode_ci; /*!40101 SET character_set_client = @saved_cs_client */; -- @@ -48,8 +53,13 @@ CREATE TABLE `url_data` ( `lang` varchar(3) COLLATE utf8mb4_unicode_ci NOT NULL, `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, PRIMARY KEY (`id`), - UNIQUE KEY `url_data_hash_uindex` (`hash`) -) ENGINE=InnoDB AUTO_INCREMENT=15 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + UNIQUE KEY `url_data_hash_uindex` (`hash`), + KEY `url_data_hash_index` (`hash`) +) + ENGINE = InnoDB + AUTO_INCREMENT = 59 + DEFAULT CHARSET = utf8mb4 + COLLATE = utf8mb4_unicode_ci; /*!40101 SET character_set_client = @saved_cs_client */; /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; @@ -61,4 +71,4 @@ CREATE TABLE `url_data` ( /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2018-09-15 15:12:33 +-- Dump completed on 2018-09-15 17:46:14 -- cgit v1.2.3