From 0fbf636361ee39a09ab984e6e2bad0ed989c4dde Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sun, 16 Sep 2018 14:25:08 +0200 Subject: Several small improvements :zap: --- crawler.php | 44 +++++++++++++++++++++++++++++--------------- database.sql | 23 ++++++++++++++++++++++- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/crawler.php b/crawler.php index 69728e8..45753f3 100644 --- a/crawler.php +++ b/crawler.php @@ -10,7 +10,7 @@ error_reporting(E_ERROR | E_PARSE); include 'mysql_conf.inc'; -$currentUrl = $argv[1]; +$currentUrl = $argv[1] ?? ''; while (true) { crawl($currentUrl); @@ -20,7 +20,12 @@ function crawl($url) { global $currentUrl; - if (!alreadyCrawled(cleanUrl($url))) { + if (alreadyCrawled(cleanUrl($url))) { + print "\t\e[91mUrl already crawled " . $url . "\n"; + + removeFromQueue($currentUrl); + $currentUrl = getFromQueue('ASC'); + } else { $requestResponse = getContent($url); if (preg_match('/2\d\d/', $requestResponse[1])) { // success print 'Download Size: ' . $requestResponse[2]; @@ -32,23 +37,23 @@ function crawl($url) writeToQueue($allLinks); saveData($urlInfo); + removeFromQueue($currentUrl); $currentUrl = getFromQueue('ASC'); // set new from start } else { - if ($requestResponse[1] === 429) { - $currentUrl = getFromQueue('DESC'); // set new from end - } - print "\t\e[91mError " . $requestResponse[1] . "\n"; + print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentUrl . 
"\n"; + + urlHasError($currentUrl); // prevents re-crawling of error url + removeFromQueue($currentUrl); + $currentUrl = getFromQueue('DESC'); // set new from end } } - - removeFromQueue($currentUrl); } function getContent($url) { $curl = curl_init($url); - curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)'); + curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'); curl_setopt($curl, CURLOPT_TIMEOUT, 5); curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); @@ -100,8 +105,8 @@ function getLinks($path) { $allLinks = []; - foreach ($path->query('//a') as $ink) { - $href = cleanUrl($ink->getAttribute('href')); + foreach ($path->query('//a') as $link) { + $href = cleanUrl($link->getAttribute('href')); $allLinks[] = $href; } @@ -114,7 +119,7 @@ function cleanUrl($url) $url = ltrim($url); - if (!(strpos($url, 'http') === 0)) { + if (filter_var($url, FILTER_VALIDATE_URL) === false && !(strpos($url, 'http') === 0)) { if (strpos($url, 'www') === 0) { $url = 'http://' . 
$url; } else if (strpos($url, '/') === 0) { @@ -184,7 +189,16 @@ function removeFromQueue($url) $conn = initDbConnection(); $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); - $checkStmt->execute(['hash' => $hash]); + $checkStmt->execute([':hash' => $hash]); +} + +function urlHasError($url) +{ + $hash = md5($url); + + $conn = initDbConnection(); + $checkStmt = $conn->prepare('INSERT INTO error_url (url, hash) VALUES (:url, :hash)'); + $checkStmt->execute([':url' => $url, ':hash' => $hash]); +} function saveData($urlInfo) { @@ -211,8 +225,8 @@ function alreadyCrawled($url) { $hash = md5($url); $conn = initDbConnection(); - $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash'); - $checkStmt->execute(['hash' => $hash]); + $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)'); + $checkStmt->execute([':hash' => $hash]); return $checkStmt->rowCount() !== 0; // return true if already crawled } diff --git a/database.sql b/database.sql index c53110f..1f7e2fe 100644 --- a/database.sql +++ b/database.sql @@ -15,6 +15,27 @@ /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; +-- +-- Table structure for table `error_url` +-- + +DROP TABLE IF EXISTS `error_url`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8 */; +CREATE TABLE `error_url` ( + `id` int(8) NOT NULL AUTO_INCREMENT, + `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL, + `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `error_url_hash_uindex` (`hash`), + UNIQUE KEY `error_url_id_uindex` (`id`), + KEY `error_url_hash_index` (`hash`) +) + ENGINE = InnoDB + DEFAULT CHARSET = utf8mb4 + COLLATE = utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + -- -- Table structure for table `queue` -- @@ -69,4 
+90,4 @@ CREATE TABLE `url_data` ( /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2018-09-16 0:57:12 +-- Dump completed on 2018-09-16 11:28:48 -- cgit v1.2.3