| author | Marvin Borner | 2018-09-18 17:19:51 +0200 |
| --- | --- | --- |
| committer | Marvin Borner | 2018-09-18 17:19:51 +0200 |
| commit | 4a57c1fda6d50d655eb48ffb712bd43729ea5c10 (patch) | |
| tree | 2071b9a9c51eb5e4f267d4319a0694debeef5815 | |
| parent | 9e9442a222666c5aadb6db32140dadddcb807933 (diff) | |
Tried to fix several small things :bug: :construction:
| Mode | File | Changes |
| --- | --- | --- |
| -rw-r--r-- | crawler/Algorithms.php | 2 |
| -rw-r--r-- | crawler/CrawlController.php | 66 |
| -rw-r--r-- | crawler/Database.php | 57 |
| -rw-r--r-- | crawler/WebRequest.php | 2 |
| -rw-r--r-- | crawler/crawler.php | 4 |
| -rw-r--r-- | crawler/main.php | 7 |
6 files changed, 65 insertions, 73 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
index 66e6461..73d2ecc 100644
--- a/crawler/Algorithms.php
+++ b/crawler/Algorithms.php
@@ -47,7 +47,7 @@ class Algorithms
         foreach ($path->query('//a') as $link) {
             $linkHref = $link->getAttribute('href');
             if ($linkHref !== 'javascript:void(0)') {
-                $href = cleanUrl($linkHref);
+                $href = self::cleanUrl($linkHref);
                 $allLinks[] = $href;
             }
         }
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
index e5a270b..97edf25 100644
--- a/crawler/CrawlController.php
+++ b/crawler/CrawlController.php
@@ -5,70 +5,56 @@
  * Time: 23:48
  */

-include 'mysql_conf.inc';
+require_once 'mysql_conf.inc';
+require_once 'WebRequest.php';
+require_once 'Database.php';
+require_once 'Algorithms.php';

 class CrawlController
 {
-    public function __construct()
+    private static $currentlyCrawled;
+
+    public static function start($url = '')
     {
         set_time_limit(3600000);
         error_reporting(E_ERROR | E_PARSE);
-        $currentlyCrawled = $argv[1] ?? '';

-        while (true) {
-            crawl($currentlyCrawled);
-        }
+        self::$currentlyCrawled = $url;
+        self::crawl(self::$currentlyCrawled);
     }

-    public function crawl($url)
+    private static function crawl($url)
     {
-        global $currentlyCrawled;
-
         if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
-            print "\t\e[91mUrl already crawled " . $url . "\n";
-
-            Database::removeFromQueue($currentlyCrawled);
-            $currentlyCrawled = $this->getFromQueue('DESC');
+            Database::removeFromQueue(self::$currentlyCrawled);
+            self::$currentlyCrawled = Database::getFromQueue('DESC');
         } else {
-            $requestResponse = getContent($url);
-            $currentlyCrawled = $requestResponse[3];
+            $requestResponse = WebRequest::getContent($url);
+            self::$currentlyCrawled = $requestResponse[3];
             if (preg_match('/2\d\d/', $requestResponse[1])) { // success
                 print 'Download Size: ' . $requestResponse[2];

                 $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+
                 $urlInfo = Algorithms::getUrlInfo($htmlPath);
-                $allLinks = Algorithms::getLinks($htmlPath);
+                Database::saveUrlData(self::$currentlyCrawled, $urlInfo);

-                Database::writeToQueue($allLinks);
-                $this->saveData($urlInfo, $currentlyCrawled);
+                $allLinks = Algorithms::getLinks($htmlPath);
+                Database::insertIntoQueue($allLinks);

-                Database::removeFromQueue($currentlyCrawled);
-                $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+                Database::removeFromQueue(self::$currentlyCrawled);
+                self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+                print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
             } else {
-                print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";
+                print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";

-                Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
-                Database::removeFromQueue($currentlyCrawled);
-                $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+                Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
+                Database::removeFromQueue(self::$currentlyCrawled);
+                self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+                print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
             }
         }
     }
-
-    public function saveData($urlInfo, $url)
-    {
-        if ($url !== '') {
-            print "\e[96mFinished previous url - crawling: " . $url . "\n";
-
-            $title = $urlInfo['title'] ?? '';
-            $description = $urlInfo['description'] ?? '';
-            $language = $urlInfo['language'] ?? 'en';
-            $hash = md5($url);
-            $data = [$title, $description, $language, $hash];
-
-            Database::saveUrlData($data);
-        }
-    }
-
 }
\ No newline at end of file
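The refactored `CrawlController` now pulls in `mysql_conf.inc` via `require_once`. That file is not part of this commit, but `Database::initDbConnection()` reads four globals from it, so it presumably looks something like this minimal sketch (all values are placeholders, not taken from the repository):

```php
<?php
// mysql_conf.inc — hypothetical example; the real file is not in the repo.
// Database::initDbConnection() imports exactly these four variables via `global`.
$servername = 'localhost';    // MySQL host
$dbname     = 'crawler';      // schema holding the queue/url_data/error_url tables
$username   = 'crawler_user'; // placeholder credentials
$password   = 'secret';
```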
diff --git a/crawler/Database.php b/crawler/Database.php
index 6d04009..0d500ad 100644
--- a/crawler/Database.php
+++ b/crawler/Database.php
@@ -13,46 +13,47 @@ class Database
         $conn = self::initDbConnection();
         $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1');
-        return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'];
+        return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'] ?? '';
     }

-    private static function initDbConnection(): PDO
-    {
-        global $servername, $dbname, $username, $password;
-        $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password);
-        $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
-        return $conn;
-    }
-
-    public static function insertIntoQueue($url): bool
+    public static function insertIntoQueue($urls)
     {
-        if (!self::alreadyCrawled($url)) {
-            $conn = self::initDbConnection();
-            $hash = md5($url);
-            $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
-            $stmt->execute([':url' => $url, 'hash' => $hash]);
-            return $stmt->rowCount() > 0;
+        foreach ($urls as $url) {
+            if (self::alreadyCrawled($url)) {
+                print "\t\e[91mUrl already queued " . $url . "\n";
+            } else {
+                print "\t\e[92mQueueing url " . $url . "\n";
+                $conn = self::initDbConnection();
+                $hash = md5($url);
+                $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
+                $stmt->execute([':url' => $url, 'hash' => $hash]);
+            }
         }
     }

-    public static function alreadyCrawled($url): bool
+    public static function removeFromQueue($url)
     {
         $hash = md5($url);
         $conn = self::initDbConnection();
-        $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
+        $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
         $checkStmt->execute([':hash' => $hash]);
-        return $checkStmt->rowCount() !== 0; // return true if already crawled
     }

-    public static function removeFromQueue($url): void
+    public static function alreadyCrawled($url): bool
     {
+        print "\t\e[96mChecking if url already has been crawled " . $url . "\n";
         $hash = md5($url);
         $conn = self::initDbConnection();
-        $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
+        $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
         $checkStmt->execute([':hash' => $hash]);
+        $alreadyCrawled = $checkStmt->rowCount() !== 0;
+        if ($alreadyCrawled) {
+            print "\t\e[91mUrl already crawled " . $url . "\n";
+        }
+        return $alreadyCrawled; // return true if already crawled
     }

-    public static function urlHasError($url): void
+    public static function urlHasError($url)
     {
         $hash = md5($url);
         $conn = self::initDbConnection();
@@ -60,10 +61,18 @@ class Database
         $checkStmt->execute([':url' => $url, 'hash' => $hash]);
     }

-    public static function saveUrlData($data): void
+    public static function saveUrlData($url, $data)
     {
         $conn = self::initDbConnection();
         $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)');
-        $stmt->execute([':url' => $data[0], ':title' => $data[1], ':description' => $data[2], ':lang' => $data[3], ':hash' => $data[4]]);
+        $stmt->execute([':url' => $url, ':title' => $data['title'] ?? '', ':description' => $data['description'] ?? '', ':lang' => $data['lang'] ?? 'en', ':hash' => md5($url)]);
+    }
+
+    private static function initDbConnection(): PDO
+    {
+        global $servername, $dbname, $username, $password;
+        $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password);
+        $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+        return $conn;
+    }
 }
\ No newline at end of file
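The queries above imply a small schema: a `queue` table ordered by `id`, and `url_data`/`error_url` tables that `alreadyCrawled()` probes by `hash`. Since all three inserts use `INSERT IGNORE` keyed on `md5($url)`, each table needs a unique index on `hash`. Below is a hedged setup sketch — column types and sizes are assumptions; only the table and column names come from the statements in `Database.php`:

```php
<?php
// schema_sketch.php — hypothetical; inferred from the queries in Database.php.
require_once 'mysql_conf.inc';

$conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password);
$conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// getFromQueue() orders by id; INSERT IGNORE relies on the unique hash index.
$conn->exec('CREATE TABLE IF NOT EXISTS queue (
    id   INT AUTO_INCREMENT PRIMARY KEY,
    url  TEXT NOT NULL,
    hash CHAR(32) NOT NULL UNIQUE
)');

// saveUrlData() writes these five columns; alreadyCrawled() checks hash here.
$conn->exec('CREATE TABLE IF NOT EXISTS url_data (
    url         TEXT NOT NULL,
    title       TEXT,
    description TEXT,
    lang        VARCHAR(10),
    hash        CHAR(32) NOT NULL UNIQUE
)');

// urlHasError() records failed urls; alreadyCrawled() checks this table too.
$conn->exec('CREATE TABLE IF NOT EXISTS error_url (
    url  TEXT NOT NULL,
    hash CHAR(32) NOT NULL UNIQUE
)');
```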
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
index a72efc6..f25f31d 100644
--- a/crawler/WebRequest.php
+++ b/crawler/WebRequest.php
@@ -7,7 +7,7 @@ class WebRequest
 {
-    public function getContent($url)
+    public static function getContent($url): array
     {
         $curl = curl_init($url);
         curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
diff --git a/crawler/crawler.php b/crawler/crawler.php
index e69de29..b5df1dc 100644
--- a/crawler/crawler.php
+++ b/crawler/crawler.php
@@ -0,0 +1,4 @@
+<?php
+require_once 'CrawlController.php';
+
+CrawlController::start($argv[1]);
\ No newline at end of file
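With `main.php` removed (below), `crawler.php` becomes the entry point: it hands `$argv[1]` to `CrawlController::start()`, so a crawl is presumably started from the command line with a seed URL (placeholder shown):

```
php crawler.php 'https://example.com/'
```

Note that `crawler.php` passes `$argv[1]` without a fallback, so the script expects to be invoked with a seed URL even though `start($url = '')` declares a default.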
diff --git a/crawler/main.php b/crawler/main.php
deleted file mode 100644
index 041aed5..0000000
--- a/crawler/main.php
+++ /dev/null
@@ -1,7 +0,0 @@
-<?php
-/**
- * User: Marvin Borner
- * Date: 16/09/2018
- * Time: 21:26
- */
-