diff options
Diffstat (limited to 'crawler')
-rw-r--r-- | crawler/Algorithms.php | 107 | ||||
-rw-r--r-- | crawler/CrawlController.php | 74 | ||||
-rw-r--r-- | crawler/Database.php | 69 | ||||
-rw-r--r-- | crawler/WebRequest.php | 29 | ||||
-rw-r--r-- | crawler/crawler.php | 0 | ||||
-rw-r--r-- | crawler/main.php | 7 | ||||
-rw-r--r-- | crawler/mysql_conf.inc | 5 |
7 files changed, 291 insertions, 0 deletions
<?php
/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:51
 */

/**
 * Static helpers for extracting data from crawled HTML documents and for
 * normalising URLs before they are queued.
 */
class Algorithms
{
    /**
     * Extracts title, language and description from a parsed document.
     *
     * Falls back to concatenated <p> text (capped around 350 chars) when no
     * meta description exists, and to the first <h1> when <title> is empty.
     *
     * @param DOMXPath $path XPath wrapper around the crawled document
     * @return array with keys 'title', 'description' and (if present) 'language'
     */
    public static function getUrlInfo($path): array
    {
        $urlInfo = [];

        // BUG FIX: a document without a <title> node made [0]->textContent
        // dereference null (fatal in PHP 8) — guard the first match.
        $titleNode = $path->query('//title')[0] ?? null;
        $urlInfo['title'] = $titleNode !== null ? strip_tags($titleNode->textContent) : '';

        foreach ($path->query('//html') as $language) {
            $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
        }
        foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
            $urlInfo['description'] = strip_tags($description->getAttribute('content'));
        }

        // Fallback description: paragraph text until roughly 350 characters.
        if (!isset($urlInfo['description'])) {
            $urlInfo['description'] = '';
            foreach ($path->query('//p') as $text) {
                if (strlen($urlInfo['description']) < 350) {
                    $urlInfo['description'] .= $text->textContent . ' ';
                }
            }
        }

        // Fallback title: first <h1>, if any (BUG FIX: was an unguarded [0]).
        if (empty($urlInfo['title'])) {
            $urlInfo['title'] = '';
            $heading = $path->query('//h1')[0] ?? null;
            if ($heading !== null && strlen($urlInfo['title']) < 350) {
                $urlInfo['title'] .= $heading->textContent . ' ';
            }
        }

        print "\t\e[92mFound data: " . $urlInfo['title'] . "\n";

        return $urlInfo;
    }

    /**
     * Collects all unique, cleaned href targets from the document.
     *
     * @param DOMXPath $path
     * @return array de-duplicated list of cleaned URLs
     */
    public static function getLinks($path): array
    {
        $allLinks = [];

        foreach ($path->query('//a') as $link) {
            $linkHref = $link->getAttribute('href');
            if ($linkHref !== 'javascript:void(0)') {
                // BUG FIX: was a bare cleanUrl() call — an undefined global
                // function; it must go through the class.
                $allLinks[] = self::cleanUrl($linkHref);
            }
        }

        return array_unique($allLinks);
    }

    /**
     * Parses an HTML string into a DOMXPath, silencing libxml's warnings
     * about real-world (invalid) markup.
     */
    public static function createPathFromHtml($content): \DOMXPath
    {
        $dom = new DOMDocument();
        libxml_use_internal_errors(true);
        $dom->loadHTML($content);
        libxml_use_internal_errors(false);
        return new DOMXPath($dom);
    }

    /**
     * Normalises a (possibly relative) URL into an absolute, canonical form.
     * Relative links are resolved against the global $currentlyCrawled URL.
     */
    public static function cleanUrl($url): string
    {
        global $currentlyCrawled;

        $newUrl = ltrim($url); // trim leading whitespace

        // normally only for links/href
        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || strpos($newUrl, 'http') !== 0) {
            if (strpos($newUrl, 'www') === 0) {
                $newUrl = 'http://' . $newUrl; // fixes e.g. "www.example.com" by prepending a scheme
            } else if (strpos($newUrl, 'javascript:') === 0) {
                $newUrl = ''; // drop javascript pseudo-links
            } else if (strpos($newUrl, '../') === 0) {
                $parsedUrl = parse_url($currentlyCrawled);
                // BUG FIX: the "../" segments must be counted in the LINK, not
                // in the current URL's path, and dirname() must never receive
                // fewer than one level (ValueError in PHP 8).
                // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
                $backCount = max(1, substr_count($newUrl, '../'));
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host']
                    . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl;
            } else if (strpos($newUrl, '/') === 0) {
                $parsedUrl = parse_url($currentlyCrawled);
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // host-relative link
            } else {
                $newUrl = $currentlyCrawled . $newUrl; // path-relative link
            }
        }

        // if it's a pure domain without a slash (prevents duplicate domains because of slash)
        if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
            $newUrl .= '/';
        }

        // strip double slashes (outside the scheme), query strings and hash fragments
        $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl);
        $newUrl = strtok($newUrl, '?');
        $newUrl = strtok($newUrl, '#');

        if ($url !== $newUrl) {
            print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
        }

        return $newUrl;
    }
}
<?php
/**
 * User: Marvin Borner
 * Date: 14/09/2018
 * Time: 23:48
 */

include 'mysql_conf.inc';

/**
 * Drives the crawl loop: pops URLs from the queue, downloads them, extracts
 * page data and outgoing links, and persists everything via Database.
 */
class CrawlController
{
    public function __construct()
    {
        set_time_limit(3600000);
        error_reporting(E_ERROR | E_PARSE);

        // BUG FIX: $argv is not in scope inside a method; it (and the shared
        // crawl pointer) must be imported from the global scope.
        global $argv, $currentlyCrawled;
        $currentlyCrawled = $argv[1] ?? '';

        while (true) {
            // BUG FIX: crawl() is a method, not a global function.
            $this->crawl($currentlyCrawled);
        }
    }

    /**
     * Crawls a single URL: skips it when already seen, otherwise downloads,
     * extracts and stores its data, then advances $currentlyCrawled.
     */
    public function crawl($url)
    {
        global $currentlyCrawled;

        if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
            print "\t\e[91mUrl already crawled " . $url . "\n";

            Database::removeFromQueue($currentlyCrawled);
            // BUG FIX: getFromQueue() lives on Database, not on $this.
            $currentlyCrawled = Database::getFromQueue('DESC');
        } else {
            // BUG FIX: getContent() is a WebRequest method, not a global function.
            $requestResponse = (new WebRequest())->getContent($url);
            $currentlyCrawled = $requestResponse[3];
            // BUG FIX: anchored the pattern so only genuine 2xx codes match.
            if (preg_match('/^2\d\d$/', (string) $requestResponse[1])) { // success
                print 'Download Size: ' . $requestResponse[2];

                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
                $urlInfo = Algorithms::getUrlInfo($htmlPath);
                $allLinks = Algorithms::getLinks($htmlPath);

                // BUG FIX: Database has no writeToQueue(); queue each link
                // through the existing insertIntoQueue().
                foreach ($allLinks as $link) {
                    Database::insertIntoQueue($link);
                }
                $this->saveData($urlInfo, $currentlyCrawled);

                Database::removeFromQueue($currentlyCrawled);
                $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
            } else {
                print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";

                Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
                Database::removeFromQueue($currentlyCrawled);
                $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
            }
        }
    }

    /**
     * Persists the extracted page information for $url.
     */
    public function saveData($urlInfo, $url)
    {
        if ($url !== '') {
            print "\e[96mFinished previous url - crawling: " . $url . "\n";

            $title = $urlInfo['title'] ?? '';
            $description = $urlInfo['description'] ?? '';
            $language = $urlInfo['language'] ?? 'en';
            $hash = md5($url);
            // BUG FIX: Database::saveUrlData() reads five entries in the order
            // [url, title, description, lang, hash]; the url was missing,
            // shifting every column by one and leaving index 4 undefined.
            Database::saveUrlData([$url, $title, $description, $language, $hash]);
        }
    }
}
<?php
/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:34
 */

/**
 * Thin PDO wrapper around the crawler's queue/result tables.
 * Connection settings come from mysql_conf.inc ($servername, $dbname, ...).
 */
class Database
{
    /**
     * Returns the next URL from the queue, or '' when the queue is empty.
     *
     * @param string $sort 'DESC' to pop from the bottom, anything else => top
     */
    public static function getFromQueue($sort): string
    {
        print "\t\e[96mStarting at " . ($sort === 'DESC' ? 'bottom' : 'top') . " of queue\n";
        $conn = self::initDbConnection();
        // BUG FIX: $sort was interpolated into the SQL unchecked; ORDER BY
        // direction cannot be bound as a parameter, so whitelist it instead.
        $direction = $sort === 'DESC' ? 'DESC' : 'ASC';
        $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $direction . ' LIMIT 1');

        // BUG FIX: an empty queue made fetchAll()[0]['url'] an undefined-index
        // access (and a TypeError against the string return type).
        $row = $checkStmt->fetch(PDO::FETCH_ASSOC);
        return $row['url'] ?? '';
    }

    /**
     * Opens a fresh PDO connection with exceptions enabled.
     */
    private static function initDbConnection(): PDO
    {
        global $servername, $dbname, $username, $password;
        $conn = new PDO("mysql:host=$servername;dbname=$dbname;charset=utf8", $username, $password);
        $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        return $conn;
    }

    /**
     * Queues a URL unless it was already crawled; returns true when a new
     * row was actually inserted.
     */
    public static function insertIntoQueue($url): bool
    {
        if (!self::alreadyCrawled($url)) {
            $conn = self::initDbConnection();
            $hash = md5($url);
            $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
            $stmt->execute([':url' => $url, ':hash' => $hash]);
            return $stmt->rowCount() > 0;
        }
        // BUG FIX: the method declares `: bool` but fell through without a
        // return value (TypeError in PHP 7.1+).
        return false;
    }

    /**
     * True when the URL already exists in url_data or error_url.
     */
    public static function alreadyCrawled($url): bool
    {
        $hash = md5($url);
        $conn = self::initDbConnection();
        // BUG FIX: the same :hash placeholder was used twice, which only works
        // with emulated prepares; distinct names are portable. Likewise,
        // rowCount() is not guaranteed for SELECTs — fetch a row instead.
        $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash1) UNION (SELECT null FROM error_url WHERE hash = :hash2)');
        $checkStmt->execute([':hash1' => $hash, ':hash2' => $hash]);
        return $checkStmt->fetch() !== false; // true if already crawled
    }

    /**
     * Deletes the queue entry matching this URL's hash.
     */
    public static function removeFromQueue($url): void
    {
        $hash = md5($url);
        $conn = self::initDbConnection();
        $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
        $checkStmt->execute([':hash' => $hash]);
    }

    /**
     * Records the URL as erroneous so it is never re-crawled.
     */
    public static function urlHasError($url): void
    {
        $hash = md5($url);
        $conn = self::initDbConnection();
        $checkStmt = $conn->prepare('INSERT IGNORE INTO error_url (url, hash) VALUES (:url, :hash)');
        $checkStmt->execute([':url' => $url, ':hash' => $hash]);
    }

    /**
     * Persists extracted page data.
     *
     * @param array $data positional: [url, title, description, lang, hash]
     */
    public static function saveUrlData($data): void
    {
        $conn = self::initDbConnection();
        $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)');
        $stmt->execute([':url' => $data[0], ':title' => $data[1], ':description' => $data[2], ':lang' => $data[3], ':hash' => $data[4]]);
    }
}
<?php
/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:53
 */

/**
 * Minimal cURL wrapper used by the crawler to download pages.
 */
class WebRequest
{
    /**
     * Downloads $url (following redirects, 5s timeout) and returns
     * [body, httpCode, humanReadableSize, finalUrl].
     *
     * body is false on transport failure; finalUrl is the post-redirect URL.
     */
    public function getContent($url)
    {
        $curl = curl_init($url);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
        curl_setopt($curl, CURLOPT_ENCODING, ''); // accept any encoding cURL supports
        curl_setopt($curl, CURLOPT_TIMEOUT, 5);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        // NOTE: CURLOPT_BINARYTRANSFER removed — it has no effect since PHP 5.1.3.
        $content = curl_exec($curl);
        $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
        // BUG FIX: the old code grepped the response BODY for a "Location:"
        // header, but headers are never in $content (CURLOPT_HEADER is off).
        // CURLINFO_EFFECTIVE_URL reports the URL after redirects directly.
        $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
        curl_close($curl);

        return [$content, $responseCode, $downloadSize, $updatedUrl ?: $url];
    }
}
\ No newline at end of file diff --git a/crawler/crawler.php b/crawler/crawler.php new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/crawler/crawler.php diff --git a/crawler/main.php b/crawler/main.php new file mode 100644 index 0000000..041aed5 --- /dev/null +++ b/crawler/main.php @@ -0,0 +1,7 @@ +<?php +/** + * User: Marvin Borner + * Date: 16/09/2018 + * Time: 21:26 + */ + diff --git a/crawler/mysql_conf.inc b/crawler/mysql_conf.inc new file mode 100644 index 0000000..5d70dd5 --- /dev/null +++ b/crawler/mysql_conf.inc @@ -0,0 +1,5 @@ +<?php +$servername = '127.0.0.1'; +$username = 'root'; +$password = 'root'; +$dbname = 'search_engine';
\ No newline at end of file |