From 9e9442a222666c5aadb6db32140dadddcb807933 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Sun, 16 Sep 2018 22:07:50 +0200
Subject: Code cleanup :zap: :construction:

---
 crawler.php                 | 254 --------------------------------------------
 crawler/Algorithms.php      | 107 ++++++++++++++++++++
 crawler/CrawlController.php |  74 ++++++++++++++
 crawler/Database.php        |  69 +++++++++++++
 crawler/WebRequest.php      |  29 +++++++
 crawler/crawler.php         |   0
 crawler/main.php            |   7 ++
 crawler/mysql_conf.inc      |   5 +
 mysql_conf.inc              |   5 -
 9 files changed, 291 insertions(+), 259 deletions(-)
 delete mode 100644 crawler.php
 create mode 100644 crawler/Algorithms.php
 create mode 100644 crawler/CrawlController.php
 create mode 100644 crawler/Database.php
 create mode 100644 crawler/WebRequest.php
 create mode 100644 crawler/crawler.php
 create mode 100644 crawler/main.php
 create mode 100644 crawler/mysql_conf.inc
 delete mode 100644 mysql_conf.inc

diff --git a/crawler.php b/crawler.php
deleted file mode 100644
index 5ba28af..0000000
--- a/crawler.php
+++ /dev/null
@@ -1,254 +0,0 @@
-function getUrlInfo($path)
-{
-    $urlInfo = [];
-
-    $urlInfo['title'] = strip_tags($path->query('//title')[0]->textContent);
-    foreach ($path->query('//html') as $language) {
-        $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
-    }
-    foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
-        $urlInfo['description'] = strip_tags($description->getAttribute('content'));
-    }
-
-    // Fix empty information
-    if (!isset($urlInfo['description'])) {
-        $urlInfo['description'] = '';
-        foreach ($path->query('//p') as $text) {
-            if (strlen($urlInfo['description']) < 350) {
-                $urlInfo['description'] .= $text->textContent . ' ';
-            }
-        }
-    }
-
-    if (empty($urlInfo['title'])) {
-        $urlInfo['title'] = '';
-        if (strlen($urlInfo['title']) < 350) {
-            $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
-        }
-    }
-
-    return $urlInfo;
-}
-
-function getLinks($path)
-{
-    $allLinks = [];
-
-    foreach ($path->query('//a') as $link) {
-        $linkHref = $link->getAttribute('href');
-        if ($linkHref !== 'javascript:void(0)') {
-            $href = cleanUrl($linkHref);
-            $allLinks[] = $href;
-        }
-    }
-
-    return array_unique($allLinks);
-}
-
-function cleanUrl($url)
-{
-    global $currentUrl;
-
-    $newUrl = ltrim($url); // trim whitespaces
-
-    // normally only for links/href
-    if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || (strpos($newUrl, 'http') !== 0)) {
-        if (strpos($newUrl, 'www') === 0) {
-            $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
-        } else if ($newUrl === 'javascript:void') {
-            $newUrl = '';
-        } else if (strpos($url, '/') === 0) {
-            $parsedUrl = parse_url($currentUrl);
-            $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
-        } else {
-            $newUrl = $currentUrl . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
-        }
-    }
-
-    // if it's pure domain without slash (prevents duplicate domains because of slash)
-    if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
-        $newUrl .= '/';
-    }
-
-    // strip some things
-    $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
-    $newUrl = strtok($newUrl, '?'); // parameters
-    $newUrl = strtok($newUrl, '#'); // hash fragments
-
-    if ($url !== $newUrl) {
-        print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
-    }
"\n"; - } - - return $newUrl; -} - -function createPathFromHtml($content) -{ - $dom = new DOMDocument(); - libxml_use_internal_errors(true); - $dom->loadHTML($content); - libxml_use_internal_errors(false); - return new DOMXPath($dom); -} - -function getFromQueue($sort) -{ - print "\t\e[96mStarting at " . ($sort === 'DESC' ? 'bottom' : 'top') . " of queue\n"; - $conn = initDbConnection(); - $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1'); - - return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url']; -} - -function writeToQueue($urls) -{ - $conn = initDbConnection(); - - foreach ($urls as $url) { - if ($url !== '') { - $hash = md5($url); - - print "\t\e[96mChecking if url already has been crawled " . $url . "\n"; - $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash'); - $checkStmt->execute(['hash' => $hash]); - if ($checkStmt->rowCount() === 0) { - $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); - $stmt->execute([':url' => $url, 'hash' => $hash]); - if ($stmt->rowCount() > 0) { - print "\t\e[92mQueueing url " . $url . "\n"; - } else { - print "\t\e[91mUrl already queued " . $url . "\n"; - } - } else { - print "\t\e[91mUrl already crawled " . $url . "\n"; - } - } - } -} - -function removeFromQueue($url) -{ - $hash = md5($url); - - $conn = initDbConnection(); - $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); - $checkStmt->execute([':hash' => $hash]); -} - -function urlHasError($url) -{ - $hash = md5($url); - - $conn = initDbConnection(); - $checkStmt = $conn->prepare('INSERT IGNORE INTO error_url (url, hash) VALUES (:url, :hash)'); - $checkStmt->execute([':url' => $url, 'hash' => $hash]); -} - -function saveData($urlInfo, $url) -{ - if ($url !== '') { - print "\e[96mFinished previous url - crawling: " . $url . "\n"; - - $title = mb_convert_encoding($urlInfo['title'] ?? '', 'Windows-1252', 'UTF-8'); - $description = mb_convert_encoding($urlInfo['description'] ?? '', 'Windows-1252', 'UTF-8'); - $language = $urlInfo['language'] ?? 
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
new file mode 100644
index 0000000..e5a270b
--- /dev/null
+++ b/crawler/CrawlController.php
@@ -0,0 +1,74 @@
+getFromQueue('DESC');
+        } else {
+            $requestResponse = getContent($url);
+            $currentlyCrawled = $requestResponse[3];
+            if (preg_match('/2\d\d/', $requestResponse[1])) { // success
+                print 'Download Size: ' . $requestResponse[2];
+
+                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+                $urlInfo = Algorithms::getUrlInfo($htmlPath);
+                $allLinks = Algorithms::getLinks($htmlPath);
+
+                Database::writeToQueue($allLinks);
+                $this->saveData($urlInfo, $currentlyCrawled);
+
+                Database::removeFromQueue($currentlyCrawled);
+                $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+            } else {
+                print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n";
+
+                Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url
+                Database::removeFromQueue($currentlyCrawled);
+                $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+            }
+        }
+    }
+
+    public function saveData($urlInfo, $url)
+    {
+        if ($url !== '') {
+            print "\e[96mFinished previous url - crawling: " . $url . "\n";
+
+            $title = $urlInfo['title'] ?? '';
+            $description = $urlInfo['description'] ?? '';
+            $language = $urlInfo['language'] ?? 'en';
+            $hash = md5($url);
+            $data = [$url, $title, $description, $language, $hash]; // url first: Database::saveUrlData() reads indices 0..4 as url, title, description, lang, hash
+
+            Database::saveUrlData($data);
+        }
+    }
+
+
+}
\ No newline at end of file
"../sub_dir" by going back and adding new path + } else if (strpos($newUrl, '/') === 0) { + $parsedUrl = parse_url($currentlyCrawled); + $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path + } else { + $newUrl = $currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning + } + } + + // if it's pure domain without slash (prevents duplicate domains because of slash) + if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) { + $newUrl .= '/'; + } + + // strip some things + $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes + $newUrl = strtok($newUrl, '?'); // parameters + $newUrl = strtok($newUrl, '#'); // hash fragments + + if ($url !== $newUrl) { + print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n"; + } + + return $newUrl; + } +} \ No newline at end of file diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php new file mode 100644 index 0000000..e5a270b --- /dev/null +++ b/crawler/CrawlController.php @@ -0,0 +1,74 @@ +getFromQueue('DESC'); + } else { + $requestResponse = getContent($url); + $currentlyCrawled = $requestResponse[3]; + if (preg_match('/2\d\d/', $requestResponse[1])) { // success + print 'Download Size: ' . $requestResponse[2]; + + $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]); + $urlInfo = Algorithms::getUrlInfo($htmlPath); + $allLinks = Algorithms::getLinks($htmlPath); + + Database::writeToQueue($allLinks); + $this->saveData($urlInfo, $currentlyCrawled); + + Database::removeFromQueue($currentlyCrawled); + $currentlyCrawled = Database::getFromQueue('DESC'); // set new from start + } else { + print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentlyCrawled . "\n"; + + Database::urlHasError($currentlyCrawled); // prevents re-crawling of error url + Database::removeFromQueue($currentlyCrawled); + $currentlyCrawled = Database::getFromQueue('ASC'); // set new from end + } + } + } + + public function saveData($urlInfo, $url) + { + if ($url !== '') { + print "\e[96mFinished previous url - crawling: " . $url . "\n"; + + $title = $urlInfo['title'] ?? ''; + $description = $urlInfo['description'] ?? ''; + $language = $urlInfo['language'] ?? 'en'; + $hash = md5($url); + $data = [$title, $description, $language, $hash]; + + Database::saveUrlData($data); + } + } + + +} \ No newline at end of file diff --git a/crawler/Database.php b/crawler/Database.php new file mode 100644 index 0000000..6d04009 --- /dev/null +++ b/crawler/Database.php @@ -0,0 +1,69 @@ +query('SELECT url FROM queue ORDER BY id ' . $sort . 
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
new file mode 100644
index 0000000..a72efc6
--- /dev/null
+++ b/crawler/WebRequest.php
@@ -0,0 +1,29 @@
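
This copy of the patch is cut off here: the 29-line body of WebRequest.php is missing, as are the hunks for crawler/crawler.php (empty), crawler/main.php and the mysql_conf.inc move. Judging by how the crawl loop in CrawlController consumes getContent() — index 0 as the HTML, 1 as the status code, 2 as the download size, 3 as the final URL — the helper is presumably a cURL wrapper along these lines (a sketch, not the author's actual code):

    <?php
    // Sketch only — the real WebRequest.php is truncated in this copy of the patch.
    function getContent($url)
    {
        $curl = curl_init($url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); // follow redirects to the effective url
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);

        $content = curl_exec($curl);
        $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
        $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD);
        $effectiveUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
        curl_close($curl);

        return [$content, $responseCode, $downloadSize, $effectiveUrl];
    }
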
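crawler/main.php (7 lines in the diffstat) is likewise missing. A minimal bootstrap consistent with the classes above could look like the following — the include list, the entry method name and the seed URL are all guesses:

    <?php
    // Hypothetical bootstrap — main.php's real content is not in this copy of the patch.
    require_once __DIR__ . '/mysql_conf.inc';
    require_once __DIR__ . '/WebRequest.php';
    require_once __DIR__ . '/Algorithms.php';
    require_once __DIR__ . '/Database.php';
    require_once __DIR__ . '/CrawlController.php';

    (new CrawlController())->crawl($argv[1] ?? 'http://example.com/');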