diff options
author | Marvin Borner | 2018-09-16 14:25:08 +0200 |
---|---|---|
committer | Marvin Borner | 2018-09-16 14:25:08 +0200 |
commit | 0fbf636361ee39a09ab984e6e2bad0ed989c4dde (patch) | |
tree | 3c90b91fc1fd614be9b44a5738e6d8c9c46d8b21 | |
parent | b9e5d52cbf96374811edf09a1c9b7edb2d63e9d7 (diff) |
Several small improvements :zap:
-rw-r--r-- | crawler.php | 44 | ||||
-rw-r--r-- | database.sql | 23 |
2 files changed, 51 insertions, 16 deletions
diff --git a/crawler.php b/crawler.php
index 69728e8..45753f3 100644
--- a/crawler.php
+++ b/crawler.php
@@ -10,7 +10,7 @@ error_reporting(E_ERROR | E_PARSE);
 include 'mysql_conf.inc';
-$currentUrl = $argv[1];
+$currentUrl = $argv[1] ?? '';
while (true) {
crawl($currentUrl);
@@ -20,7 +20,12 @@ function crawl($url)
 {
global $currentUrl;
- if (!alreadyCrawled(cleanUrl($url))) {
+ if (alreadyCrawled(cleanUrl($url))) {
+ print "\t\e[91mUrl already crawled " . $url . "\n";
+
+ removeFromQueue($currentUrl);
+ $currentUrl = getFromQueue('ASC');
+ } else {
$requestResponse = getContent($url);
if (preg_match('/2\d\d/', $requestResponse[1])) { // success
print 'Download Size: ' . $requestResponse[2];
@@ -32,23 +37,23 @@ function crawl($url)
 writeToQueue($allLinks);
saveData($urlInfo);
+ removeFromQueue($currentUrl);
$currentUrl = getFromQueue('ASC'); // set new from start
} else {
- if ($requestResponse[1] === 429) {
- $currentUrl = getFromQueue('DESC'); // set new from end
- }
- print "\t\e[91mError " . $requestResponse[1] . "\n";
+ print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentUrl . "\n";
+
+ urlHasError($currentUrl); // prevents re-crawling of error url
+ removeFromQueue($currentUrl);
+ $currentUrl = getFromQueue('DESC'); // set new from end
}
}
-
- removeFromQueue($currentUrl);
}
function getContent($url)
{
$curl = curl_init($url);
- curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
+ curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_TIMEOUT, 5);
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
@@ -100,8 +105,8 @@ function getLinks($path)
 {
$allLinks = [];
- foreach ($path->query('//a') as $ink) {
- $href = cleanUrl($ink->getAttribute('href'));
+ foreach ($path->query('//a') as $link) {
+ $href = cleanUrl($link->getAttribute('href'));
$allLinks[] = $href;
}
@@ -114,7 +119,7 @@ function cleanUrl($url)
 $url = ltrim($url);
- if (!(strpos($url, 'http') === 0)) {
+ if (filter_var($url, FILTER_VALIDATE_URL) === false && !(strpos($url, 'http') === 0)) {
if (strpos($url, 'www') === 0) {
$url = 'http://' . $url;
} else if (strpos($url, '/') === 0) {
@@ -184,7 +189,16 @@ function removeFromQueue($url)
 $conn = initDbConnection();
$checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
- $checkStmt->execute(['hash' => $hash]);
+ $checkStmt->execute([':hash' => $hash]);
+}
+
+function urlHasError($url)
+{
+ $hash = md5($url);
+
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('INSERT INTO error_url (url, hash) VALUES (:url, :hash)');
+ $checkStmt->execute([':url' => $url, 'hash' => $hash]);
}
function saveData($urlInfo)
@@ -211,8 +225,8 @@ function alreadyCrawled($url)
 {
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
- $checkStmt->execute(['hash' => $hash]);
+ $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
+ $checkStmt->execute([':hash' => $hash]);
return $checkStmt->rowCount() !== 0; // return true if already crawled
}
diff --git a/database.sql b/database.sql
index c53110f..1f7e2fe 100644
--- a/database.sql
+++ b/database.sql
@@ -16,6 +16,27 @@
 /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;

 --
+-- Table structure for table `error_url`
+--
+
+DROP TABLE IF EXISTS `error_url`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!40101 SET character_set_client = utf8 */;
+CREATE TABLE `error_url` (
+  `id` int(8) NOT NULL AUTO_INCREMENT,
+  `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `error_url_hash_uindex` (`hash`),
+  UNIQUE KEY `error_url_id_uindex` (`id`),
+  KEY `error_url_hash_index` (`hash`)
+)
+  ENGINE = InnoDB
+  DEFAULT CHARSET = utf8mb4
+  COLLATE = utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
 -- Table structure for table `queue`
 --
@@ -69,4 +90,4 @@ CREATE TABLE `url_data` (
 /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
 /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

--- Dump completed on 2018-09-16  0:57:12
+-- Dump completed on 2018-09-16 11:28:48