summary refs log tree commit diff
diff options
context:
space:
mode:
author Marvin Borner 2018-09-16 14:25:08 +0200
committer Marvin Borner 2018-09-16 14:25:08 +0200
commit 0fbf636361ee39a09ab984e6e2bad0ed989c4dde (patch)
tree 3c90b91fc1fd614be9b44a5738e6d8c9c46d8b21
parent b9e5d52cbf96374811edf09a1c9b7edb2d63e9d7 (diff)
Several small improvements :zap:
-rw-r--r-- crawler.php 44
-rw-r--r-- database.sql 23
2 files changed, 51 insertions, 16 deletions
diff --git a/crawler.php b/crawler.php
index 69728e8..45753f3 100644
--- a/crawler.php
+++ b/crawler.php
@@ -10,7 +10,7 @@ error_reporting(E_ERROR | E_PARSE);
include 'mysql_conf.inc';
-$currentUrl = $argv[1];
+$currentUrl = $argv[1] ?? '';
while (true) {
crawl($currentUrl);
@@ -20,7 +20,12 @@ function crawl($url)
{
global $currentUrl;
- if (!alreadyCrawled(cleanUrl($url))) {
+ if (alreadyCrawled(cleanUrl($url))) {
+ print "\t\e[91mUrl already crawled " . $url . "\n";
+
+ removeFromQueue($currentUrl);
+ $currentUrl = getFromQueue('ASC');
+ } else {
$requestResponse = getContent($url);
if (preg_match('/2\d\d/', $requestResponse[1])) { // success
print 'Download Size: ' . $requestResponse[2];
@@ -32,23 +37,23 @@ function crawl($url)
writeToQueue($allLinks);
saveData($urlInfo);
+ removeFromQueue($currentUrl);
$currentUrl = getFromQueue('ASC'); // set new from start
} else {
- if ($requestResponse[1] === 429) {
- $currentUrl = getFromQueue('DESC'); // set new from end
- }
- print "\t\e[91mError " . $requestResponse[1] . "\n";
+ print "\t\e[91mError " . $requestResponse[1] . ' ' . $currentUrl . "\n";
+
+ urlHasError($currentUrl); // prevents re-crawling of error url
+ removeFromQueue($currentUrl);
+ $currentUrl = getFromQueue('DESC'); // set new from end
}
}
-
- removeFromQueue($currentUrl);
}
function getContent($url)
{
$curl = curl_init($url);
- curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
+ curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_TIMEOUT, 5);
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
@@ -100,8 +105,8 @@ function getLinks($path)
{
$allLinks = [];
- foreach ($path->query('//a') as $ink) {
- $href = cleanUrl($ink->getAttribute('href'));
+ foreach ($path->query('//a') as $link) {
+ $href = cleanUrl($link->getAttribute('href'));
$allLinks[] = $href;
}
@@ -114,7 +119,7 @@ function cleanUrl($url)
$url = ltrim($url);
- if (!(strpos($url, 'http') === 0)) {
+ if (filter_var($url, FILTER_VALIDATE_URL) === false && !(strpos($url, 'http') === 0)) {
if (strpos($url, 'www') === 0) {
$url = 'http://' . $url;
} else if (strpos($url, '/') === 0) {
@@ -184,7 +189,16 @@ function removeFromQueue($url)
$conn = initDbConnection();
$checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
- $checkStmt->execute(['hash' => $hash]);
+ $checkStmt->execute([':hash' => $hash]);
+}
+
+function urlHasError($url)
+{
+ $hash = md5($url);
+
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('INSERT INTO error_url (url, hash) VALUES (:url, :hash)');
+ $checkStmt->execute([':url' => $url, 'hash' => $hash]);
}
function saveData($urlInfo)
@@ -211,8 +225,8 @@ function alreadyCrawled($url)
{
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
- $checkStmt->execute(['hash' => $hash]);
+ $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
+ $checkStmt->execute([':hash' => $hash]);
return $checkStmt->rowCount() !== 0; // return true if already crawled
}
diff --git a/database.sql b/database.sql
index c53110f..1f7e2fe 100644
--- a/database.sql
+++ b/database.sql
@@ -16,6 +16,27 @@
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
--
+-- Table structure for table `error_url`
+--
+
+DROP TABLE IF EXISTS `error_url`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!40101 SET character_set_client = utf8 */;
+CREATE TABLE `error_url` (
+ `id` int(8) NOT NULL AUTO_INCREMENT,
+ `url` varchar(2083) COLLATE utf8mb4_unicode_ci NOT NULL,
+ `hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
+ PRIMARY KEY (`id`),
+ UNIQUE KEY `error_url_hash_uindex` (`hash`),
+ UNIQUE KEY `error_url_id_uindex` (`id`),
+ KEY `error_url_hash_index` (`hash`)
+)
+ ENGINE = InnoDB
+ DEFAULT CHARSET = utf8mb4
+ COLLATE = utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
-- Table structure for table `queue`
--
@@ -69,4 +90,4 @@ CREATE TABLE `url_data` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
--- Dump completed on 2018-09-16 0:57:12
+-- Dump completed on 2018-09-16 11:28:48