diff options
author | Marvin Borner | 2018-09-16 02:02:51 +0200 |
---|---|---|
committer | Marvin Borner | 2018-09-16 02:02:51 +0200 |
commit | efd766e16b295bded2f2c552ca2e07324876f99b (patch) | |
tree | 74fb6f6eafdd2880506a04b03e7c26912e50eb6b | |
parent | 0d7867ebc1e7733c8fccde934c9bf2c8334b846d (diff) |
"Fixed" 429 bug :hankey: :bug:
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | crawler.php | 21 | ||||
-rw-r--r-- | request.php | 21 | ||||
-rw-r--r-- | todo.md | 4 |
4 files changed, 40 insertions, 8 deletions
@@ -6,7 +6,7 @@ After it has finished the information gathering, it will go on by using the firs
1. Create a mysql database: `mysql -u username -p` and `CREATE DATABASE database_name;`
2. Import the `database.sql` file into your database with `mysql -u username -p database_name < database.sql`
3. Edit `mysql_conf.inc` according to your databases credentials
-4. Run `php crawler.php http://dmoztools.net/`
+4. Run `php crawler.php http://dmoztools.net/` (or any other domain)
5. For future runs, just execute `php crawler.php` and it will automatically start with the first url of the queue
6. Finished!
\ No newline at end of file
diff --git a/crawler.php b/crawler.php
index 1c1f562..69728e8 100644
--- a/crawler.php
+++ b/crawler.php
@@ -5,6 +5,7 @@
* Time: 23:48
*/
+set_time_limit(3600000);
error_reporting(E_ERROR | E_PARSE);
include 'mysql_conf.inc';
@@ -21,7 +22,7 @@ function crawl($url)
if (!alreadyCrawled(cleanUrl($url))) {
$requestResponse = getContent($url);
- if (preg_match('/2\d\d/', $requestResponse[1])) {
+ if (preg_match('/2\d\d/', $requestResponse[1])) { // success
print 'Download Size: ' . $requestResponse[2];
$htmlPath = createPathFromHtml($requestResponse[0]);
@@ -30,10 +31,16 @@ function crawl($url)
writeToQueue($allLinks);
saveData($urlInfo);
+
+ $currentUrl = getFromQueue('ASC'); // set new from start
+ } else {
+ if ($requestResponse[1] === 429) {
+ $currentUrl = getFromQueue('DESC'); // set new from end
+ }
+ print "\t\e[91mError " . $requestResponse[1] . "\n";
}
}
- $currentUrl = getFirstFromQueue(); // set new
removeFromQueue($currentUrl);
}
@@ -139,10 +146,10 @@ function createPathFromHtml($content)
return new DOMXPath($dom);
}
-function getFirstFromQueue()
+function getFromQueue($sort)
{
$conn = initDbConnection();
- $checkStmt = $conn->query('SELECT url FROM queue LIMIT 1');
+ $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1');
return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'];
}
@@ -155,7 +162,7 @@ function writeToQueue($urls)
$hash = md5($url);
print "\t\e[96mChecking if url already has been crawled " . $url . "\n";
- $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
+ $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
if ($checkStmt->rowCount() === 0) {
$stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
@@ -176,7 +183,7 @@ function removeFromQueue($url)
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('DELETE FROM queue where hash = :hash');
+ $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
}
@@ -204,7 +211,7 @@ function alreadyCrawled($url)
{
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
+ $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
return $checkStmt->rowCount() !== 0; // return true if already crawled
}
diff --git a/request.php b/request.php
new file mode 100644
index 0000000..e649103
--- /dev/null
+++ b/request.php
@@ -0,0 +1,21 @@
+<?php
+include 'mysql_conf.inc';
+
+print_r(getContent($argv[1]));
+
+function getContent($query)
+{
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('SELECT url, title, description, lang FROM url_data WHERE title LIKE :query OR description LIKE :query');
+ $checkStmt->execute([':query' => '%' . $query . '%']);
+ return $checkStmt->fetchAll(PDO::FETCH_ASSOC);
+}
+
+function initDbConnection()
+{
+ global $servername, $dbname, $username, $password;
+ $conn = new PDO("mysql:host=$servername;dbname=$dbname", $username, $password);
+ $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+ $conn->exec('SET CHARACTER SET utf8');
+ return $conn;
+}
\ No newline at end of file
@@ -0,0 +1,4 @@
+# TODOs
+- [ ] Respect robots.txt
+- [ ] Clean up code (eg. external algorithm functions)
+- [ ] Photo crawling
\ No newline at end of file |