From efd766e16b295bded2f2c552ca2e07324876f99b Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sun, 16 Sep 2018 02:02:51 +0200 Subject: "Fixed" 429 bug :hankey: :bug: --- README.md | 2 +- crawler.php | 21 ++++++++++++++------- request.php | 21 +++++++++++++++++++++ todo.md | 4 ++++ 4 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 request.php create mode 100644 todo.md diff --git a/README.md b/README.md index bd669c6..e166b43 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ After it has finished the information gathering, it will go on by using the firs 1. Create a mysql database: `mysql -u username -p` and `CREATE DATABASE database_name;` 2. Import the `database.sql` file into your database with `mysql -u username -p database_name < database.sql` 3. Edit `mysql_conf.inc` according to your databases credentials -4. Run `php crawler.php http://dmoztools.net/` +4. Run `php crawler.php http://dmoztools.net/` (or any other domain) 5. For future runs, just execute `php crawler.php` and it will automatically start with the first url of the queue 6. Finished! \ No newline at end of file diff --git a/crawler.php b/crawler.php index 1c1f562..69728e8 100644 --- a/crawler.php +++ b/crawler.php @@ -5,6 +5,7 @@ * Time: 23:48 */ +set_time_limit(3600000); error_reporting(E_ERROR | E_PARSE); include 'mysql_conf.inc'; @@ -21,7 +22,7 @@ function crawl($url) if (!alreadyCrawled(cleanUrl($url))) { $requestResponse = getContent($url); - if (preg_match('/2\d\d/', $requestResponse[1])) { + if (preg_match('/2\d\d/', $requestResponse[1])) { // success print 'Download Size: ' . $requestResponse[2]; $htmlPath = createPathFromHtml($requestResponse[0]); @@ -30,10 +31,16 @@ function crawl($url) writeToQueue($allLinks); saveData($urlInfo); + + $currentUrl = getFromQueue('ASC'); // set new from start + } else { + if ($requestResponse[1] === 429) { + $currentUrl = getFromQueue('DESC'); // set new from end + } + print "\t\e[91mError " . $requestResponse[1] . "\n"; } } - $currentUrl = getFirstFromQueue(); // set new removeFromQueue($currentUrl); } @@ -139,10 +146,10 @@ function createPathFromHtml($content) return new DOMXPath($dom); } -function getFirstFromQueue() +function getFromQueue($sort) { $conn = initDbConnection(); - $checkStmt = $conn->query('SELECT url FROM queue LIMIT 1'); + $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1'); return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url']; } @@ -155,7 +162,7 @@ function writeToQueue($urls) $hash = md5($url); print "\t\e[96mChecking if url already has been crawled " . $url . "\n"; - $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash'); + $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash'); $checkStmt->execute(['hash' => $hash]); if ($checkStmt->rowCount() === 0) { $stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)'); @@ -176,7 +183,7 @@ function removeFromQueue($url) $hash = md5($url); $conn = initDbConnection(); - $checkStmt = $conn->prepare('DELETE FROM queue where hash = :hash'); + $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash'); $checkStmt->execute(['hash' => $hash]); } @@ -204,7 +211,7 @@ function alreadyCrawled($url) { $hash = md5($url); $conn = initDbConnection(); - $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash'); + $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash'); $checkStmt->execute(['hash' => $hash]); return $checkStmt->rowCount() !== 0; // return true if already crawled } diff --git a/request.php b/request.php new file mode 100644 index 0000000..e649103 --- /dev/null +++ b/request.php @@ -0,0 +1,21 @@ +prepare('SELECT url, title, description, lang FROM url_data WHERE title LIKE :query OR description LIKE :query'); + $checkStmt->execute([':query' => '%' . $query . '%']); + return $checkStmt->fetchAll(PDO::FETCH_ASSOC); +} + +function initDbConnection() +{ + global $servername, $dbname, $username, $password; + $conn = new PDO("mysql:host=$servername;dbname=$dbname", $username, $password); + $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + $conn->exec('SET CHARACTER SET utf8'); + return $conn; +} \ No newline at end of file diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..e68cf4e --- /dev/null +++ b/todo.md @@ -0,0 +1,4 @@ +# TODOs +- [ ] Respect robots.txt +- [ ] Clean up code (eg. external algorithm functions) +- [ ] Photo crawling \ No newline at end of file -- cgit v1.2.3