summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarvin Borner2018-09-16 02:02:51 +0200
committerMarvin Borner2018-09-16 02:02:51 +0200
commitefd766e16b295bded2f2c552ca2e07324876f99b (patch)
tree74fb6f6eafdd2880506a04b03e7c26912e50eb6b
parent0d7867ebc1e7733c8fccde934c9bf2c8334b846d (diff)
"Fixed" 429 bug :hankey: :bug:
-rw-r--r--README.md2
-rw-r--r--crawler.php21
-rw-r--r--request.php21
-rw-r--r--todo.md4
4 files changed, 40 insertions, 8 deletions
diff --git a/README.md b/README.md
index bd669c6..e166b43 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ After it has finished the information gathering, it will go on by using the firs
1. Create a mysql database: `mysql -u username -p` and `CREATE DATABASE database_name;`
2. Import the `database.sql` file into your database with `mysql -u username -p database_name < database.sql`
3. Edit `mysql_conf.inc` according to your database's credentials
-4. Run `php crawler.php http://dmoztools.net/`
+4. Run `php crawler.php http://dmoztools.net/` (or any other domain)
5. For future runs, just execute `php crawler.php` and it will automatically start with the first url of the queue
6. Finished!
\ No newline at end of file
diff --git a/crawler.php b/crawler.php
index 1c1f562..69728e8 100644
--- a/crawler.php
+++ b/crawler.php
@@ -5,6 +5,7 @@
* Time: 23:48
*/
+set_time_limit(3600000);
error_reporting(E_ERROR | E_PARSE);
include 'mysql_conf.inc';
@@ -21,7 +22,7 @@ function crawl($url)
if (!alreadyCrawled(cleanUrl($url))) {
$requestResponse = getContent($url);
- if (preg_match('/2\d\d/', $requestResponse[1])) {
+ if (preg_match('/2\d\d/', $requestResponse[1])) { // success
print 'Download Size: ' . $requestResponse[2];
$htmlPath = createPathFromHtml($requestResponse[0]);
@@ -30,10 +31,16 @@ function crawl($url)
writeToQueue($allLinks);
saveData($urlInfo);
+
+ $currentUrl = getFromQueue('ASC'); // set new from start
+ } else {
+ if ($requestResponse[1] === 429) {
+ $currentUrl = getFromQueue('DESC'); // set new from end
+ }
+ print "\t\e[91mError " . $requestResponse[1] . "\n";
}
}
- $currentUrl = getFirstFromQueue(); // set new
removeFromQueue($currentUrl);
}
@@ -139,10 +146,10 @@ function createPathFromHtml($content)
return new DOMXPath($dom);
}
-function getFirstFromQueue()
+function getFromQueue($sort)
{
$conn = initDbConnection();
- $checkStmt = $conn->query('SELECT url FROM queue LIMIT 1');
+ $checkStmt = $conn->query('SELECT url FROM queue ORDER BY id ' . $sort . ' LIMIT 1');
return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'];
}
@@ -155,7 +162,7 @@ function writeToQueue($urls)
$hash = md5($url);
print "\t\e[96mChecking if url already has been crawled " . $url . "\n";
- $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
+ $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
if ($checkStmt->rowCount() === 0) {
$stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
@@ -176,7 +183,7 @@ function removeFromQueue($url)
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('DELETE FROM queue where hash = :hash');
+ $checkStmt = $conn->prepare('DELETE FROM queue WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
}
@@ -204,7 +211,7 @@ function alreadyCrawled($url)
{
$hash = md5($url);
$conn = initDbConnection();
- $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
+ $checkStmt = $conn->prepare('SELECT null FROM url_data WHERE hash = :hash');
$checkStmt->execute(['hash' => $hash]);
return $checkStmt->rowCount() !== 0; // return true if already crawled
}
diff --git a/request.php b/request.php
new file mode 100644
index 0000000..e649103
--- /dev/null
+++ b/request.php
@@ -0,0 +1,21 @@
+<?php
+include 'mysql_conf.inc';
+
+// A search term is required: without this guard a missing argument would
+// produce the pattern '%%' and print every stored row after a warning.
+if (!isset($argv[1])) {
+    fwrite(STDERR, "Usage: php request.php <search term>\n");
+    exit(1);
+}
+
+// Print every stored page whose title or description contains the term.
+print_r(getContent($argv[1]));
+
+/**
+ * Searches the stored page data for a term.
+ *
+ * Performs a substring match (LIKE '%term%') against the title and
+ * description columns of the url_data table.
+ *
+ * Note: '%' and '_' inside $query are not escaped, so they still act as
+ * LIKE wildcards.
+ *
+ * @param string $query Search term; it is wrapped in '%' wildcards before binding.
+ * @return array Matching rows as associative arrays with the keys
+ *               url, title, description and lang.
+ */
+function getContent($query)
+{
+    $conn = initDbConnection();
+    // PDO only permits reusing one named placeholder when emulated prepares
+    // are enabled, so the same pattern is bound under two distinct names.
+    $checkStmt = $conn->prepare('SELECT url, title, description, lang FROM url_data WHERE title LIKE :titleQuery OR description LIKE :descriptionQuery');
+    $pattern = '%' . $query . '%';
+    $checkStmt->execute([':titleQuery' => $pattern, ':descriptionQuery' => $pattern]);
+    return $checkStmt->fetchAll(PDO::FETCH_ASSOC);
+}
+
+/**
+ * Opens a fresh PDO connection to the MySQL database.
+ *
+ * Reads the connection settings from the globals $servername, $dbname,
+ * $username and $password (presumably defined by the included
+ * mysql_conf.inc - confirm), switches PDO to exception error mode and
+ * issues SET CHARACTER SET utf8 on the new connection.
+ *
+ * @return PDO The configured connection.
+ */
+function initDbConnection()
+{
+    global $servername, $dbname, $username, $password;
+
+    $dsn = "mysql:host=$servername;dbname=$dbname";
+    $pdo = new PDO($dsn, $username, $password);
+
+    // Report database errors as exceptions.
+    $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
+    $pdo->exec('SET CHARACTER SET utf8');
+
+    return $pdo;
+} \ No newline at end of file
diff --git a/todo.md b/todo.md
new file mode 100644
index 0000000..e68cf4e
--- /dev/null
+++ b/todo.md
@@ -0,0 +1,4 @@
+# TODOs
+- [ ] Respect robots.txt
+- [ ] Clean up code (e.g. external algorithm functions)
+- [ ] Photo crawling \ No newline at end of file