summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marvin Borner, 2018-09-15 17:47:03 +0200
committer: Marvin Borner, 2018-09-15 17:47:03 +0200
commit: 0e2b7fc11970b9c810c9b66739e0bd9130bf92ee (patch)
tree: 7ca89a7ba6a6d6e986915fc3dadde715eff0bf6d
parent: 940b46f419fb25366af675f5ab6da651a5f9c965 (diff)
Some performance improvements :zap:
-rw-r--r-- crawler.php 101
-rw-r--r-- database.sql 20
2 files changed, 94 insertions, 27 deletions
diff --git a/crawler.php b/crawler.php
index 1e4f401..7e3bcfd 100644
--- a/crawler.php
+++ b/crawler.php
@@ -5,36 +5,54 @@
* Time: 23:48
*/
+error_reporting(E_ERROR | E_PARSE);
+
include "mysql_conf.inc";
$currentUrl = $argv[1];
-crawlLoop();
-function crawlLoop()
+while (true) {
+ crawl($currentUrl);
+}
+
+function crawl($url)
{
global $currentUrl;
- $content = getContent($currentUrl);
- $htmlPath = createPathFromHtml($content);
- $urlInfo = getUrlInfo($htmlPath);
- $allLinks = getLinks($htmlPath);
+ if (!alreadyCrawled(cleanUrl($url))) {
+ $requestResponse = getContent($url);
+ if ($requestResponse[1] != 404) {
+ print "Download Size: " . $requestResponse[2];
- writeToQueue($allLinks);
- saveData($urlInfo);
+ $htmlPath = createPathFromHtml($requestResponse[0]);
+ $urlInfo = getUrlInfo($htmlPath);
+ $allLinks = getLinks($htmlPath);
+
+ writeToQueue($allLinks);
+ saveData($urlInfo);
+ }
+ }
+
+ $currentUrl = getFirstFromQueue(); // set new
+ removeFromQueue($currentUrl);
+
+ return;
}
function getContent($url)
{
$curl = curl_init($url);
+ curl_setopt($curl, CURLOPT_USERAGENT, "Googlebot/2.1 (+http://www.google.com/bot.html)");
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
$content = curl_exec($curl);
- print "Download Size: " . curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
+ $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
+ $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
curl_close($curl);
- return $content;
+ return [$content, $responseCode, $downloadSize];
}
function getUrlInfo($path)
@@ -51,25 +69,37 @@ function getUrlInfo($path)
function getLinks($path)
{
- global $currentUrl;
$allLinks = [];
foreach ($path->query("//a") as $ink) {
- $href = ltrim($ink->getAttribute("href"));
+ $href = cleanUrl($ink->getAttribute("href"));
+ array_push($allLinks, $href);
+ }
- if (!(substr($href, 0, 4) === "http")) {
- if (substr($href, 0, 3) === "www") $href = "http://" . $href;
- else if (substr($href, 0, 1) === "/") $href = $currentUrl . $href;
- else $href = $currentUrl . $href;
- }
+ return array_unique($allLinks);
+}
- // if it's pure domain without slash (prevents duplicate domains because of slash)
- if (preg_match('/\w+\.\w{2,3}$/', $href)) $href = $href . "/";
+function cleanUrl($url)
+{
+ global $currentUrl;
- array_push($allLinks, $href);
+ $url = ltrim($url);
+
+ if (!(substr($url, 0, 4) === "http")) {
+ if (substr($url, 0, 3) === "www") $url = "http://" . $url;
+ else if (substr($url, 0, 1) === "/") $url = $currentUrl . $url;
+ else $url = $currentUrl . $url;
}
- return array_unique($allLinks);
+ // if it's pure domain without slash (prevents duplicate domains because of slash)
+ if (preg_match('/\w+\.\w{2,3}$/', $url)) $url = $url . "/";
+
+ // strip some things
+ $url = preg_replace('/([^:])(\/{2,})/', '$1/', $url); // double slashes
+ $url = strtok($url, '?'); // parameters
+ $url = strtok($url, '#'); // hash fragments
+
+ return $url;
}
function createPathFromHtml($content)
@@ -81,6 +111,15 @@ function createPathFromHtml($content)
return new DOMXPath($dom);
}
+function getFirstFromQueue()
+{
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('SELECT url FROM queue LIMIT 1');
+ $checkStmt->execute();
+
+ return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]["url"];
+}
+
function writeToQueue($urls)
{
$conn = initDbConnection();
@@ -88,7 +127,7 @@ function writeToQueue($urls)
foreach ($urls as $url) {
$hash = md5($url);
- $checkStmt = $conn->prepare('SELECT hash FROM url_data where hash = :hash');
+ $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
$checkStmt->execute(['hash' => $hash]);
if ($checkStmt->rowCount() === 0) {
$stmt = $conn->prepare('INSERT IGNORE INTO queue (url, hash) VALUES (:url, :hash)');
@@ -97,6 +136,15 @@ function writeToQueue($urls)
}
}
+function removeFromQueue($url)
+{
+ $hash = md5($url);
+
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('DELETE FROM queue where hash = :hash');
+ $checkStmt->execute(['hash' => $hash]);
+}
+
function saveData($urlInfo)
{
global $currentUrl;
@@ -118,6 +166,15 @@ function saveData($urlInfo)
}
}
+function alreadyCrawled($url)
+{
+ $hash = md5($url);
+ $conn = initDbConnection();
+ $checkStmt = $conn->prepare('SELECT null FROM url_data where hash = :hash');
+ $checkStmt->execute(['hash' => $hash]);
+ return $checkStmt->rowCount() !== 0; // return true if already crawled
+}
+
function initDbConnection()
{
global $servername, $dbname, $username, $password;
diff --git a/database.sql b/database.sql
index 29970b4..54d6a6f 100644
--- a/database.sql
+++ b/database.sql
@@ -28,8 +28,13 @@ CREATE TABLE `queue` (
`hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `queue_id_uindex` (`id`),
- UNIQUE KEY `queue_hash_uindex` (`hash`)
-) ENGINE=InnoDB AUTO_INCREMENT=557 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+ UNIQUE KEY `queue_hash_uindex` (`hash`),
+ KEY `queue_hash_index` (`hash`)
+)
+ ENGINE = InnoDB
+ AUTO_INCREMENT = 5504
+ DEFAULT CHARSET = utf8mb4
+ COLLATE = utf8mb4_unicode_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
--
@@ -48,8 +53,13 @@ CREATE TABLE `url_data` (
`lang` varchar(3) COLLATE utf8mb4_unicode_ci NOT NULL,
`hash` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`id`),
- UNIQUE KEY `url_data_hash_uindex` (`hash`)
-) ENGINE=InnoDB AUTO_INCREMENT=15 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+ UNIQUE KEY `url_data_hash_uindex` (`hash`),
+ KEY `url_data_hash_index` (`hash`)
+)
+ ENGINE = InnoDB
+ AUTO_INCREMENT = 59
+ DEFAULT CHARSET = utf8mb4
+ COLLATE = utf8mb4_unicode_ci;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
@@ -61,4 +71,4 @@ CREATE TABLE `url_data` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
--- Dump completed on 2018-09-15 15:12:33
+-- Dump completed on 2018-09-15 17:46:14