diff options
author | Marvin Borner | 2018-09-15 20:01:14 +0200 |
---|---|---|
committer | Marvin Borner | 2018-09-15 20:01:14 +0200 |
commit | ccee1a98c96f8dfa860b063278c057dcef7c8575 (patch) | |
tree | 6cd8ab14f312d68cee97de5e191b6e041f0dd3ec | |
parent | 0e2b7fc11970b9c810c9b66739e0bd9130bf92ee (diff) |
Improved information gathering algorithms :zap:
-rw-r--r-- | crawler.php | 27 |
1 files changed, 20 insertions, 7 deletions
diff --git a/crawler.php b/crawler.php
index 7e3bcfd..503e190 100644
--- a/crawler.php
+++ b/crawler.php
@@ -59,10 +59,24 @@ function getUrlInfo($path) {
$urlInfo = [];
- foreach ($path->query("//html") as $html) $urlInfo["language"] = $html->getAttribute("lang");
- foreach ($path->query("//meta") as $meta) $urlInfo[$meta->getAttribute("name")] = $meta->getAttribute("content");
- foreach ($path->query("//link") as $link) $urlInfo[$link->getAttribute("rel")] = $link->getAttribute("href");
- $urlInfo["title"] = $path->query("//title")[0]->textContent;
+ $urlInfo["title"] = strip_tags($path->query("//title")[0]->textContent);
+ foreach ($path->query("//html") as $language) $urlInfo["language"] = strip_tags($language->getAttribute("lang"));
+ foreach ($path->query("/html/head/meta[@name=\"description\"]") as $description) $urlInfo["description"] = strip_tags($description->getAttribute("content"));
+
+ // Fix empty information
+ if (!(isset($urlInfo["description"]))) {
+ $urlInfo["description"] = "";
+ foreach ($path->query("//p") as $text) {
+ if (strlen($urlInfo["description"]) < 350)
+ $urlInfo["description"] .= $text->textContent . " ";
+ }
+ }
+
+ if (empty($urlInfo["title"])) {
+ $urlInfo["title"] = "";
+ if (strlen($urlInfo["title"]) < 350)
+ $urlInfo["title"] .= $path->query("//h1")[0]->textContent . " ";
+ }
return $urlInfo;
}
@@ -153,14 +167,13 @@ function saveData($urlInfo)
$title = isset($urlInfo["title"]) ? $urlInfo["title"] : "";
$description = isset($urlInfo["description"]) ? $urlInfo["description"] : "";
- $icon = isset($urlInfo["icon"]) ? $urlInfo["icon"] : "";
$language = isset($urlInfo["language"]) ? $urlInfo["language"] : "en";
$hash = md5($currentUrl);
try {
$conn = initDbConnection();
- $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, icon, lang, hash) VALUES (:url, :title, :description, :icon, :lang, :hash)');
- $stmt->execute([':url' => $currentUrl, ':title' => $title, ':description' => $description, ':icon' => $icon, ':lang' => $language, ':hash' => $hash]);
+ $stmt = $conn->prepare('INSERT IGNORE INTO url_data (url, title, description, lang, hash) VALUES (:url, :title, :description, :lang, :hash)');
+ $stmt->execute([':url' => $currentUrl, ':title' => $title, ':description' => $description, ':lang' => $language, ':hash' => $hash]);
} catch (PDOException $e) {
print $e->getMessage();
}
|