summaryrefslogtreecommitdiff
path: root/crawler/Algorithms.php
diff options
context:
space:
mode:
Diffstat (limited to 'crawler/Algorithms.php')
-rw-r--r--crawler/Algorithms.php79
1 files changed, 57 insertions, 22 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
index 73d2ecc..6c8d513 100644
--- a/crawler/Algorithms.php
+++ b/crawler/Algorithms.php
@@ -1,10 +1,14 @@
<?php
+header('Content-type: text/plain; charset=utf-8');
+
/**
* User: Marvin Borner
* Date: 16/09/2018
* Time: 21:51
*/
+require_once 'CrawlController.php';
+
class Algorithms
{
public static function getUrlInfo($path): array
@@ -23,14 +27,14 @@ class Algorithms
if (!isset($urlInfo['description'])) {
$urlInfo['description'] = '';
foreach ($path->query('//p') as $text) {
- if (strlen($urlInfo['description']) < 350) {
+ if (mb_strlen($urlInfo['description']) < 350) {
$urlInfo['description'] .= $text->textContent . ' ';
}
}
}
if (empty($urlInfo['title'])) {
$urlInfo['title'] = '';
- if (strlen($urlInfo['title']) < 350) {
+ if (mb_strlen($urlInfo['title']) < 350) {
$urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
}
}
@@ -46,10 +50,8 @@ class Algorithms
foreach ($path->query('//a') as $link) {
$linkHref = $link->getAttribute('href');
- if ($linkHref !== 'javascript:void(0)') {
- $href = self::cleanUrl($linkHref);
- $allLinks[] = $href;
- }
+ $href = self::cleanUrl($linkHref);
+ $allLinks[] = $href;
}
return array_unique($allLinks);
@@ -66,25 +68,25 @@ class Algorithms
public static function cleanUrl($url): string
{
- global $currentlyCrawled;
-
- $newUrl = ltrim($url); // trim whitespaces
+ $newUrl = self::fixEncoding(ltrim($url)); // trim whitespaces
// normally only for links/href
- if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || (strpos($newUrl, 'http') !== 0)) {
- if (strpos($newUrl, 'www') === 0) {
+ if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || mb_strpos($newUrl, 'http') !== 0) {
+ if (mb_strpos($newUrl, 'www') === 0) {
$newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
- } else if (strpos($newUrl, 'javascript:') === 0) {
- $newUrl = ''; // fixes javascript void links
- } else if (strpos($newUrl, '../') === 0) {
- $parsedUrl = parse_url($currentlyCrawled);
- $backCount = substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
- $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
- } else if (strpos($newUrl, '/') === 0) {
- $parsedUrl = parse_url($currentlyCrawled);
+ } else if (mb_strpos($newUrl, 'javascript:') === 0 || mb_strpos($newUrl, 'mailto') === 0) {
+ $newUrl = CrawlController::$currentlyCrawled; // fixes javascript void links
+ } else if (mb_strpos($newUrl, '../') === 0) {
+ $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
+ $backCount = mb_substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
+ if ($backCount >= 1) {
+ $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
+ }
+ } else if (mb_strpos($newUrl, '/') === 0) {
+ $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
$newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
} else {
- $newUrl = $currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
+ $newUrl = '/' . CrawlController::$currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
}
}
@@ -95,8 +97,12 @@ class Algorithms
// strip some things
$newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
- $newUrl = strtok($newUrl, '?'); // parameters
- $newUrl = strtok($newUrl, '#'); // hash fragments
+ $newUrl = self::mb_strtok($newUrl, '?'); // parameters
+ $newUrl = self::mb_strtok($newUrl, '#'); // hash fragments
+
+ if (mb_strpos($newUrl, '/') === 0) {
+ $newUrl = mb_substr($newUrl, 1); // remove first slash from domain, which could have been added
+ }
if ($url !== $newUrl) {
print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
@@ -104,4 +110,33 @@ class Algorithms
return $newUrl;
}
+
+ private static function fixEncoding($text): string
+ {
+ return iconv(mb_detect_encoding($text, mb_detect_order(), true), 'UTF-8', $text);
+ }
+
+ private static function mb_strtok($str, $delimiters)
+ {
+ $pos = 0;
+ $string = $str;
+
+ $token = '';
+
+ while ($pos < mb_strlen($string)) {
+ $char = mb_substr($string, $pos, 1);
+ $pos++;
+ if (mb_strpos($delimiters, $char) === FALSE) {
+ $token .= $char;
+ } else if ($token !== '') {
+ return $token;
+ }
+ }
+
+ if ($token !== '') {
+ return $token;
+ }
+
+ return false;
+ }
} \ No newline at end of file