diff options
Diffstat (limited to 'crawler/Algorithms.php')
-rw-r--r-- | crawler/Algorithms.php | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php new file mode 100644 index 0000000..66e6461 --- /dev/null +++ b/crawler/Algorithms.php @@ -0,0 +1,107 @@ +<?php +/** + * User: Marvin Borner + * Date: 16/09/2018 + * Time: 21:51 + */ + +class Algorithms +{ + public static function getUrlInfo($path): array + { + $urlInfo = []; + + $urlInfo['title'] = strip_tags($path->query('//title')[0]->textContent); + foreach ($path->query('//html') as $language) { + $urlInfo['language'] = strip_tags($language->getAttribute('lang')); + } + foreach ($path->query('/html/head/meta[@name="description"]') as $description) { + $urlInfo['description'] = strip_tags($description->getAttribute('content')); + } + + // Fix empty information + if (!isset($urlInfo['description'])) { + $urlInfo['description'] = ''; + foreach ($path->query('//p') as $text) { + if (strlen($urlInfo['description']) < 350) { + $urlInfo['description'] .= $text->textContent . ' '; + } + } + } + if (empty($urlInfo['title'])) { + $urlInfo['title'] = ''; + if (strlen($urlInfo['title']) < 350) { + $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' '; + } + } + + print "\t\e[92mFound data: " . $urlInfo['title'] . "\n"; + + return $urlInfo; + } + + public static function getLinks($path): array + { + $allLinks = []; + + foreach ($path->query('//a') as $link) { + $linkHref = $link->getAttribute('href'); + if ($linkHref !== 'javascript:void(0)') { + $href = cleanUrl($linkHref); + $allLinks[] = $href; + } + } + + return array_unique($allLinks); + } + + public static function createPathFromHtml($content): \DOMXPath + { + $dom = new DOMDocument(); + libxml_use_internal_errors(true); + $dom->loadHTML($content); + libxml_use_internal_errors(false); + return new DOMXPath($dom); + } + + public static function cleanUrl($url): string + { + global $currentlyCrawled; + + $newUrl = ltrim($url); // trim whitespaces + + // normally only for links/href + if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || (strpos($newUrl, 'http') !== 0)) { + if (strpos($newUrl, 'www') === 0) { + $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning + } else if (strpos($newUrl, 'javascript:') === 0) { + $newUrl = ''; // fixes javascript void links + } else if (strpos($newUrl, '../') === 0) { + $parsedUrl = parse_url($currentlyCrawled); + $backCount = substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly) + $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path + } else if (strpos($newUrl, '/') === 0) { + $parsedUrl = parse_url($currentlyCrawled); + $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path + } else { + $newUrl = $currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning + } + } + + // if it's pure domain without slash (prevents duplicate domains because of slash) + if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) { + $newUrl .= '/'; + } + + // strip some things + $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes + $newUrl = strtok($newUrl, '?'); // parameters + $newUrl = strtok($newUrl, '#'); // hash fragments + + if ($url !== $newUrl) { + print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n"; + } + + return $newUrl; + } +}
\ No newline at end of file |