<?php
/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:51
 */
class Algorithms
{
    public static function getUrlInfo($path): array
    {
        $urlInfo = [];

        $titleNode = $path->query('//title')->item(0);
        $urlInfo['title'] = $titleNode !== null ? strip_tags($titleNode->textContent) : '';

        foreach ($path->query('//html') as $language) {
            $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
        }

        foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
            $urlInfo['description'] = strip_tags($description->getAttribute('content'));
        }

        // Fix empty information
        if (!isset($urlInfo['description'])) {
            // no meta description -> build one from roughly the first 350 characters of paragraph text
            $urlInfo['description'] = '';
            foreach ($path->query('//p') as $text) {
                if (strlen($urlInfo['description']) < 350) {
                    $urlInfo['description'] .= $text->textContent . ' ';
                }
            }
        }
        if (empty($urlInfo['title'])) {
            // no <title> -> fall back to the first <h1>, if any
            $heading = $path->query('//h1')->item(0);
            $urlInfo['title'] = $heading !== null ? $heading->textContent : '';
        }

        print "\t\e[92mFound data: " . $urlInfo['title'] . "\n";

        return $urlInfo;
    }
    public static function getLinks($path): array
    {
        $allLinks = [];
        foreach ($path->query('//a') as $link) {
            $linkHref = $link->getAttribute('href');
            if ($linkHref !== 'javascript:void(0)') {
                $href = self::cleanUrl($linkHref); // cleanUrl is a static method of this class
                $allLinks[] = $href;
            }
        }
        return array_unique($allLinks);
    }
    public static function createPathFromHtml($content): \DOMXPath
    {
        $dom = new DOMDocument();
        libxml_use_internal_errors(true); // suppress warnings from malformed real-world HTML
        $dom->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors(false);
        return new DOMXPath($dom);
    }
    public static function cleanUrl($url): string
    {
        global $currentlyCrawled;
        $newUrl = ltrim($url); // trim whitespace

        // normally only for links/href
        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || strpos($newUrl, 'http') !== 0) {
            if (strpos($newUrl, 'www') === 0) {
                $newUrl = 'http://' . $newUrl; // fixes e.g. "www.example.com" by prepending http://
            } else if (strpos($newUrl, 'javascript:') === 0) {
                $newUrl = ''; // fixes javascript void links
            } else if (strpos($newUrl, '../') === 0) {
                // fixes e.g. "../sub_dir" by going up one directory per leading "../"
                $parsedUrl = parse_url($currentlyCrawled);
                $backCount = substr_count($newUrl, '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host']
                    . dirname($parsedUrl['path'] ?? '', $backCount) . '/' . str_replace('../', '', $newUrl);
            } else if (strpos($newUrl, '/') === 0) {
                $parsedUrl = parse_url($currentlyCrawled);
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes e.g. "/sub_dir" by replacing the path
            } else {
                $newUrl = $currentlyCrawled . $newUrl; // fixes e.g. "sub_dir" by prepending the currently crawled url
            }
        }

        // if it's a pure domain without a trailing slash, add one (prevents duplicates caused by the missing slash)
        if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
            $newUrl .= '/';
        }

        // strip some things
        $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
        $newUrl = strtok($newUrl, '?'); // parameters
        $newUrl = strtok($newUrl, '#'); // hash fragments

        if ($url !== $newUrl) {
            print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
        }
        return $newUrl;
    }
}
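
/*
 * Example usage (a minimal sketch, not part of the original class): the
 * $currentlyCrawled global and the file_get_contents() fetch are assumptions
 * about how the surrounding crawler drives these helpers.
 *
 *   $currentlyCrawled = 'http://example.com/';
 *   $html = file_get_contents($currentlyCrawled);
 *   $path = Algorithms::createPathFromHtml($html);
 *   $info = Algorithms::getUrlInfo($path);   // title, language, description
 *   $links = Algorithms::getLinks($path);    // cleaned, de-duplicated hrefs
 */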