summaryrefslogtreecommitdiff
path: root/crawler/WebRequest.php
blob: 6053bae2e21bded605c841d3f36e459cfd80bd89 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
<?php
header('Content-type: text/plain; charset=utf-8');

/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:53
 */
class WebRequest
{
    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

    public static function getContent($url)
    {
        if (self::checkRobotsTxt($url)) {
            $curl = curl_init($url);
            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
            curl_setopt($curl, CURLOPT_ENCODING, '');
            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
            $content = curl_exec($curl);
            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
            curl_close($curl);

            return [$content, $responseCode, $downloadSize, $updatedUrl];
        }

        return false;
    }

    public static function checkRobotsTxt($url): bool
    {
        $userAgent = self::$userAgent;
        $parsed = parse_url($url);
        $agents = array(preg_quote('*', NULL));
        if ($userAgent) {
            $agents[] = preg_quote($userAgent, NULL);
        }
        $agents = implode('|', $agents);
        $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
        if (empty($robotsTxt)) {
            return true;
        }
        $rules = array();
        $ruleApplies = false;
        foreach ($robotsTxt as $line) {
            if (!$line = trim($line)) {
                continue;
            }
            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
                $ruleApplies = preg_match("/($agents)/i", $match[1]);
            }
            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
                if (!$regs[1]) {
                    return true;
                }
                $rules[] = preg_quote(trim($regs[1]), '/');
            }
        }
        foreach ($rules as $rule) {
            if (preg_match("/^$rule/", $parsed['path'])) {
                return false;
            }
        }
        return true;
    }
}