<?php
header('Content-Type: text/plain; charset=utf-8');

/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:53
 */
class WebRequest
{
    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

    /**
     * Fetches a URL via cURL if the host's robots.txt permits it.
     *
     * @return array|false [$content, $responseCode, $downloadSize, $updatedUrl], or false when crawling is disallowed
     */
    public static function getContent($url)
    {
        if (self::checkRobotsTxt($url)) {
            $curl = curl_init($url);
            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
            curl_setopt($curl, CURLOPT_ENCODING, ''); // accept any encoding cURL supports
            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
            $content = curl_exec($curl); // false on network error or timeout
            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . ' KB';
            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // final URL after 301/302 redirects
            curl_close($curl);
            return [$content, $responseCode, $downloadSize, $updatedUrl];
        }
        return false;
    }
    /**
     * Checks whether the host's robots.txt allows crawling the URL's path.
     */
    public static function checkRobotsTxt($url): bool
    {
        $parsed = parse_url($url);
        if (empty($parsed['host'])) {
            return false; // no resolvable host, refuse to crawl
        }
        $scheme = $parsed['scheme'] ?? 'http'; // keep https if the URL uses it
        $robotsTxt = @file("$scheme://{$parsed['host']}/robots.txt");
        if (empty($robotsTxt)) {
            return true; // no robots.txt means everything is allowed
        }
        $rules = array();
        $ruleApplies = false;
        foreach ($robotsTxt as $line) {
            if (!$line = trim($line)) {
                continue;
            }
            // A User-agent line starts a new rule group; it applies to us if it
            // is the wildcard '*' or a product token contained in our UA string.
            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
                $agent = trim($match[1]);
                $ruleApplies = $agent === '*' || stripos(self::$userAgent, $agent) !== false;
            }
            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
                $path = trim($regs[1]);
                if ($path === '') {
                    return true; // an empty Disallow allows everything
                }
                $rules[] = preg_quote($path, '/');
            }
        }
        foreach ($rules as $rule) {
            if (preg_match("/^$rule/", $parsed['path'] ?? '/')) {
                return false; // the requested path matches a Disallow rule
            }
        }
        return true;
    }
}
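
// A minimal usage sketch (not part of the original class): fetch a page and
// inspect the result tuple. The URL here is a hypothetical example.
$result = WebRequest::getContent('https://example.com/');
if ($result !== false) {
    [$content, $responseCode, $downloadSize, $updatedUrl] = $result;
    echo "HTTP $responseCode, $downloadSize, final URL: $updatedUrl\n";
} else {
    echo "Crawling disallowed by robots.txt\n";
}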