<?php
/**
 * User: Marvin Borner
 * Date: 16/09/2018
 * Time: 21:51
 */
class Algorithms
{
    public static function getUrlInfo($path): array
    {
        $urlInfo = [];

        $titleNode = $path->query('//title')->item(0);
        $urlInfo['title'] = $titleNode !== null ? strip_tags($titleNode->textContent) : '';

        foreach ($path->query('//html') as $language) {
            $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
        }

        foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
            $urlInfo['description'] = strip_tags($description->getAttribute('content'));
        }

        // Fix empty information
        if (!isset($urlInfo['description'])) {
            // no meta description -> build one from roughly the first 350 characters of paragraph text
            $urlInfo['description'] = '';
            foreach ($path->query('//p') as $text) {
                if (strlen($urlInfo['description']) < 350) {
                    $urlInfo['description'] .= $text->textContent . ' ';
                }
            }
        }
        if (empty($urlInfo['title'])) {
            // no <title> -> fall back to the first <h1>, if any
            $heading = $path->query('//h1')->item(0);
            $urlInfo['title'] = $heading !== null ? $heading->textContent : '';
        }

        print "\t\e[92mFound data: " . $urlInfo['title'] . "\n";

        return $urlInfo;
    }
    public static function getLinks($path): array
    {
        $allLinks = [];
        foreach ($path->query('//a') as $link) {
            $linkHref = $link->getAttribute('href');
            if ($linkHref !== 'javascript:void(0)') {
                $href = self::cleanUrl($linkHref); // cleanUrl is a static method of this class
                $allLinks[] = $href;
            }
        }
        return array_unique($allLinks);
    }
    public static function createPathFromHtml($content): \DOMXPath
    {
        $dom = new DOMDocument();
        libxml_use_internal_errors(true); // suppress warnings from malformed real-world HTML
        $dom->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors(false);
        return new DOMXPath($dom);
    }
    public static function cleanUrl($url): string
    {
        global $currentlyCrawled;
        $newUrl = ltrim($url); // trim whitespace

        // normally only for links/href
        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || strpos($newUrl, 'http') !== 0) {
            if (strpos($newUrl, 'www') === 0) {
                $newUrl = 'http://' . $newUrl; // fixes e.g. "www.example.com" by prepending http://
            } else if (strpos($newUrl, 'javascript:') === 0) {
                $newUrl = ''; // fixes javascript void links
            } else if (strpos($newUrl, '../') === 0) {
                // fixes e.g. "../sub_dir" by going up one directory per leading "../"
                $parsedUrl = parse_url($currentlyCrawled);
                $backCount = substr_count($newUrl, '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host']
                    . dirname($parsedUrl['path'] ?? '', $backCount) . '/' . str_replace('../', '', $newUrl);
            } else if (strpos($newUrl, '/') === 0) {
                $parsedUrl = parse_url($currentlyCrawled);
                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes e.g. "/sub_dir" by replacing the path
            } else {
                $newUrl = $currentlyCrawled . $newUrl; // fixes e.g. "sub_dir" by prepending the currently crawled url
            }
        }

        // if it's a pure domain without a trailing slash, add one (prevents duplicates caused by the missing slash)
        if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
            $newUrl .= '/';
        }

        // strip some things
        $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
        $newUrl = strtok($newUrl, '?'); // parameters
        $newUrl = strtok($newUrl, '#'); // hash fragments

        if ($url !== $newUrl) {
            print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
        }
        return $newUrl;
    }
}
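
/*
 * Example usage (a minimal sketch, not part of the original class): the
 * $currentlyCrawled global and the file_get_contents() fetch are assumptions
 * about how the surrounding crawler drives these helpers.
 *
 *   $currentlyCrawled = 'http://example.com/';
 *   $html = file_get_contents($currentlyCrawled);
 *   $path = Algorithms::createPathFromHtml($html);
 *   $info = Algorithms::getUrlInfo($path);   // title, language, description
 *   $links = Algorithms::getLinks($path);    // cleaned, de-duplicated hrefs
 */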