Diffstat (limited to 'crawler')
-rw-r--r--  crawler/Algorithms.php        142
-rw-r--r--  crawler/CrawlController.php    63
-rw-r--r--  crawler/WebRequest.php         71
-rw-r--r--  crawler/crawler.php             5
4 files changed, 0 insertions, 281 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
deleted file mode 100644
index 6c8d513..0000000
--- a/crawler/Algorithms.php
+++ /dev/null
@@ -1,142 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-
-/**
- * User: Marvin Borner
- * Date: 16/09/2018
- * Time: 21:51
- */
-
-require_once 'CrawlController.php';
-
-class Algorithms
-{
- public static function getUrlInfo($path): array
- {
- $urlInfo = [];
-
- $urlInfo['title'] = strip_tags($path->query('//title')[0]->textContent);
- foreach ($path->query('//html') as $language) {
- $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
- }
- foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
- $urlInfo['description'] = strip_tags($description->getAttribute('content'));
- }
-
- // Fix empty information
- if (!isset($urlInfo['description'])) {
- $urlInfo['description'] = '';
- foreach ($path->query('//p') as $text) {
- if (mb_strlen($urlInfo['description']) < 350) {
- $urlInfo['description'] .= $text->textContent . ' ';
- }
- }
- }
- if (empty($urlInfo['title'])) {
- $urlInfo['title'] = '';
- if (mb_strlen($urlInfo['title']) < 350) {
- $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
- }
- }
-
- print "\t\e[92mFound data: " . $urlInfo['title'] . "\n";
-
- return $urlInfo;
- }
-
- public static function getLinks($path): array
- {
- $allLinks = [];
-
- foreach ($path->query('//a') as $link) {
- $linkHref = $link->getAttribute('href');
- $href = self::cleanUrl($linkHref);
- $allLinks[] = $href;
- }
-
- return array_unique($allLinks);
- }
-
- public static function createPathFromHtml($content): \DOMXPath
- {
- $dom = new DOMDocument();
- libxml_use_internal_errors(true);
- $dom->loadHTML($content);
- libxml_use_internal_errors(false);
- return new DOMXPath($dom);
- }
-
- public static function cleanUrl($url): string
- {
- $newUrl = self::fixEncoding(ltrim($url)); // trim whitespaces
-
- // normally only for links/href
- if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || mb_strpos($newUrl, 'http') !== 0) {
- if (mb_strpos($newUrl, 'www') === 0) {
- $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
- } else if (mb_strpos($newUrl, 'javascript:') === 0 || mb_strpos($newUrl, 'mailto') === 0) {
- $newUrl = CrawlController::$currentlyCrawled; // fixes javascript void links
- } else if (mb_strpos($newUrl, '../') === 0) {
- $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
- $backCount = mb_substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
- if ($backCount >= 1) {
- $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
- }
- } else if (mb_strpos($newUrl, '/') === 0) {
- $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
- $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
- } else {
- $newUrl = '/' . CrawlController::$currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
- }
- }
-
- // if it's pure domain without slash (prevents duplicate domains because of slash)
- if (preg_match('/\w+\.\w{2,3}$/', $newUrl)) {
- $newUrl .= '/';
- }
-
- // strip some things
- $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
- $newUrl = self::mb_strtok($newUrl, '?'); // parameters
- $newUrl = self::mb_strtok($newUrl, '#'); // hash fragments
-
- if (mb_strpos($newUrl, '/') === 0) {
- $newUrl = mb_substr($newUrl, 1); // remove first slash from domain, which could have been added
- }
-
- if ($url !== $newUrl) {
- print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
- }
-
- return $newUrl;
- }
-
- private static function fixEncoding($text): string
- {
- return iconv(mb_detect_encoding($text, mb_detect_order(), true), 'UTF-8', $text);
- }
-
- private static function mb_strtok($str, $delimiters)
- {
- $pos = 0;
- $string = $str;
-
- $token = '';
-
- while ($pos < mb_strlen($string)) {
- $char = mb_substr($string, $pos, 1);
- $pos++;
- if (mb_strpos($delimiters, $char) === FALSE) {
- $token .= $char;
- } else if ($token !== '') {
- return $token;
- }
- }
-
- if ($token !== '') {
- return $token;
- }
-
- return false;
- }
-}
\ No newline at end of file
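
Note: the deleted Algorithms::cleanUrl() above resolved relative hrefs with a chain of string-prefix special cases and a '../' counter it marked as broken in its own TODO. A minimal sketch of the same idea built on parse_url() is shown below; resolveHref() is a hypothetical helper, the base URL is assumed to be absolute, and the scheme-less 'www.' case plus the query/fragment stripping from the original are left out.

<?php
// Hypothetical helper sketching what Algorithms::cleanUrl() tried to do:
// resolve an href against the page it was found on. Not part of this repository.
function resolveHref(string $base, string $href): string
{
    $href = trim($href);
    // Non-navigable schemes were mapped back to the current URL in the old code.
    if ($href === '' || preg_match('#^(javascript:|mailto:)#i', $href)) {
        return $base;
    }
    if (preg_match('#^https?://#i', $href)) {
        return $href;                               // already absolute
    }
    $p    = parse_url($base);                       // $base is assumed to be absolute
    $root = $p['scheme'] . '://' . $p['host'];
    if (strpos($href, '//') === 0) {
        return $p['scheme'] . ':' . $href;          // protocol-relative
    }
    if (strpos($href, '/') === 0) {
        return $root . $href;                       // root-relative
    }
    // Base directory: the path itself if it ends in '/', otherwise its parent.
    $path = $p['path'] ?? '/';
    $dir  = substr($path, -1) === '/' ? rtrim($path, '/') : rtrim(dirname($path), '/');
    // Resolve '.' and '..' segments against that directory.
    $segments = [];
    foreach (explode('/', $dir . '/' . $href) as $seg) {
        if ($seg === '' || $seg === '.') {
            continue;
        }
        if ($seg === '..') {
            array_pop($segments);
            continue;
        }
        $segments[] = $seg;
    }
    return $root . '/' . implode('/', $segments);
}
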
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
deleted file mode 100644
index 5b20b75..0000000
--- a/crawler/CrawlController.php
+++ /dev/null
@@ -1,63 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-/**
- * User: Marvin Borner
- * Date: 14/09/2018
- * Time: 23:48
- */
-
-require_once '../database/mysqlConf.inc';
-require_once '../database/Database.php';
-require_once 'WebRequest.php';
-require_once 'Algorithms.php';
-
-class CrawlController
-{
- public static $currentlyCrawled;
-
- public static function start($url = '')
- {
- set_time_limit(3600000);
-
- self::$currentlyCrawled = $url;
-
- while (true) {
- self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
- }
- }
-
- private static function crawl($url)
- {
- if ($url !== '' && Database::alreadyCrawled($url)) {
- Database::removeFromQueue(self::$currentlyCrawled);
- self::$currentlyCrawled = Database::getFromQueue('DESC');
- } else {
- $requestResponse = WebRequest::getContent($url);
- if ($requestResponse) {
- self::$currentlyCrawled = $requestResponse[3];
- if (preg_match('/2\d\d/', $requestResponse[1])) { // success
- print 'Download Size: ' . $requestResponse[2];
-
- $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
-
- $urlInfo = Algorithms::getUrlInfo($htmlPath);
- Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
-
- $allLinks = Algorithms::getLinks($htmlPath);
- Database::insertIntoQueue($allLinks);
-
- Database::removeFromQueue(self::$currentlyCrawled);
- self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
- print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
- } else {
- print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
-
- Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
- Database::removeFromQueue(self::$currentlyCrawled);
- self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
- print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
- }
- }
- }
- }
-}
\ No newline at end of file
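
Note: a condensed sketch of the loop the deleted CrawlController implemented is given below. crawlForever() is an illustrative name, the Database/WebRequest/Algorithms calls mirror the deleted classes, and the already-crawled check plus the ASC/DESC queue distinction from the original are simplified away.

<?php
// Condensed sketch of the crawl loop the deleted CrawlController implemented.
// Illustrative only: error handling and logging are simplified.
function crawlForever(string $seed): void
{
    $url = $seed;
    while (true) {
        $response = WebRequest::getContent($url);          // false if robots.txt disallows
        if ($response !== false) {
            [$html, $status, , $effectiveUrl] = $response;  // effective URL after redirects
            if ($status >= 200 && $status < 300) {
                $xpath = Algorithms::createPathFromHtml($html);
                Database::saveUrlData($effectiveUrl, Algorithms::getUrlInfo($xpath));
                Database::insertIntoQueue(Algorithms::getLinks($xpath));
            } else {
                Database::urlHasError($effectiveUrl);       // don't re-crawl failing URLs
            }
            $url = $effectiveUrl;
        }
        Database::removeFromQueue($url);
        $url = Database::getFromQueue('DESC');              // pull the next queued URL
    }
}
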
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
deleted file mode 100644
index 6053bae..0000000
--- a/crawler/WebRequest.php
+++ /dev/null
@@ -1,71 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-
-/**
- * User: Marvin Borner
- * Date: 16/09/2018
- * Time: 21:53
- */
-class WebRequest
-{
- private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
-
- public static function getContent($url)
- {
- if (self::checkRobotsTxt($url)) {
- $curl = curl_init($url);
- curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
- curl_setopt($curl, CURLOPT_ENCODING, '');
- curl_setopt($curl, CURLOPT_TIMEOUT, 5);
- curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
- curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
- $content = curl_exec($curl);
- $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
- $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
- $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
- curl_close($curl);
-
- return [$content, $responseCode, $downloadSize, $updatedUrl];
- }
-
- return false;
- }
-
- public static function checkRobotsTxt($url): bool
- {
- $userAgent = self::$userAgent;
- $parsed = parse_url($url);
- $agents = array(preg_quote('*', NULL));
- if ($userAgent) {
- $agents[] = preg_quote($userAgent, NULL);
- }
- $agents = implode('|', $agents);
- $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
- if (empty($robotsTxt)) {
- return true;
- }
- $rules = array();
- $ruleApplies = false;
- foreach ($robotsTxt as $line) {
- if (!$line = trim($line)) {
- continue;
- }
- if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
- $ruleApplies = preg_match("/($agents)/i", $match[1]);
- }
- if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
- if (!$regs[1]) {
- return true;
- }
- $rules[] = preg_quote(trim($regs[1]), '/');
- }
- }
- foreach ($rules as $rule) {
- if (preg_match("/^$rule/", $parsed['path'])) {
- return false;
- }
- }
- return true;
- }
-}
\ No newline at end of file
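
Note: the deleted WebRequest::getContent() set its cURL options one call at a time. A sketch of the same fetch using curl_setopt_array() is shown below; fetchPage() is an illustrative name and the robots.txt check above is assumed to have passed already.

<?php
// Sketch of the deleted fetch with the options grouped in one array.
// Illustrative only; the caller is assumed to have checked robots.txt first.
function fetchPage(string $url, string $userAgent)
{
    $curl = curl_init($url);
    curl_setopt_array($curl, [
        CURLOPT_USERAGENT      => $userAgent,
        CURLOPT_ENCODING       => '',    // let curl negotiate any supported encoding
        CURLOPT_TIMEOUT        => 5,     // seconds
        CURLOPT_FOLLOWLOCATION => true,  // follow 301/302 redirects
        CURLOPT_RETURNTRANSFER => true,  // return the body instead of printing it
    ]);
    $content  = curl_exec($curl);
    $status   = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    $finalUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);  // updated on 301/302
    curl_close($curl);

    return $content === false ? false : [$content, $status, $finalUrl];
}
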
diff --git a/crawler/crawler.php b/crawler/crawler.php
deleted file mode 100644
index 1e121e4..0000000
--- a/crawler/crawler.php
+++ /dev/null
@@ -1,5 +0,0 @@
-<?php
-header('Content-type: text/plain; charset=utf-8');
-require_once 'CrawlController.php';
-
-CrawlController::start($argv[1]);
\ No newline at end of file
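
Note: crawler.php was the command-line entry point; it took the seed URL as its first argument, for example (URL illustrative):

    php crawler/crawler.php 'https://example.com/'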