author | Marvin Borner | 2018-09-19 17:43:33 +0200
committer | Marvin Borner | 2018-09-19 17:43:33 +0200
commit | 4f751868837a0f78423c927c6cd4aeb24bf37c00 (patch)
tree | 25b3462724a03bf0081fef410f2484d35bf43fcf
parent | 4a57c1fda6d50d655eb48ffb712bd43729ea5c10 (diff)
Finally fixed almost all bugs :zap: :fire:
-rw-r--r-- | crawler/Algorithms.php | 79
-rw-r--r-- | crawler/CrawlController.php | 57
-rw-r--r-- | crawler/Database.php | 3
-rw-r--r-- | crawler/WebRequest.php | 74
-rw-r--r-- | crawler/crawler.php | 1
5 files changed, 148 insertions, 66 deletions
diff --git a/crawler/Algorithms.php b/crawler/Algorithms.php
index 73d2ecc..6c8d513 100644
--- a/crawler/Algorithms.php
+++ b/crawler/Algorithms.php
@@ -1,10 +1,14 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
+
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
  * Time: 21:51
  */
 
+require_once 'CrawlController.php';
+
 class Algorithms
 {
     public static function getUrlInfo($path): array
@@ -23,14 +27,14 @@ class Algorithms
         if (!isset($urlInfo['description'])) {
             $urlInfo['description'] = '';
             foreach ($path->query('//p') as $text) {
-                if (strlen($urlInfo['description']) < 350) {
+                if (mb_strlen($urlInfo['description']) < 350) {
                     $urlInfo['description'] .= $text->textContent . ' ';
                 }
             }
         }
         if (empty($urlInfo['title'])) {
             $urlInfo['title'] = '';
-            if (strlen($urlInfo['title']) < 350) {
+            if (mb_strlen($urlInfo['title']) < 350) {
                 $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
             }
         }
@@ -46,10 +50,8 @@ class Algorithms
         foreach ($path->query('//a') as $link) {
             $linkHref = $link->getAttribute('href');
-            if ($linkHref !== 'javascript:void(0)') {
-                $href = self::cleanUrl($linkHref);
-                $allLinks[] = $href;
-            }
+            $href = self::cleanUrl($linkHref);
+            $allLinks[] = $href;
         }
 
         return array_unique($allLinks);
@@ -66,25 +68,25 @@ class Algorithms
     public static function cleanUrl($url): string
     {
-        global $currentlyCrawled;
-
-        $newUrl = ltrim($url); // trim whitespaces
+        $newUrl = self::fixEncoding(ltrim($url)); // trim whitespaces
 
         // normally only for links/href
-        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || (strpos($newUrl, 'http') !== 0)) {
-            if (strpos($newUrl, 'www') === 0) {
+        if (filter_var($newUrl, FILTER_VALIDATE_URL) === false || mb_strpos($newUrl, 'http') !== 0) {
+            if (mb_strpos($newUrl, 'www') === 0) {
                 $newUrl = 'http://' . $newUrl; // fixes eg. "www.example.com" by adding http:// at beginning
-            } else if (strpos($newUrl, 'javascript:') === 0) {
-                $newUrl = ''; // fixes javascript void links
-            } else if (strpos($newUrl, '../') === 0) {
-                $parsedUrl = parse_url($currentlyCrawled);
-                $backCount = substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
-                $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
-            } else if (strpos($newUrl, '/') === 0) {
-                $parsedUrl = parse_url($currentlyCrawled);
+            } else if (mb_strpos($newUrl, 'javascript:') === 0 || mb_strpos($newUrl, 'mailto') === 0) {
+                $newUrl = CrawlController::$currentlyCrawled; // fixes javascript void links
+            } else if (mb_strpos($newUrl, '../') === 0) {
+                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
+                $backCount = mb_substr_count($parsedUrl['path'], '../'); // TODO: Better back counter (../../foo/../bar isn't parsed correctly)
+                if ($backCount >= 1) {
+                    $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . dirname($parsedUrl['path'] ?? '', $backCount) . $newUrl; // fixes eg. "../sub_dir" by going back and adding new path
+                }
+            } else if (mb_strpos($newUrl, '/') === 0) {
+                $parsedUrl = parse_url(CrawlController::$currentlyCrawled);
                 $newUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'] . $newUrl; // fixes eg. "/sub_dir" by removing path and adding new path
             } else {
-                $newUrl = $currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
+                $newUrl = '/' . CrawlController::$currentlyCrawled . $newUrl; // fixes eg. "sub_dir" by adding currently crawled url at beginning
             }
         }
@@ -95,8 +97,12 @@ class Algorithms
         // strip some things
         $newUrl = preg_replace('/([^:])(\/{2,})/', '$1/', $newUrl); // double slashes
-        $newUrl = strtok($newUrl, '?'); // parameters
-        $newUrl = strtok($newUrl, '#'); // hash fragments
+        $newUrl = self::mb_strtok($newUrl, '?'); // parameters
+        $newUrl = self::mb_strtok($newUrl, '#'); // hash fragments
+
+        if (mb_strpos($newUrl, '/') === 0) {
+            $newUrl = mb_substr($newUrl, 1); // remove first slash from domain, which could have been added
+        }
 
         if ($url !== $newUrl) {
             print "\t\e[92mChanged " . $url . ' to ' . $newUrl . "\n";
@@ -104,4 +110,33 @@ class Algorithms
 
         return $newUrl;
     }
+
+    private static function fixEncoding($text): string
+    {
+        return iconv(mb_detect_encoding($text, mb_detect_order(), true), 'UTF-8', $text);
+    }
+
+    private static function mb_strtok($str, $delimiters)
+    {
+        $pos = 0;
+        $string = $str;
+
+        $token = '';
+
+        while ($pos < mb_strlen($string)) {
+            $char = mb_substr($string, $pos, 1);
+            $pos++;
+            if (mb_strpos($delimiters, $char) === FALSE) {
+                $token .= $char;
+            } else if ($token !== '') {
+                return $token;
+            }
+        }
+
+        if ($token !== '') {
+            return $token;
+        }
+
+        return false;
+    }
 }
\ No newline at end of file
diff --git a/crawler/CrawlController.php b/crawler/CrawlController.php
index 97edf25..53d5aac 100644
--- a/crawler/CrawlController.php
+++ b/crawler/CrawlController.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 /**
  * User: Marvin Borner
  * Date: 14/09/2018
@@ -12,48 +13,50 @@ require_once 'Algorithms.php';
 
 class CrawlController
 {
-    private static $currentlyCrawled;
+    public static $currentlyCrawled;
 
     public static function start($url = '')
     {
         set_time_limit(3600000);
-        error_reporting(E_ERROR | E_PARSE);
+
+        self::$currentlyCrawled = $url;
 
         while (true) {
-            self::$currentlyCrawled = $url;
-            self::crawl(self::$currentlyCrawled);
+            self::crawl(Algorithms::cleanUrl(self::$currentlyCrawled));
         }
     }
 
     private static function crawl($url)
     {
-        if (Database::alreadyCrawled(Algorithms::cleanUrl($url))) {
+        if ($url !== '' && Database::alreadyCrawled($url)) {
             Database::removeFromQueue(self::$currentlyCrawled);
             self::$currentlyCrawled = Database::getFromQueue('DESC');
         } else {
             $requestResponse = WebRequest::getContent($url);
-            self::$currentlyCrawled = $requestResponse[3];
-            if (preg_match('/2\d\d/', $requestResponse[1])) { // success
-                print 'Download Size: ' . $requestResponse[2];
-
-                $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
-
-                $urlInfo = Algorithms::getUrlInfo($htmlPath);
-                Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
-
-                $allLinks = Algorithms::getLinks($htmlPath);
-                Database::insertIntoQueue($allLinks);
-
-                Database::removeFromQueue(self::$currentlyCrawled);
-                self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
-                print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
-            } else {
-                print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
-
-                Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
-                Database::removeFromQueue(self::$currentlyCrawled);
-                self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
-                print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+            if ($requestResponse) {
+                self::$currentlyCrawled = $requestResponse[3];
+                if (preg_match('/2\d\d/', $requestResponse[1])) { // success
+                    print 'Download Size: ' . $requestResponse[2];
+
+                    $htmlPath = Algorithms::createPathFromHtml($requestResponse[0]);
+
+                    $urlInfo = Algorithms::getUrlInfo($htmlPath);
+                    Database::saveUrlData(self::$currentlyCrawled, $urlInfo);
+
+                    $allLinks = Algorithms::getLinks($htmlPath);
+                    Database::insertIntoQueue($allLinks);
+
+                    Database::removeFromQueue(self::$currentlyCrawled);
+                    self::$currentlyCrawled = Database::getFromQueue('DESC'); // set new from start
+                    print "\e[96mFinished previous url - crawling: " . self::$currentlyCrawled . "\n";
+                } else {
+                    print "\t\e[91mError " . $requestResponse[1] . ' ' . self::$currentlyCrawled . "\n";
+
+                    Database::urlHasError(self::$currentlyCrawled); // prevents re-crawling of error url
+                    Database::removeFromQueue(self::$currentlyCrawled);
+                    self::$currentlyCrawled = Database::getFromQueue('ASC'); // set new from end
+                    print "\e[91mFinished previous url with error - crawling: " . self::$currentlyCrawled . "\n";
+                }
             }
         }
     }
diff --git a/crawler/Database.php b/crawler/Database.php
index 0d500ad..f27803e 100644
--- a/crawler/Database.php
+++ b/crawler/Database.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
@@ -41,7 +42,7 @@ class Database
 
     public static function alreadyCrawled($url): bool
     {
-        print "\t\e[96mChecking if url already has been crawled " . $url . "\n";
+        print "\t\e[96mChecking if url has already been crawled " . $url . "\n";
         $hash = md5($url);
         $conn = self::initDbConnection();
         $checkStmt = $conn->prepare('(SELECT null FROM url_data WHERE hash = :hash) UNION (SELECT null FROM error_url WHERE hash = :hash)');
diff --git a/crawler/WebRequest.php b/crawler/WebRequest.php
index f25f31d..6053bae 100644
--- a/crawler/WebRequest.php
+++ b/crawler/WebRequest.php
@@ -1,29 +1,71 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
+
 /**
  * User: Marvin Borner
  * Date: 16/09/2018
  * Time: 21:53
  */
-
 class WebRequest
 {
-    public static function getContent($url): array
+    private static $userAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
+
+    public static function getContent($url)
     {
-        $curl = curl_init($url);
-        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
-        curl_setopt($curl, CURLOPT_ENCODING, '');
-        curl_setopt($curl, CURLOPT_TIMEOUT, 5);
-        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
-        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
-        curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
-        $content = curl_exec($curl);
-        $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
-        $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
-        if (preg_match('~Location: (.*)~i', $content, $match)) {
-            $updatedUrl = trim($match[1]); // update url on 301/302
+        if (self::checkRobotsTxt($url)) {
+            $curl = curl_init($url);
+            curl_setopt($curl, CURLOPT_USERAGENT, self::$userAgent);
+            curl_setopt($curl, CURLOPT_ENCODING, '');
+            curl_setopt($curl, CURLOPT_TIMEOUT, 5);
+            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
+            curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
+            curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
+            $content = curl_exec($curl);
+            $responseCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);
+            $downloadSize = curl_getinfo($curl, CURLINFO_SIZE_DOWNLOAD) / 1000 . "KB\n";
+            $updatedUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // update on 301/302
+            curl_close($curl);
+
+            return [$content, $responseCode, $downloadSize, $updatedUrl];
         }
-        curl_close($curl);
-        return [$content, $responseCode, $downloadSize, $updatedUrl ?? $url];
+
+        return false;
+    }
+
+    public static function checkRobotsTxt($url): bool
+    {
+        $userAgent = self::$userAgent;
+        $parsed = parse_url($url);
+        $agents = array(preg_quote('*', NULL));
+        if ($userAgent) {
+            $agents[] = preg_quote($userAgent, NULL);
+        }
+        $agents = implode('|', $agents);
+        $robotsTxt = @file("http://{$parsed['host']}/robots.txt");
+        if (empty($robotsTxt)) {
+            return true;
+        }
+        $rules = array();
+        $ruleApplies = false;
+        foreach ($robotsTxt as $line) {
+            if (!$line = trim($line)) {
+                continue;
+            }
+            if (preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
+                $ruleApplies = preg_match("/($agents)/i", $match[1]);
+            }
+            if ($ruleApplies && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
+                if (!$regs[1]) {
+                    return true;
+                }
+                $rules[] = preg_quote(trim($regs[1]), '/');
+            }
+        }
+        foreach ($rules as $rule) {
+            if (preg_match("/^$rule/", $parsed['path'])) {
+                return false;
+            }
+        }
+        return true;
     }
 }
\ No newline at end of file
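WebRequest now consults the host's robots.txt before downloading a page and takes the final redirect target from CURLINFO_EFFECTIVE_URL instead of grepping Location headers out of the response body. A rough sketch of the same Disallow prefix-matching idea, applied to an in-memory robots.txt rather than a live fetch (not the commit's code; the helper name, rules and paths are invented for illustration):

<?php
// Illustrative sketch only: decide whether a path may be crawled by
// honouring the Disallow rules of the wildcard (*) user-agent group.
function isPathAllowed(string $path, array $robotsTxtLines): bool
{
    $disallowed = [];
    $groupApplies = false;
    foreach ($robotsTxtLines as $line) {
        $line = trim($line);
        if (preg_match('/^User-agent:\s*(.*)/i', $line, $m)) {
            $groupApplies = ($m[1] === '*'); // this sketch only honours the wildcard group
        }
        if ($groupApplies && preg_match('/^Disallow:\s*(.+)/i', $line, $m)) {
            $disallowed[] = trim($m[1]);
        }
    }
    foreach ($disallowed as $prefix) {
        if (strpos($path, $prefix) === 0) { // Disallow rules match by path prefix
            return false;
        }
    }
    return true;
}

$robots = ['User-agent: *', 'Disallow: /private/'];
var_dump(isPathAllowed('/private/page.html', $robots)); // bool(false)
var_dump(isPathAllowed('/index.html', $robots));        // bool(true)

The commit's checkRobotsTxt additionally matches the crawler's own user-agent string against each group and treats a missing or empty robots.txt as "allow everything".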
diff --git a/crawler/crawler.php b/crawler/crawler.php
index b5df1dc..1e121e4 100644
--- a/crawler/crawler.php
+++ b/crawler/crawler.php
@@ -1,4 +1,5 @@
 <?php
+header('Content-type: text/plain; charset=utf-8');
 require_once 'CrawlController.php';
 
 CrawlController::start($argv[1]);
\ No newline at end of file
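The entry point itself only gains the Content-type header, so the crawler is still started from the command line with a seed URL as its first argument, for example (run from the repository root; the URL is hypothetical):

php crawler/crawler.php 'https://www.example.com/'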