From 5230054b9b8f75ee8eb1661be46e8ff5dd8e70c6 Mon Sep 17 00:00:00 2001
From: Marvin Borner
Date: Sat, 15 Sep 2018 20:48:33 +0200
Subject: Applied official guidelines :fire: :zap:

---
 crawler.php    | 73 +++++++++++++++++++++++++++++++++-------------------------
 mysql_conf.inc |  8 +++----
 2 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/crawler.php b/crawler.php
index 503e190..abe79a8 100644
--- a/crawler.php
+++ b/crawler.php
@@ -7,7 +7,7 @@
 
 error_reporting(E_ERROR | E_PARSE);
 
-include "mysql_conf.inc";
+include 'mysql_conf.inc';
 
 $currentUrl = $argv[1];
 
@@ -21,8 +21,8 @@ function crawl($url)
 
     if (!alreadyCrawled(cleanUrl($url))) {
         $requestResponse = getContent($url);
-        if ($requestResponse[1] != 404) {
-            print "Download Size: " . $requestResponse[2];
+        if ($requestResponse[1] !== 404) {
+            print 'Download Size: ' . $requestResponse[2];
 
             $htmlPath = createPathFromHtml($requestResponse[0]);
             $urlInfo = getUrlInfo($htmlPath);
@@ -35,15 +35,13 @@ function crawl($url)
 
     $currentUrl = getFirstFromQueue(); // set new
     removeFromQueue($currentUrl);
-
-    return;
 }
 
 
 function getContent($url)
 {
     $curl = curl_init($url);
-    curl_setopt($curl, CURLOPT_USERAGENT, "Googlebot/2.1 (+http://www.google.com/bot.html)");
+    curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
     curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
     curl_setopt($curl, CURLOPT_BINARYTRANSFER, true);
@@ -59,23 +57,29 @@ function getUrlInfo($path)
 {
     $urlInfo = [];
 
-    $urlInfo["title"] = strip_tags($path->query("//title")[0]->textContent);
-    foreach ($path->query("//html") as $language) $urlInfo["language"] = strip_tags($language->getAttribute("lang"));
-    foreach ($path->query("/html/head/meta[@name=\"description\"]") as $description) $urlInfo["description"] = strip_tags($description->getAttribute("content"));
+    $urlInfo['title'] = strip_tags($path->query('//title')[0]->textContent);
+    foreach ($path->query('//html') as $language) {
+        $urlInfo['language'] = strip_tags($language->getAttribute('lang'));
+    }
+    foreach ($path->query('/html/head/meta[@name="description"]') as $description) {
+        $urlInfo['description'] = strip_tags($description->getAttribute('content'));
+    }
 
     // Fix empty information
-    if (!(isset($urlInfo["description"]))) {
-        $urlInfo["description"] = "";
-        foreach ($path->query("//p") as $text) {
-            if (strlen($urlInfo["description"]) < 350)
-                $urlInfo["description"] .= $text->textContent . " ";
+    if (!isset($urlInfo['description'])) {
+        $urlInfo['description'] = '';
+        foreach ($path->query('//p') as $text) {
+            if (strlen($urlInfo['description']) < 350) {
+                $urlInfo['description'] .= $text->textContent . ' ';
+            }
         }
     }
 
-    if (empty($urlInfo["title"])) {
-        $urlInfo["title"] = "";
-        if (strlen($urlInfo["title"]) < 350)
-            $urlInfo["title"] .= $path->query("//h1")[0]->textContent . " ";
+    if (empty($urlInfo['title'])) {
+        $urlInfo['title'] = '';
+        if (strlen($urlInfo['title']) < 350) {
+            $urlInfo['title'] .= $path->query('//h1')[0]->textContent . ' ';
+        }
     }
 
     return $urlInfo;
@@ -85,9 +89,9 @@ function getLinks($path)
 {
     $allLinks = [];
 
-    foreach ($path->query("//a") as $ink) {
-        $href = cleanUrl($ink->getAttribute("href"));
-        array_push($allLinks, $href);
+    foreach ($path->query('//a') as $ink) {
+        $href = cleanUrl($ink->getAttribute('href'));
+        $allLinks[] = $href;
     }
 
     return array_unique($allLinks);
@@ -99,14 +103,20 @@ function cleanUrl($url)
 
     $url = ltrim($url);
 
-    if (!(substr($url, 0, 4) === "http")) {
-        if (substr($url, 0, 3) === "www") $url = "http://" . $url;
-        else if (substr($url, 0, 1) === "/") $url = $currentUrl . $url;
-        else $url = $currentUrl . $url;
+    if (!(strpos($url, 'http') === 0)) {
+        if (strpos($url, 'www') === 0) {
+            $url = 'http://' . $url;
+        } else if (strpos($url, '/') === 0) {
+            $url = $currentUrl . $url;
+        } else {
+            $url = $currentUrl . $url;
+        }
     }
 
     // if it's pure domain without slash (prevents duplicate domains because of slash)
-    if (preg_match('/\w+\.\w{2,3}$/', $url)) $url = $url . "/";
+    if (preg_match('/\w+\.\w{2,3}$/', $url)) {
+        $url .= '/';
+    }
 
     // strip some things
     $url = preg_replace('/([^:])(\/{2,})/', '$1/', $url); // double slashes
@@ -128,10 +138,9 @@ function createPathFromHtml($content)
 function getFirstFromQueue()
 {
     $conn = initDbConnection();
-    $checkStmt = $conn->prepare('SELECT url FROM queue LIMIT 1');
-    $checkStmt->execute();
+    $checkStmt = $conn->query('SELECT url FROM queue LIMIT 1');
 
-    return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]["url"];
+    return $checkStmt->fetchAll(PDO::FETCH_ASSOC)[0]['url'];
 }
 
 function writeToQueue($urls)
@@ -165,9 +174,9 @@ function saveData($urlInfo)
 
     print $currentUrl . "\n";
 
-    $title = isset($urlInfo["title"]) ? $urlInfo["title"] : "";
-    $description = isset($urlInfo["description"]) ? $urlInfo["description"] : "";
-    $language = isset($urlInfo["language"]) ? $urlInfo["language"] : "en";
+    $title = $urlInfo['title'] ?? '';
+    $description = $urlInfo['description'] ?? '';
+    $language = $urlInfo['language'] ?? 'en';
     $hash = md5($currentUrl);
 
     try {
diff --git a/mysql_conf.inc b/mysql_conf.inc
index 86d2a8f..3353ac8 100644
--- a/mysql_conf.inc
+++ b/mysql_conf.inc
@@ -1,5 +1,5 @@
 <?php
-$servername = "127.0.0.1";
-$username = "root";
-$password = "root";
-$dbname = "search_engine";
\ No newline at end of file
+$servername = '127.0.0.1';
+$username = 'root';
+$password = 'root';
+$dbname = 'search_engine';
\ No newline at end of file
-- 
cgit v1.2.3