URLHeadBear.py: Use robots.txt
Requests that are not allowed by robots.txt
are reported.

Closes coala#1782
PrajwalM2212 committed Mar 16, 2019
1 parent fd5a5a7 commit be132e1
Showing 2 changed files with 40 additions and 13 deletions.
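
The heart of the change is a robots.txt gate in front of the HEAD requests, built on the standard library's urllib.robotparser. A minimal sketch of that check, using an illustrative URL that is not part of the diff:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # illustrative host
rp.read()  # fetch and parse robots.txt

if rp.can_fetch('*', 'https://example.com/some/page'):
    pass  # allowed: the bear may send its HEAD request
else:
    pass  # disallowed: the bear reports the link instead of requesting it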
10 changes: 9 additions & 1 deletion bears/general/InvalidLinkBear.py
@@ -42,7 +42,7 @@ def run(self, filename, file,
         :param follow_redirects: Set to true to autocorrect redirects.
         """
         for result in dependency_results.get(URLHeadBear.name, []):
-            line_number, link, code, context = result.contents
+            line_number, link, code, context, robots_allowed = result.contents
             if context is context.xml_namespace:
                 if code and 200 <= code < 300:
                     pass
@@ -54,6 +54,14 @@ def run(self, filename, file,
                         file=filename,
                         line=line_number,
                         severity=RESULT_SEVERITY.INFO)
+            elif not robots_allowed:
+                yield Result.from_values(
+                    origin=self,
+                    message=('robots.txt does not allow request to '
+                             '{url}').format(url=link),
+                    file=filename,
+                    line=line_number,
+                    severity=RESULT_SEVERITY.NORMAL)
             elif code is None:
                 yield Result.from_values(
                     origin=self,
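For context, the branch added above turns a disallowed link into a NORMAL-severity result whose message is built like this (the URL is made up for illustration):

# Roughly the message the new elif branch produces for a hypothetical link.
message = ('robots.txt does not allow request to '
           '{url}').format(url='https://example.com/private')
# -> 'robots.txt does not allow request to https://example.com/private'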
43 changes: 31 additions & 12 deletions bears/general/URLHeadBear.py
@@ -11,6 +11,7 @@
 from coalib.settings.Setting import typed_dict
 from coala_utils.decorators import (enforce_signature, generate_ordering,
                                     generate_repr)
+from urllib import robotparser


 @generate_repr(('id', hex),
@@ -41,7 +41,8 @@ class URLHeadResult(HiddenResult):
     def __init__(self, origin, affected_code,
                  link: str,
                  head_response: (requests.models.Response, Exception),
-                 link_context: LINK_CONTEXT):
+                 link_context: LINK_CONTEXT,
+                 robots_allowed: bool):

         http_status_code = (head_response.status_code if
                             isinstance(head_response,
@@ -52,11 +54,12 @@ def __init__(self, origin, affected_code,
                               affected_code)

         self.contents = [affected_code[0].start.line, link, http_status_code,
-                         link_context]
+                         link_context, robots_allowed]
         self.link = link
         self.http_status_code = http_status_code
         self.link_context = link_context
         self.head_response = head_response
+        self.robots_allowed = robots_allowed


 class URLHeadBear(LocalBear):
@@ -88,6 +91,13 @@ def get_head_response(url, timeout):
         except requests.exceptions.RequestException as exc:
             return exc

+    @staticmethod
+    def get_robots_file(host):
+        rp = robotparser.RobotFileParser()
+        rp.set_url('https://' + host + '/robots.txt')
+        rp.read()
+        return rp
+
     @deprecate_settings(network_timeout=('timeout', lambda t: {'*': t}))
     def run(self, filename, file, dependency_results=dict(),
             network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
@@ -117,15 +127,24 @@ def run(self, filename, file, dependency_results=dict(),
                            if not url == '*' else '*': timeout
                            for url, timeout in network_timeout.items()}

+        robots_dict = {}
+
         for result in dependency_results.get(URLBear.name, []):
             host = urlparse(result.link).netloc
-            head_resp = self.get_head_response(
-                result.link,
-                network_timeout.get(host)
-                if host in network_timeout
-                else network_timeout.get('*')
-                if '*' in network_timeout
-                else URLHeadBear.DEFAULT_TIMEOUT)
-
-            yield URLHeadResult(self, result.affected_code, result.link,
-                                head_resp, result.link_context)
+            if host not in robots_dict.keys():
+                robots_dict[host] = self.get_robots_file(host)
+            if robots_dict[host].can_fetch('*', result.link):
+                head_resp = self.get_head_response(
+                    result.link,
+                    network_timeout.get(host)
+                    if host in network_timeout
+                    else network_timeout.get('*')
+                    if '*' in network_timeout
+                    else URLHeadBear.DEFAULT_TIMEOUT)
+
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    head_resp, result.link_context, True)
+            else:
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    requests.models.Response(),
+                                    result.link_context, False)

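Taken together, the new run() loop fetches each host's robots.txt once per run and reuses it for every link on that host; disallowed links get a placeholder requests.models.Response() so downstream bears still receive a URLHeadResult with robots_allowed set to False. A compact sketch of that per-host caching pattern, with hypothetical names that are not part of the diff:

from urllib import robotparser
from urllib.parse import urlparse

robots_cache = {}  # hypothetical stand-in for the diff's robots_dict

def is_allowed(url):
    # Fetch and parse robots.txt only for the first link seen on a host,
    # then reuse the parsed file for every later link on that host.
    host = urlparse(url).netloc
    if host not in robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url('https://' + host + '/robots.txt')
        rp.read()
        robots_cache[host] = rp
    return robots_cache[host].can_fetch('*', url)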