URLHeadBear.py: Use robots.txt
Requests that are not allowed by robots.txt
are reported.

Closes coala#1782
PrajwalM2212 committed Mar 16, 2019
1 parent fd5a5a7 commit be132e1
Showing 2 changed files with 40 additions and 13 deletions.
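
The heart of the change is a robots.txt gate in front of the HEAD requests, built on the standard library's urllib.robotparser. A minimal sketch of that check, using an illustrative URL that is not part of the diff:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # illustrative host
rp.read()  # fetch and parse robots.txt

if rp.can_fetch('*', 'https://example.com/some/page'):
    pass  # allowed: the bear may send its HEAD request
else:
    pass  # disallowed: the bear reports the link instead of requesting it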
10 changes: 9 additions & 1 deletion bears/general/InvalidLinkBear.py
@@ -42,7 +42,7 @@ def run(self, filename, file,
         :param follow_redirects: Set to true to autocorrect redirects.
         """
         for result in dependency_results.get(URLHeadBear.name, []):
-            line_number, link, code, context = result.contents
+            line_number, link, code, context, robots_allowed = result.contents
             if context is context.xml_namespace:
                 if code and 200 <= code < 300:
                     pass
@@ -54,6 +54,14 @@ def run(self, filename, file,
                         file=filename,
                         line=line_number,
                         severity=RESULT_SEVERITY.INFO)
+            elif not robots_allowed:
+                yield Result.from_values(
+                    origin=self,
+                    message=('robots.txt does not allow request to '
+                             '{url}').format(url=link),
+                    file=filename,
+                    line=line_number,
+                    severity=RESULT_SEVERITY.NORMAL)
             elif code is None:
                 yield Result.from_values(
                     origin=self,
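For context, the branch added above turns a disallowed link into a NORMAL-severity result whose message is built like this (the URL is made up for illustration):

# Roughly the message the new elif branch produces for a hypothetical link.
message = ('robots.txt does not allow request to '
           '{url}').format(url='https://example.com/private')
# -> 'robots.txt does not allow request to https://example.com/private'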
43 changes: 31 additions & 12 deletions bears/general/URLHeadBear.py
@@ -11,6 +11,7 @@
 from coalib.settings.Setting import typed_dict
 from coala_utils.decorators import (enforce_signature, generate_ordering,
                                     generate_repr)
+from urllib import robotparser


 @generate_repr(('id', hex),
@@ -41,7 +41,8 @@ class URLHeadResult(HiddenResult):
     def __init__(self, origin, affected_code,
                  link: str,
                  head_response: (requests.models.Response, Exception),
-                 link_context: LINK_CONTEXT):
+                 link_context: LINK_CONTEXT,
+                 robots_allowed: bool):

         http_status_code = (head_response.status_code if
                             isinstance(head_response,
@@ -52,11 +54,12 @@ def __init__(self, origin, affected_code,
                               affected_code)

         self.contents = [affected_code[0].start.line, link, http_status_code,
-                         link_context]
+                         link_context, robots_allowed]
         self.link = link
         self.http_status_code = http_status_code
         self.link_context = link_context
         self.head_response = head_response
+        self.robots_allowed = robots_allowed


 class URLHeadBear(LocalBear):
@@ -88,6 +91,13 @@ def get_head_response(url, timeout):
         except requests.exceptions.RequestException as exc:
             return exc

+    @staticmethod
+    def get_robots_file(host):
+        rp = robotparser.RobotFileParser()
+        rp.set_url('https://' + host + '/robots.txt')
+        rp.read()
+        return rp
+
     @deprecate_settings(network_timeout=('timeout', lambda t: {'*': t}))
     def run(self, filename, file, dependency_results=dict(),
             network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
@@ -117,15 +127,24 @@ def run(self, filename, file, dependency_results=dict(),
                            if not url == '*' else '*': timeout
                            for url, timeout in network_timeout.items()}

+        robots_dict = {}
+
         for result in dependency_results.get(URLBear.name, []):
             host = urlparse(result.link).netloc
-            head_resp = self.get_head_response(
-                result.link,
-                network_timeout.get(host)
-                if host in network_timeout
-                else network_timeout.get('*')
-                if '*' in network_timeout
-                else URLHeadBear.DEFAULT_TIMEOUT)
-
-            yield URLHeadResult(self, result.affected_code, result.link,
-                                head_resp, result.link_context)
+            if host not in robots_dict.keys():
+                robots_dict[host] = self.get_robots_file(host)
+            if robots_dict[host].can_fetch('*', result.link):
+                head_resp = self.get_head_response(
+                    result.link,
+                    network_timeout.get(host)
+                    if host in network_timeout
+                    else network_timeout.get('*')
+                    if '*' in network_timeout
+                    else URLHeadBear.DEFAULT_TIMEOUT)
+
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    head_resp, result.link_context, True)
+            else:
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    requests.models.Response(),
+                                    result.link_context, False)

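Taken together, the new run() loop fetches each host's robots.txt once per run and reuses it for every link on that host; disallowed links get a placeholder requests.models.Response() so downstream bears still receive a URLHeadResult with robots_allowed set to False. A compact sketch of that per-host caching pattern, with hypothetical names that are not part of the diff:

from urllib import robotparser
from urllib.parse import urlparse

robots_cache = {}  # hypothetical stand-in for the diff's robots_dict

def is_allowed(url):
    # Fetch and parse robots.txt only for the first link seen on a host,
    # then reuse the parsed file for every later link on that host.
    host = urlparse(url).netloc
    if host not in robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url('https://' + host + '/robots.txt')
        rp.read()
        robots_cache[host] = rp
    return robots_cache[host].can_fetch('*', url)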