From fd68d6b24a48368877714e0ae76edfe432dc3dbb Mon Sep 17 00:00:00 2001
From: IGARASHI Masanao
Date: Wed, 16 Sep 2015 00:25:15 +0900
Subject: [PATCH 1/2] Refactor checking robots_exclusions

---
 nikola/plugins/task/sitemap/__init__.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index a76de139ec..641b41d1c9 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -142,6 +142,7 @@ def gen_tasks(self):
 
         def scan_locs():
             """Scan site locations."""
+            robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
             for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
@@ -174,8 +175,12 @@ def scan_locs():
                         if path.endswith(kw['index_file']) and kw['strip_indexes']:
                             # ignore index files when stripping urls
                             continue
-                        if not robot_fetch(path):
-                            continue
+                        if robots_rules:
+                            abspath = '/' + path
+                            if sys.version_info[0] == 2:
+                                abspath = abspath.encode('utf-8')
+                            if not robots_rules.can_fetch('*', abspath):
+                                continue
 
                         # read in binary mode to make ancient files work
                         fh = open(real_path, 'rb')
@@ -223,18 +228,19 @@ def scan_locs():
                                 alternates.append(alternates_format.format(lang, alt_url))
                         urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))
 
-        def robot_fetch(path):
-            """Check if robots can fetch a file."""
-            for rule in kw["robots_exclusions"]:
+        def parse_robots_exclusions(exclusions):
+            """Parse rules to check fetchable."""
+            rules = []
+            for rule in exclusions:
+                rules.append('Disallow: {0}'.format(rule))
+            if len(rules):
                 robot = robotparser.RobotFileParser()
-                robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
-                if sys.version_info[0] == 3:
-                    if not robot.can_fetch("*", '/' + path):
-                        return False  # not robot food
-                else:
-                    if not robot.can_fetch("*", ('/' + path).encode('utf-8')):
-                        return False  # not robot food
-            return True
+                rules = ['User-Agent: *'] + rules
+                if sys.version_info[0] == 2:
+                    rules = [ line.encode('utf-8') for line in rules ]
+                robot.parse(rules)
+                return robot
+            return None
 
         def write_sitemap():
             """Write sitemap to file."""

From cac59c73d27063836b4550e772acf2ea9605a043 Mon Sep 17 00:00:00 2001
From: IGARASHI Masanao
Date: Wed, 16 Sep 2015 06:00:00 +0900
Subject: [PATCH 2/2] Read the robots.txt while creating sitemaps

If the robots.txt file already exists, read it and ignore ROBOTS_EXCLUSIONS.
---
 nikola/plugins/task/sitemap/__init__.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 641b41d1c9..8e80ae8d71 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -40,7 +40,7 @@
     import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import apply_filters, config_changed, encodelink
+from nikola.utils import apply_filters, config_changed, encodelink, get_asset_path
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
 <urlset
@@ -118,6 +118,7 @@ def gen_tasks(self):
             "base_url": self.site.config["BASE_URL"],
             "site_url": self.site.config["SITE_URL"],
             "output_folder": self.site.config["OUTPUT_FOLDER"],
+            "files_folders": self.site.config['FILES_FOLDERS'],
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
@@ -140,9 +141,8 @@ def gen_tasks(self):
         sitemapindex = {}
         urlset = {}
 
-        def scan_locs():
+        def scan_locs(robots_rules):
             """Scan site locations."""
-            robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
             for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
@@ -228,6 +228,16 @@ def scan_locs():
                                 alternates.append(alternates_format.format(lang, alt_url))
                         urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))
 
+        def parse_robotstxt(path):
+            robot = robotparser.RobotFileParser()
+            fh = io.open(path, 'r', encoding='utf-8-sig')
+            rules = fh.readlines()
+            if sys.version_info[0] == 2:
+                rules = [ line.encode('utf-8') for line in rules ]
+            fh.close()
+            robot.parse(rules)
+            return robot
+
         def parse_robots_exclusions(exclusions):
             """Parse rules to check fetchable."""
             rules = []
@@ -268,7 +278,12 @@ def scan_locs_task():
             Other tasks can depend on this output, instead of having
             to scan locations.
             """
-            scan_locs()
+            robotstxt = get_asset_path("robots.txt", [], files_folders=kw["files_folders"], output_dir=False)
+            if robotstxt:
+                robots_rules = parse_robotstxt(robotstxt)
+            else:
+                robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
+            scan_locs(robots_rules)
 
             # Generate a list of file dependencies for the actual generation
             # task, so rebuilds are triggered. (Issue #1032)
@@ -289,6 +304,9 @@ def scan_locs_task():
                 if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
                     file_dep.append(p + 'index.html')
 
+            if robotstxt:
+                file_dep.append(os.path.join(output, 'robots.txt'))
+
             return {'file_dep': file_dep}
 
         yield {
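
Taken together, the two patches reduce the robots handling to one control flow: build a single
RobotFileParser up front, either from the lines of an existing robots.txt (located with
get_asset_path) or from ROBOTS_EXCLUSIONS entries turned into Disallow rules, and then consult it
once per scanned path. The sketch below condenses that flow outside of Nikola as a minimal,
Python 3 only example; the helper names build_robot_rules and is_sitemap_candidate are
illustrative only and are not part of Nikola's API.

# Minimal, standalone sketch (Python 3) of the rule handling the patches introduce.
# build_robot_rules and is_sitemap_candidate are illustrative names, not Nikola API.
import urllib.robotparser as robotparser


def build_robot_rules(robotstxt_lines=None, exclusions=()):
    """Return a RobotFileParser, or None when nothing is excluded.

    robotstxt_lines -- lines read from an existing robots.txt (takes precedence)
    exclusions      -- ROBOTS_EXCLUSIONS-style path prefixes, e.g. ['/private/']
    """
    if robotstxt_lines:
        # robots.txt already exists: parse it as-is and ignore the exclusions.
        robot = robotparser.RobotFileParser()
        robot.parse(robotstxt_lines)
        return robot
    rules = ['Disallow: {0}'.format(rule) for rule in exclusions]
    if rules:
        # No robots.txt: synthesize one wildcard record from the exclusions.
        robot = robotparser.RobotFileParser()
        robot.parse(['User-Agent: *'] + rules)
        return robot
    return None


def is_sitemap_candidate(robot, path):
    """True when 'path' (relative to the output root) may appear in the sitemap."""
    return robot is None or robot.can_fetch('*', '/' + path)


if __name__ == '__main__':
    robot = build_robot_rules(exclusions=['/private/'])
    print(is_sitemap_candidate(robot, 'private/page.html'))  # False
    print(is_sitemap_candidate(robot, 'blog/index.html'))    # True

Parsing the rules once, instead of re-parsing them for every path as the old robot_fetch did, is
the point of the first patch; the second patch only changes where the rule lines come from.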