From 85f4f3630df62169c0a301c33fc8226e0cb740aa Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:20:11 +0200
Subject: [PATCH] middlewares: add support for crawling only once

* Adds: extends `scrapy-crawl-once` plug-in for supporting custom
  data-fields in DB per spider.

* Adds: enables `scrapy-crawl-once` plug-in.

Closes #161

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 156 ++++++++++++++++++++++++++++++++++++++++
 hepcrawl/settings.py    |   5 ++
 2 files changed, 161 insertions(+)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index 5554c502..86a9998d 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,6 +11,17 @@
 
 from __future__ import absolute_import, division, print_function
 
+import os
+import time
+
+from ftplib import FTP
+from six.moves.urllib.parse import urlparse
+
+from scrapy.exceptions import IgnoreRequest
+from scrapy_crawl_once.middlewares import CrawlOnceMiddleware
+
+from hepcrawl.utils import ftp_connection_info
+
 
 class ErrorHandlingMiddleware(object):
     """Log errors."""
@@ -34,3 +45,148 @@ def process_exception(self, request, exception, spider):
             'exception': exception,
             'sender': request,
         })
+
+
+class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware):
+    """Avoid re-crawling resources that did not change since the last crawl.
+
+    This spider and downloader middleware extends the ``scrapy-crawl-once``
+    one: the DB key is the file name of the requested URL and FTP/file
+    requests are crawled again when their last-modified value is newer
+    than the one stored in the DB.
+
+    To enable it, modify ``settings.py``::
+
+        SPIDER_MIDDLEWARES = {
+            # ...
+            'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
+            # ...
+        }
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
+            # ...
+        }
+
+    By default it does nothing. To avoid crawling a particular page
+    multiple times set ``request.meta['crawl_once'] = True``. Other
+    ``request.meta`` keys, both set by this middleware:
+
+    * ``crawl_once_value`` - a value to store in DB. The current timestamp
+      is stored for HTTP/HTTPS requests and the last-modified value is
+      stored for FTP/file requests.
+    * ``crawl_once_key`` - the DB key. The file name of the URL is used.
+
+    Settings:
+
+    * ``CRAWL_ONCE_ENABLED`` - set it to False to disable middleware.
+      Default is True.
+    * ``CRAWL_ONCE_PATH`` - a path to a folder with crawled requests database.
+      By default ``.scrapy/crawl_once/`` path is used; this folder contains
+      ``.sqlite`` files with databases of seen requests.
+    * ``CRAWL_ONCE_DEFAULT`` - default value for ``crawl_once`` meta key
+      (False by default). When True, all requests are handled by
+      this middleware unless disabled explicitly using
+      ``request.meta['crawl_once'] = False``.
+    """
+
+    @staticmethod
+    def _is_newer(this_timestamp, than_this_timestamp, scheme):
+        """Tell if ``this_timestamp`` is newer than ``than_this_timestamp``.
+
+        Only ``ftp`` and ``file`` resources are compared; for any other
+        scheme the answer is always ``False``.
+        """
+        if scheme not in ('ftp', 'file'):
+            return False
+
+        return this_timestamp > than_this_timestamp
+
+    def _has_to_be_crawled(self, request, spider):
+        """Tell if the request is unseen or newer than its DB entry."""
+        request_db_key = self._get_key(request)
+
+        if request_db_key not in self.db:
+            return True
+
+        # Reuse the value computed by ``process_request`` (if any) so that
+        # the timestamp is not fetched twice, e.g. over a second FTP session.
+        new_request_timestamp = request.meta.get('crawl_once_value')
+        if new_request_timestamp is None:
+            new_request_timestamp = self._get_timestamp(request, spider)
+
+        return self._is_newer(
+            new_request_timestamp,
+            self.db.get(key=request_db_key),
+            scheme=urlparse(request.url).scheme,
+        )
+
+    def process_request(self, request, spider):
+        """Drop the request if it was crawled before and is not newer.
+
+        Sets ``crawl_once_key`` and ``crawl_once_value`` in ``request.meta``.
+
+        Raises:
+            IgnoreRequest: if the request does not have to be crawled again.
+        """
+        if not request.meta.get('crawl_once', self.default):
+            return
+
+        request.meta['crawl_once_key'] = os.path.basename(request.url)
+        request.meta['crawl_once_value'] = self._get_timestamp(request, spider)
+
+        if not self._has_to_be_crawled(request, spider):
+            self.stats.inc_value('crawl_once/ignored')
+            raise IgnoreRequest()
+
+    @staticmethod
+    def _get_timestamp(request, spider):
+        """Return the value to store in the DB for the given request.
+
+        This is the raw reply to ``MDTM`` for ``ftp`` URLs, the ``st_mtime``
+        of the file for ``file`` URLs and the current time otherwise.
+        """
+        def _get_ftp_relative_path(url, host):
+            return url.replace(
+                'ftp://{0}/'.format(host),
+                '',
+            )
+
+        def _get_ftp_timestamp(spider, url):
+            ftp_host, params = ftp_connection_info(
+                spider.ftp_host,
+                spider.ftp_netrc,
+            )
+            ftp = FTP(
+                host=ftp_host,
+                user=params['ftp_user'],
+                passwd=params['ftp_password'],
+            )
+            try:
+                return ftp.sendcmd(
+                    'MDTM {}'.format(
+                        _get_ftp_relative_path(
+                            url=url,
+                            host=ftp_host
+                        )
+                    )
+                )
+            finally:
+                # Do not leak one control connection per request.
+                ftp.close()
+
+        def _get_file_timestamp(url):
+            file_path = url.replace('file://', '')
+            return os.stat(file_path).st_mtime
+
+        parsed_url = urlparse(request.url)
+        full_url = request.url
+        if parsed_url.scheme == 'ftp':
+            last_modified = _get_ftp_timestamp(spider, full_url)
+        elif parsed_url.scheme == 'file':
+            last_modified = _get_file_timestamp(full_url)
+        else:
+            last_modified = time.time()
+
+        return last_modified
diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py
index 31d608bf..91f82829 100644
--- a/hepcrawl/settings.py
+++ b/hepcrawl/settings.py
@@ -62,14 +62,19 @@
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 SPIDER_MIDDLEWARES = {
     'hepcrawl.middlewares.ErrorHandlingMiddleware': 543,
+    'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
 }
 
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
     'hepcrawl.middlewares.ErrorHandlingMiddleware': 543,
+    'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
 }
 
+CRAWL_ONCE_ENABLED = True
+CRAWL_ONCE_DEFAULT = True
+
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 EXTENSIONS = {