From 62da410e9d99c7065046c4a7a589ae3d77ac408a Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:20:11 +0200
Subject: [PATCH] middlewares: add support for crawling only once

* Adds: extends the `scrapy-crawl-once` plug-in to support custom
  data fields in the DB per spider.

* Adds: enables the `scrapy-crawl-once` plug-in.

Closes #161

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 121 ++++++++++++++++++++++++++++++++++++++++
 hepcrawl/settings.py    |   5 ++
 2 files changed, 126 insertions(+)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index 5554c502..faecf8e8 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,6 +11,17 @@
 
 from __future__ import absolute_import, division, print_function
 
+import os
+import time
+
+from ftplib import FTP
+from six.moves.urllib.parse import urlparse
+
+from scrapy.exceptions import IgnoreRequest
+from scrapy_crawl_once.middlewares import CrawlOnceMiddleware
+
+from hepcrawl.utils import ftp_connection_info
+
 
 class ErrorHandlingMiddleware(object):
     """Log errors."""
@@ -34,3 +45,113 @@ def process_exception(self, request, exception, spider):
             'exception': exception,
             'sender': request,
         })
+
+
+class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware):
+    """
+    Spider and downloader middleware that avoids re-crawling pages which
+    were already downloaded in previous crawls.
+
+    To enable it, modify ``settings.py``::
+
+        SPIDER_MIDDLEWARES = {
+            # ...
+            'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
+            # ...
+        }
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
+            # ...
+        }
+
+    By default it does nothing. To avoid crawling a particular page
+    multiple times, set ``request.meta['crawl_once'] = True``. Other
+    ``request.meta`` keys:
+
+    * ``crawl_once_value`` - the value to store in the DB. By default, a timestamp
+      is stored for HTTP/HTTPS requests and the last-modified time for FTP/file requests.
+    * ``crawl_once_key`` - the DB key; the file name (basename of the URL) is used.
+
+    Settings:
+
+    * ``CRAWL_ONCE_ENABLED`` - set it to False to disable the middleware.
+      Default is True.
+    * ``CRAWL_ONCE_PATH`` - a path to a folder with the crawled requests database.
+      By default ``.scrapy/crawl_once/`` is used; this folder contains
+      ``.sqlite`` files with databases of seen requests.
+    * ``CRAWL_ONCE_DEFAULT`` - default value for the ``crawl_once`` meta key
+      (False by default). When True, all requests are handled by
+      this middleware unless disabled explicitly using
+      ``request.meta['crawl_once'] = False``.
+ """ + + def __init__(self, *args, **kwargs): + super(HepcrawlCrawlOnceMiddleware, self).__init__(*args, **kwargs) + + # def process_spider_output(self, response, result, spider): + # for r in result: + # yield r + # + # if response.meta.get('crawl_once', self.default): + # key = self._get_key(response.request) + # self.db[key] = response.meta.get('crawl_once_value') + # self.stats.inc_value('crawl_once/stored') + + @staticmethod + def _is_newer(this_timestamp, than_this_timestamp, scheme): + if scheme in ['ftp', 'file']: + return this_timestamp > than_this_timestamp + + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) + + if request_db_key not in self.db: + return True + + new_request_timestamp = self._get_timestamp(request, spider) + parsed_url = urlparse(request.url) + if self._is_newer( + new_request_timestamp, + self.db.get(key=request_db_key), + scheme=parsed_url.scheme, + ): + return True + + return False + + def process_request(self, request, spider): + request.meta['crawl_once_key'] = os.path.basename(request.url) + request.meta['crawl_once_value'] = self._get_timestamp(request, spider) + + if not request.meta.get('crawl_once', self.default): + return + + if not self._has_to_be_crawled(request, spider): + self.stats.inc_value('crawl_once/ignored') + raise IgnoreRequest() + + @staticmethod + def _get_timestamp(request, spider): + parsed_url = urlparse(request.url) + full_url = request.url + if parsed_url.scheme == 'ftp': + ftp_host, params = ftp_connection_info(spider.ftp_host, spider.ftp_netrc) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + file_path = full_url.replace( + '{0}://{1}/'.format(parsed_url.scheme, ftp_host), + '', + ) + last_modified = ftp.sendcmd('MDTM {}'.format(file_path)) + elif parsed_url.scheme == 'file': + file_path = full_url.replace('file://', '') + last_modified = os.stat(file_path).st_mtime + else: + last_modified = time.time() + + return last_modified diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 71dcfc75..044520bc 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -62,14 +62,19 @@ # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +CRAWL_ONCE_ENABLED = True +CRAWL_ONCE_DEFAULT = True + # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html EXTENSIONS = {