From b97cd4b23b8d88fc24110f2101355e92b382b40e Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 08:30:06 +0200
Subject: [PATCH 01/26] middlewares: indentation fix

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index dab5c7e4..5554c502 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,8 +11,8 @@
 
 from __future__ import absolute_import, division, print_function
 
 
-class ErrorHandlingMiddleware(object):
+class ErrorHandlingMiddleware(object):
     """Log errors."""
 
     @classmethod

From 15ab37df2ba3284964a9b6c54fa84aa8da630fd6 Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:05:21 +0200
Subject: [PATCH 02/26] setup: add `scrapy-crawl-once` plug-in

Addresses #161

Signed-off-by: Spiros Delviniotis
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 4d1518a8..ca1e0337 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@
     'inspire-schemas~=46.0',
     'inspire-dojson~=46.0',
     'Scrapy>=1.1.0',
+    'scrapy-crawl-once>=0.1.1',
     # TODO: unpin once they support wheel building again
     'scrapyd==1.1.0',
     'scrapyd-client>=1.0.1',

From 7694fe37fa7a9cd3dae2b5905aaeaf179c7c72be Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:20:11 +0200
Subject: [PATCH 03/26] middlewares: add support for crawling only once

* Adds: extends `scrapy-crawl-once` plug-in for supporting custom
  data-fields in DB per spider.
* Adds: enables `scrapy-crawl-once` plug-in.

Closes #161

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 124 ++++++++++++++++++++++++++++++++++++++++
 hepcrawl/settings.py    |   5 ++
 2 files changed, 129 insertions(+)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index 5554c502..86a9998d 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,6 +11,17 @@
 
 from __future__ import absolute_import, division, print_function
 
+import os
+import time
+
+from ftplib import FTP
+from six.moves.urllib.parse import urlparse
+
+from scrapy.exceptions import IgnoreRequest
+from scrapy_crawl_once.middlewares import CrawlOnceMiddleware
+
+from hepcrawl.utils import ftp_connection_info
+
 
 class ErrorHandlingMiddleware(object):
     """Log errors."""
 
     @classmethod
@@ -34,3 +45,116 @@ def process_exception(self, request, exception, spider):
             'exception': exception,
             'sender': request,
         })
+
+
+class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware):
+    """
+    This spider and downloader middleware allows to avoid re-crawling pages
+    which were already downloaded in previous crawls.
+
+    To enable it, modify ``settings.py``::
+
+        SPIDER_MIDDLEWARES = {
+            # ...
+            'scrapy_crawl_once.CrawlOnceMiddleware': 100,
+            # ...
+        }
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'scrapy_crawl_once.CrawlOnceMiddleware': 50,
+            # ...
+        }
+
+    By default it does nothing. To avoid crawling a particular page
+    multiple times set ``request.meta['crawl_once'] = True``. Other
+    ``request.meta`` keys:
+
+    * ``crawl_once_value`` - a value to store in DB. By default, timestamp
+      is stored for Http/Https requests and last-modified is stored for FTP/File requests.
+    * ``crawl_once_key`` - unique file name is used.
+
+    Settings:
+
+    * ``CRAWL_ONCE_ENABLED`` - set it to False to disable middleware.
+      Default is True.
+    * ``CRAWL_ONCE_PATH`` - a path to a folder with crawled requests database.
+ By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT`` - default value for ``crawl_once`` meta key + (False by default). When True, all requests are handled by + this middleware unless disabled explicitly using + ``request.meta['crawl_once'] = False``. + """ + + @staticmethod + def _is_newer(this_timestamp, than_this_timestamp, scheme): + if scheme in ['ftp', 'file']: + return this_timestamp > than_this_timestamp + + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) + + if request_db_key not in self.db: + return True + + new_request_timestamp = self._get_timestamp(request, spider) + parsed_url = urlparse(request.url) + if self._is_newer( + new_request_timestamp, + self.db.get(key=request_db_key), + scheme=parsed_url.scheme, + ): + return True + + return False + + def process_request(self, request, spider): + if not request.meta.get('crawl_once', self.default): + return + + request.meta['crawl_once_key'] = os.path.basename(request.url) + request.meta['crawl_once_value'] = self._get_timestamp(request, spider) + + if not self._has_to_be_crawled(request, spider): + self.stats.inc_value('crawl_once/ignored') + raise IgnoreRequest() + + @staticmethod + def _get_timestamp(request, spider): + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + def _get_ftp_timestamp(spider, url): + ftp_host, params = ftp_connection_info(spider.ftp_host, spider.ftp_netrc) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + _get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime + + parsed_url = urlparse(request.url) + full_url = request.url + if parsed_url.scheme == 'ftp': + last_modified = _get_ftp_timestamp(spider, full_url) + elif parsed_url.scheme == 'file': + last_modified = _get_file_timestamp(full_url) + else: + last_modified = time.time() + + return last_modified diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 31d608bf..91f82829 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -62,14 +62,19 @@ # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +CRAWL_ONCE_ENABLED = True +CRAWL_ONCE_DEFAULT = True + # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html EXTENSIONS = { From a283b21ac3616911c86a04d30eafb98b10ffe49b Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 17 Aug 2017 17:10:46 +0200 Subject: [PATCH 04/26] tests: support for `scrapy-crawl-once` Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/functional/arxiv/test_arxiv.py | 4 ++++ tests/functional/wsp/test_wsp.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 0f58b17d..2ef86b87 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py 
@@ -11,6 +11,7 @@ from __future__ import absolute_import, division, print_function +import os from time import sleep import pytest @@ -20,6 +21,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) @@ -51,6 +53,8 @@ def set_up_local_environment(): } } + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + @pytest.mark.parametrize( 'expected_results', diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 42f691c9..ffa4c4c9 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -56,6 +56,7 @@ def set_up_ftp_environment(): } clean_dir(path='/tmp/WSP/') + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @pytest.fixture(scope="function") @@ -81,6 +82,7 @@ def set_up_local_environment(): def remove_generated_files(package_location): clean_dir(path='/tmp/WSP/') + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) for dir_name in dirs: From 1301d255530a1d1ec3beb6c3f81274a5e4ebd265 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 17 Aug 2017 17:33:04 +0200 Subject: [PATCH 05/26] tests: add tests for `scrapy-clawl-once` * Adds: tests about `scrapy-clawl-once` for FTP and FILE. Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/functional/arxiv/test_arxiv.py | 49 ++++++++++++++ tests/functional/wsp/test_wsp.py | 98 +++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 2ef86b87..ce01eecb 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -88,3 +88,52 @@ def test_arxiv(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'arxiv', + 'fixtures', + 'arxiv_smoke_record.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_arxiv_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=1, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index ffa4c4c9..d034e7aa 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -44,7 +44,7 @@ def set_up_ftp_environment(): ) # The test must wait until the docker environment is up (takes about 10 seconds). 
- sleep(10) + sleep(7) yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', @@ -126,6 +126,54 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): assert gotten_results == expected_results +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] + + @pytest.mark.parametrize( 'expected_results', [ @@ -158,3 +206,51 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] From 98a7796a8d0985cab16c45712e7b68c986e24f77 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:15:14 +0200 Subject: [PATCH 06/26] tests: delete unused function for `arxiv` tests Signed-off-by: Spiros Delviniotis --- tests/unit/test_arxiv_all.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 1f4155c9..45689def 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -44,6 +44,7 @@ def _get_processed_record(item, spider): ) ) + assert records pipeline = InspireCeleryPushPipeline() 
pipeline.open_spider(spider) From a7588190af45090a2e9dd5c3bee06e4bf0c19b21 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:32:51 +0200 Subject: [PATCH 07/26] wsp: add temporary folder to the crawlings * Adds: create temporary folder to unzip the crawled files and in `InspireCeleryPushPipeline.close_spider` methods deletes this temporary folder. Addresses #161 Signed-off-by: Spiros Delviniotis --- hepcrawl/pipelines.py | 5 +++++ hepcrawl/spiders/wsp_spider.py | 13 +++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 8cd31c0e..4ff2b3c9 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function import os +import shutil import requests @@ -175,6 +176,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 058e6cc0..bc569bb3 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -78,6 +78,7 @@ def __init__( ftp_folder="/WSP", ftp_host=None, ftp_netrc=None, + tmp_dir=None, *args, **kwargs ): @@ -86,10 +87,8 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" + self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) def start_requests(self): """List selected folder on remote FTP and yield new zip files.""" @@ -116,9 +115,10 @@ def start_requests(self): # Cast to byte-string for scrapy compatibility remote_file = str(remote_file) ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, + self.tmp_dir, os.path.basename(remote_file) ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) yield Request( str(remote_url), @@ -132,6 +132,7 @@ def handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -142,8 +143,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), From 28a2603cc0a32c4e9aa5a077640fe01f18a8fd68 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:38:29 +0200 Subject: [PATCH 08/26] testlib: refactored `clean_dir` default arguments * Adds: new default argument to `clean_dir` method is the generated DB folder from `scrapy-crawl-once` plugin. 
Addresses #161 Signed-off-by: Spiros Delviniotis --- hepcrawl/testlib/fixtures.py | 6 +++--- tests/functional/arxiv/test_arxiv.py | 3 +-- tests/functional/wsp/test_wsp.py | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 73f28f96..c227d109 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -133,13 +133,13 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data - -def clean_dir(path): +def clean_dir(path=os.path.join(os.getcwd(), '.scrapy')): """ Deletes all contained files of given target directory path. Args: - path: Absolute path of target directory to be cleaned. + path(str): path of directory to be deleted. Default path is the produced DB per spider that + stores the requested urls. Example: diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index ce01eecb..22025020 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -11,7 +11,6 @@ from __future__ import absolute_import, division, print_function -import os from time import sleep import pytest @@ -53,7 +52,7 @@ def set_up_local_environment(): } } - clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + clean_dir() @pytest.mark.parametrize( diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index d034e7aa..27f36955 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -44,7 +44,7 @@ def set_up_ftp_environment(): ) # The test must wait until the docker environment is up (takes about 10 seconds). - sleep(7) + sleep(10) yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', @@ -55,7 +55,7 @@ def set_up_ftp_environment(): } } - clean_dir(path='/tmp/WSP/') + clean_dir() clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @@ -81,7 +81,7 @@ def set_up_local_environment(): def remove_generated_files(package_location): - clean_dir(path='/tmp/WSP/') + clean_dir() clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) From 026bdfb985d1d5e29cf54612eefabc5a05d8ed00 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:41:16 +0200 Subject: [PATCH 09/26] tests: add `clean_dir` to tests that using the DB Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/unit/test_alpha.py | 6 ++++-- tests/unit/test_aps.py | 5 ++++- tests/unit/test_arxiv_all.py | 8 ++++++-- tests/unit/test_arxiv_single.py | 8 ++++++-- tests/unit/test_pipelines.py | 9 +++++++-- tests/unit/test_pos.py | 9 +++++++-- tests/unit/test_world_scientific.py | 9 +++++++-- 7 files changed, 41 insertions(+), 13 deletions(-) diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index 96bf9af1..d5ff6331 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -13,8 +13,10 @@ from hepcrawl.spiders import alpha_spider -from hepcrawl.testlib.fixtures import fake_response_from_file - +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture def results(): diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index 3bb3698c..e8e64962 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -12,7 +12,10 @@ import pytest from hepcrawl.spiders import aps_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture diff --git a/tests/unit/test_arxiv_all.py 
b/tests/unit/test_arxiv_all.py index 45689def..63ce0822 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -16,7 +16,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -48,8 +51,9 @@ def _get_processed_record(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_page_nr(many_results): """Test extracting page_nr""" diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index 329a2a49..709c6d9a 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -17,7 +17,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,8 +47,9 @@ def _get_processed_item(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_abstracts(results): diff --git a/tests/unit/test_pipelines.py b/tests/unit/test_pipelines.py index 050df092..08b81319 100644 --- a/tests/unit/test_pipelines.py +++ b/tests/unit/test_pipelines.py @@ -21,7 +21,10 @@ from hepcrawl.spiders import arxiv_spider from hepcrawl.pipelines import InspireAPIPushPipeline -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,7 +47,9 @@ def json_spider_record(tmpdir): ) parsed_record = items.next() assert parsed_record - return spider, parsed_record + yield spider, parsed_record + + clean_dir() @pytest.fixture diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..0eccd4fc 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -19,7 +19,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import pos_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -55,7 +58,9 @@ def record(scrape_pos_page_body): parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record - return parsed_record + yield parsed_record + + clean_dir() def test_titles(record): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 291d00d0..38adc79b 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -18,7 +18,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import wsp_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) def create_spider(): @@ -44,7 +47,9 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return (pipeline.process_item(record, spider) for 
record in records) + yield (pipeline.process_item(record, spider) for record in records) + + clean_dir() def get_one_record(response_file_name): From e7ff520eb94e249f122e3c118393d6abd7f71c0d Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:44:14 +0200 Subject: [PATCH 10/26] wsp: minor fix * Adds: makes staticmethod `_get_collections` class method. * Adds: minor indentation fix. * Removes: unused import. Signed-off-by: Spiros Delviniotis --- hepcrawl/spiders/wsp_spider.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index bc569bb3..f638bd03 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -13,6 +13,7 @@ import os import urlparse +import tempfile from scrapy import Request from scrapy.spiders import XMLFeedSpider @@ -75,7 +76,7 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, package_path=None, - ftp_folder="/WSP", + ftp_folder="WSP", ftp_host=None, ftp_netrc=None, tmp_dir=None, @@ -163,7 +164,10 @@ def parse_node(self, response, node): record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', 'addendum']: - record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href") + record.add_xpath( + 'related_article_doi', + "//related-article[@ext-link-type='doi']/@href", + ) record.add_value('journal_doctype', article_type) dois = node.xpath("//article-id[@pub-id-type='doi']/text()").extract() @@ -221,10 +225,16 @@ def parse_node(self, response, node): return parsed_item - def _get_collections(self, node, article_type, current_journal_title): + @staticmethod + def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() - if conference or current_journal_title == "International Journal of Modern Physics: Conference Series": + if ( + conference + or current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) + ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": return ['HEP', 'Review'] From b509c82b7e8c87003a294170caa229eefad5b919 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 15:09:04 +0200 Subject: [PATCH 11/26] pipelines: minor fix * Adds: make `InspireAPIPushPipeline._cleanup` staticmethod. Signed-off-by: Spiros Delviniotis --- hepcrawl/pipelines.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 4ff2b3c9..7a51e4cb 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -125,7 +125,8 @@ def _prepare_payload(self, spider): ] return payload - def _cleanup(self, spider): + @staticmethod + def _cleanup(spider): """Run cleanup.""" # Cleanup errors if 'errors' in spider.state: From a82a158ffa8dbfd4be7e43d9e981bd5890e78596 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 16:01:24 +0200 Subject: [PATCH 12/26] global: add `CRAWL_ONCE_PATH` to settings * Adds: variable in `settings.py` to specify where to store `scrapy-crawl-once` DB. 
Addresses #161 Signed-off-by: Spiros Delviniotis --- docker-compose.test.yml | 2 ++ hepcrawl/settings.py | 4 ++++ hepcrawl/testlib/fixtures.py | 4 +++- tests/Dockerfile.hepcrawl_base | 4 ++-- tests/docker_entrypoint.sh | 3 +++ tests/fix_rights | Bin 8992 -> 9032 bytes tests/fix_rights.c | 8 ++++++-- 7 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d14d7c87..d9b69c75 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_CRAWL_ONCE_PATH=/var/lib/scrapy/crawl_once/ - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -28,6 +29,7 @@ services: - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP - /tmp/file_urls:/tmp/file_urls + - ${PWD}/.scrapy/crawl_once:/var/lib/scrapy/crawl_once functional_wsp: <<: *service_base diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 91f82829..216f7694 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -74,6 +74,10 @@ CRAWL_ONCE_ENABLED = True CRAWL_ONCE_DEFAULT = True +CRAWL_ONCE_PATH = os.environ.get( + 'APP_CRAWL_ONCE_PATH', + '/var/lib/scrapy/crawl_once/', +) # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index c227d109..b78c1a31 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -15,6 +15,7 @@ from scrapy.http import Request, TextResponse from scrapy.selector import Selector +from hepcrawl.settings import CRAWL_ONCE_PATH def fake_response_from_file(file_name, test_suite='unit', url='http://www.example.com', response_type=TextResponse): @@ -133,7 +134,8 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data -def clean_dir(path=os.path.join(os.getcwd(), '.scrapy')): + +def clean_dir(path=CRAWL_ONCE_PATH): """ Deletes all contained files of given target directory path. diff --git a/tests/Dockerfile.hepcrawl_base b/tests/Dockerfile.hepcrawl_base index eb91b69f..8db9f43e 100644 --- a/tests/Dockerfile.hepcrawl_base +++ b/tests/Dockerfile.hepcrawl_base @@ -26,10 +26,10 @@ RUN yum install -y epel-release && \ python-virtualenv && \ yum clean all -RUN mkdir /code /hepcrawl_venv +RUN mkdir /code /hepcrawl_venv /var/lib/scrapy RUN useradd test -RUN chown -R test:test /code /hepcrawl_venv +RUN chown -R test:test /code /hepcrawl_venv /var/lib/scrapy ADD ./docker_entrypoint.sh /docker_entrypoint.sh ADD ./fix_rights /fix_rights diff --git a/tests/docker_entrypoint.sh b/tests/docker_entrypoint.sh index 1762a421..4785699d 100755 --- a/tests/docker_entrypoint.sh +++ b/tests/docker_entrypoint.sh @@ -23,6 +23,8 @@ restore_venv_tmp_code_rights() { /fix_rights --codedir "$BASE_USER_UID:$BASE_USER_GID" echo "Restoring permissions of tmpdir to $BASE_USER_UID:$BASE_USER_GID" /fix_rights --tmpdir "$BASE_USER_UID:$BASE_USER_GID" + echo "Restoring permissions of vardir to $BASE_USER_UID:$BASE_USER_GID" + /fix_rights --vardir "$BASE_USER_UID:$BASE_USER_GID" else echo "No BASE_USER_UID env var defined, skipping venv, codedir, tmpdir permission" \ "restore." 
@@ -57,6 +59,7 @@ main() { /fix_rights --virtualenv 'test:test' /fix_rights --codedir 'test:test' /fix_rights --tmpdir 'test:test' + /fix_rights --vardir 'test:test' trap restore_venv_tmp_code_rights EXIT if ! [[ -f "$VENV_PATH/bin/activate" ]]; then diff --git a/tests/fix_rights b/tests/fix_rights index 98677b2c948eebb928e2439ed495a81515b06b8a..ecf219b08f9c9e8e4e7d5887f56060738ae56f33 100755 GIT binary patch delta 1261 zcmZ9Me`r%z6vyv<&zHQH)c6vczNBU3$LOy>l2(LJ=WL#~Q!@94DlXH&%IWz*`;lu@(3SyyU;niu*LQfL=vZiTKS%X;W$f=X379Hz znzWhH7AvHtNe<-@I$m7j55=+I=4+BagEN3qPWt+n@*@%y~8u+A1n$=iI|HOjv>H$vneKERCxgF^}85_U;+p?3ACXn&&1+t)wX)7R^*90*Y= zrc+e(?oADJtNZrsJ+Ahp`uAB7!WHZ*bx;cjPwT3tjexxcwiWCTP{HS5M>aI=2H5)> z8x7`O5(MmUct7i61Zrwwu^*aEX~mP2lQ+XLg+gEG^s;w2H}& zS$48Sv_;<0raoQZKve^ z9fQ4GdF&<&kg(&q=HzD#OZ1Jcg8QQHfozGw&8-ea&0!IiCA2gC{2~Vpr@W6F|H#Ez6iXHC%#6Q8~lxUgq-v@3bUe)K?Mu` Oh>(|xyRQg-+4BrIo@@93 delta 1112 zcmZ8gU1$?o6ux(2l9{O{lbCTfhTZ%?t&KINFB@DHlQk{3(n7`7zC>(AO6!9b|0=R9 zE3IOvttrO`L0D1^>vkWy!4bC0vNfU@eX;(6=!=R%{6YLf6szO8lejKDaL>8teBV7k zcjjJd%r<5g)6UV_Q}UopOc|6%??h-PNc4y+C6!|-DQ<33Fi!7!#%qE#$%7~Uec$uf z@w%-&`|74QC-q}O*wT{~MPa&4mrG(+x1Q9w@v9=&9#r% zz%!AZ`)Dcs2_xcOm@6%WtW*mfQhof#t}TJ>eY<+MZwplKjPRSIKFf+IS}U%b9}MFs zv~4qn(TsL27R~YaC&RdaHfexcUL=qi_(6^ctsh{kyeRZ5k`M6#5)S2{ux6gkGC=3r zMUuqO7F^;dI4w6?qnJa`X_eCgO4VUd9C4b{*E#HhdlIjM%cY8)7WEy<1f@9$x;2_d zty)|p1?$kupjP#t-4Ceniwv7RBQymsJKZtz zdNA`#Rr5|@Tp?bU+H{KrK}U8Aiyl6H- z#tHm^q_400zCR{s@K5KGk@QgDBBhJraWICDEf`vjx(oGs7z?eIepgHy5#eztCg-)X LcT#{SzVAK*PI@vR diff --git a/tests/fix_rights.c b/tests/fix_rights.c index 8eaa9911..2dead30c 100644 --- a/tests/fix_rights.c +++ b/tests/fix_rights.c @@ -11,6 +11,7 @@ char *VENV_PATH = "/hepcrawl_venv/"; char *CODE_PATH = "/code/"; char *TMP_PATH = "/tmp/"; +char *VAR_PATH = "/var/"; int main (int argc, char *argv[]) { @@ -30,7 +31,7 @@ int main (int argc, char *argv[]) { if (argc != 3) { fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir|--var :\n", argv[0] ); exit(EXIT_FAILURE); @@ -48,11 +49,14 @@ int main (int argc, char *argv[]) { } else if (strcmp(argv[1], "--tmpdir") == 0) { // tmp dir permissions chown_argv[3] = TMP_PATH; + } else if (strcmp(argv[1], "--vardir") == 0) { + // var dir permissions + chown_argv[3] = VAR_PATH; } else { fprintf(stderr, "Bad option %s.", argv[1]); fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir|--vardir :\n", argv[0] ); exit(EXIT_FAILURE); From 5a358e179f387a94cd34d8eb26fea311fd487d66 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 12:55:07 +0200 Subject: [PATCH 13/26] middlewares: refactor crawl once middleware Signed-off-by: David Caro --- hepcrawl/middlewares.py | 125 +++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 66 deletions(-) diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 86a9998d..1044ba0d 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -24,8 +24,6 @@ class ErrorHandlingMiddleware(object): - """Log errors.""" - @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) @@ -35,13 +33,11 @@ def __init__(self, settings): def process_spider_exception(self, response, exception, spider): """Register the error in the spider and continue.""" - self.process_exception(response, exception, spider) + return self.process_exception(response, exception, spider) def process_exception(self, request, exception, spider): """Register the error in the spider and continue.""" - if 
'errors' not in spider.state: - spider.state['errors'] = [] - spider.state['errors'].append({ + spider.state.setdefault('errors', []).append({ 'exception': exception, 'sender': request, }) @@ -68,47 +64,28 @@ class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): By default it does nothing. To avoid crawling a particular page multiple times set ``request.meta['crawl_once'] = True``. Other - ``request.meta`` keys: + ``request.meta`` keys that modify it's behavior: * ``crawl_once_value`` - a value to store in DB. By default, timestamp - is stored for Http/Https requests and last-modified is stored for FTP/File requests. + is stored for Http/Https requests and last-modified is stored for + FTP/File requests. * ``crawl_once_key`` - unique file name is used. Settings: - * ``CRAWL_ONCE_ENABLED`` - set it to False to disable middleware. - Default is True. - * ``CRAWL_ONCE_PATH`` - a path to a folder with crawled requests database. - By default ``.scrapy/crawl_once/`` path is used; this folder contains - ``.sqlite`` files with databases of seen requests. - * ``CRAWL_ONCE_DEFAULT`` - default value for ``crawl_once`` meta key - (False by default). When True, all requests are handled by - this middleware unless disabled explicitly using - ``request.meta['crawl_once'] = False``. - """ - - @staticmethod - def _is_newer(this_timestamp, than_this_timestamp, scheme): - if scheme in ['ftp', 'file']: - return this_timestamp > than_this_timestamp - - def _has_to_be_crawled(self, request, spider): - request_db_key = self._get_key(request) - - if request_db_key not in self.db: - return True - - new_request_timestamp = self._get_timestamp(request, spider) - parsed_url = urlparse(request.url) - if self._is_newer( - new_request_timestamp, - self.db.get(key=request_db_key), - scheme=parsed_url.scheme, - ): - return True + * ``CRAWL_ONCE_ENABLED``:set it to False to disable middleware. Default + is True. + * ``CRAWL_ONCE_PATH``: a path to a folder with crawled requests database. + By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT``: default value for ``crawl_once`` meta key (False + by default). When True, all requests are handled by this middleware + unless disabled explicitly using + ``request.meta['crawl_once'] = False``. 
- return False + For more info see: https://github.com/TeamHG-Memex/scrapy-crawl-once + """ def process_request(self, request, spider): if not request.meta.get('crawl_once', self.default): return @@ -120,41 +97,57 @@ def process_request(self, request, spider): self.stats.inc_value('crawl_once/ignored') raise IgnoreRequest() - @staticmethod - def _get_timestamp(request, spider): - def _get_ftp_relative_path(url, host): - return url.replace( - 'ftp://{0}/'.format(host), - '', - ) + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) - def _get_ftp_timestamp(spider, url): - ftp_host, params = ftp_connection_info(spider.ftp_host, spider.ftp_netrc) - ftp = FTP( - host=ftp_host, - user=params['ftp_user'], - passwd=params['ftp_password'], - ) - return ftp.sendcmd( - 'MDTM {}'.format( - _get_ftp_relative_path( - url=url, - host=ftp_host - ) - ) - ) + if request_db_key not in self.db: + return True - def _get_file_timestamp(url): - file_path = url.replace('file://', '') - return os.stat(file_path).st_mtime + new_file_timestamp = self._get_timestamp(request, spider) + old_file_timestamp = self.db.get(key=request_db_key) + return new_file_timestamp > old_file_timestamp + @classmethod + def _get_timestamp(cls, request, spider): parsed_url = urlparse(request.url) full_url = request.url if parsed_url.scheme == 'ftp': - last_modified = _get_ftp_timestamp(spider, full_url) + last_modified = cls._get_ftp_timestamp(spider, full_url) elif parsed_url.scheme == 'file': - last_modified = _get_file_timestamp(full_url) + last_modified = cls._get_file_timestamp(full_url) else: last_modified = time.time() return last_modified + + @classmethod + def _get_ftp_timestamp(cls, spider, url): + ftp_host, params = ftp_connection_info( + spider.ftp_host, + spider.ftp_netrc, + ) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + cls._get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + @staticmethod + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + @staticmethod + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime From 16ae688d1c63a654813e6227ac0f2e3650e24984 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 12:59:32 +0200 Subject: [PATCH 14/26] pep8: wsp_spider Signed-off-by: David Caro --- hepcrawl/spiders/wsp_spider.py | 48 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index f638bd03..3498ca31 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -43,22 +43,27 @@ class WorldScientificSpider(Jats, XMLFeedSpider): on the remote server and downloads them to a designated local folder, using ``WorldScientificSpider.start_requests()``. 2. Then the ZIP file is unpacked and it lists all the XML files found - inside, via ``WorldScientificSpider.handle_package()``. Note the callback from - ``WorldScientificSpider.start_requests()``. - 3. Finally, now each XML file is parsed via ``WorldScientificSpider.parse_node()``. + inside, via ``WorldScientificSpider.handle_package()``. Note the + callback from ``WorldScientificSpider.start_requests()``. + 3. Finally, now each XML file is parsed via + ``WorldScientificSpider.parse_node()``. 
Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: - $ scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + $ scrapy crawl \\ + WSP \\ + -a 'ftp_host=ftp.example.com' \\ + -a 'ftp_netrc=/path/to/netrc' """ name = 'WSP' custom_settings = {} start_urls = [] - iterator = 'iternodes' # This is actually unnecessary, since it's the default value + # This is actually unnecessary, since it's the default value + iterator = 'iternodes' itertag = 'article' allowed_article_types = [ @@ -88,7 +93,10 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + self.tmp_dir = ( + tmp_dir or + tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + ) self.package_path = package_path def start_requests(self): @@ -100,9 +108,15 @@ def start_requests(self): ) for file_path in new_files_paths: - yield Request("file://{0}".format(file_path), callback=self.handle_package_file) + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, + ) else: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) new_files_paths = ftp_list_files( self.ftp_folder, @@ -157,7 +171,10 @@ def parse_node(self, response, node): node.remove_namespaces() article_type = node.xpath('@article-type').extract() self.log("Got article_type {0}".format(article_type)) - if article_type is None or article_type[0] not in self.allowed_article_types: + if ( + article_type is None or + article_type[0] not in self.allowed_article_types + ): # Filter out non-interesting article types return @@ -216,7 +233,10 @@ def parse_node(self, response, node): ) record.add_value('license', license) - record.add_value('collections', self._get_collections(node, article_type, journal_title)) + record.add_value( + 'collections', + self._get_collections(node, article_type, journal_title), + ) parsed_item = ParsedItem( record=dict(record.load_item()), @@ -230,10 +250,10 @@ def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() if ( - conference - or current_journal_title == ( - "International Journal of Modern Physics: Conference Series" - ) + conference or + current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": From 1ac8785fad8f7a5ac326709f9a7eda557090188f Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 13:03:36 +0200 Subject: [PATCH 15/26] setup: pin scrapy-crawl-once to the major version Signed-off-by: David Caro --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ca1e0337..16ce46f0 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ 'inspire-schemas~=46.0', 'inspire-dojson~=46.0', 'Scrapy>=1.1.0', - 'scrapy-crawl-once>=0.1.1', + 'scrapy-crawl-once~=0.1,>=0.1.1', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', 'scrapyd-client>=1.0.1', From 6dffdc8c7989f49364f4ad845e03fff7c1b13b62 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 13:08:14 +0200 Subject: [PATCH 16/26] tests: remove unneeded var dir chown Signed-off-by: David Caro --- docker-compose.test.yml | 3 +-- tests/docker_entrypoint.sh | 3 --- 
tests/fix_rights.c | 8 ++------ 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d9b69c75..64eff7df 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,7 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls - - APP_CRAWL_ONCE_PATH=/var/lib/scrapy/crawl_once/ + - APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -29,7 +29,6 @@ services: - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP - /tmp/file_urls:/tmp/file_urls - - ${PWD}/.scrapy/crawl_once:/var/lib/scrapy/crawl_once functional_wsp: <<: *service_base diff --git a/tests/docker_entrypoint.sh b/tests/docker_entrypoint.sh index 4785699d..1762a421 100755 --- a/tests/docker_entrypoint.sh +++ b/tests/docker_entrypoint.sh @@ -23,8 +23,6 @@ restore_venv_tmp_code_rights() { /fix_rights --codedir "$BASE_USER_UID:$BASE_USER_GID" echo "Restoring permissions of tmpdir to $BASE_USER_UID:$BASE_USER_GID" /fix_rights --tmpdir "$BASE_USER_UID:$BASE_USER_GID" - echo "Restoring permissions of vardir to $BASE_USER_UID:$BASE_USER_GID" - /fix_rights --vardir "$BASE_USER_UID:$BASE_USER_GID" else echo "No BASE_USER_UID env var defined, skipping venv, codedir, tmpdir permission" \ "restore." @@ -59,7 +57,6 @@ main() { /fix_rights --virtualenv 'test:test' /fix_rights --codedir 'test:test' /fix_rights --tmpdir 'test:test' - /fix_rights --vardir 'test:test' trap restore_venv_tmp_code_rights EXIT if ! [[ -f "$VENV_PATH/bin/activate" ]]; then diff --git a/tests/fix_rights.c b/tests/fix_rights.c index 2dead30c..8eaa9911 100644 --- a/tests/fix_rights.c +++ b/tests/fix_rights.c @@ -11,7 +11,6 @@ char *VENV_PATH = "/hepcrawl_venv/"; char *CODE_PATH = "/code/"; char *TMP_PATH = "/tmp/"; -char *VAR_PATH = "/var/"; int main (int argc, char *argv[]) { @@ -31,7 +30,7 @@ int main (int argc, char *argv[]) { if (argc != 3) { fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir|--var :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir :\n", argv[0] ); exit(EXIT_FAILURE); @@ -49,14 +48,11 @@ int main (int argc, char *argv[]) { } else if (strcmp(argv[1], "--tmpdir") == 0) { // tmp dir permissions chown_argv[3] = TMP_PATH; - } else if (strcmp(argv[1], "--vardir") == 0) { - // var dir permissions - chown_argv[3] = VAR_PATH; } else { fprintf(stderr, "Bad option %s.", argv[1]); fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir|--vardir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir :\n", argv[0] ); exit(EXIT_FAILURE); From 6ffa2df68033c387b2f2d454357917e896c05de3 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:39:05 +0200 Subject: [PATCH 17/26] unit: small pep8 refactor Signed-off-by: David Caro --- tests/unit/test_arxiv_all.py | 15 ++++-- tests/unit/test_world_scientific.py | 84 +++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 63ce0822..21a5fd99 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest @@ -47,14 +52,18 @@ def _get_processed_record(item, spider): ) ) - assert records + assert parsed_items pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - yield [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [ + _get_processed_record(parsed_item, spider) + for parsed_item in parsed_items + ] clean_dir() + def test_page_nr(many_results): """Test extracting page_nr""" page_nrs = [ diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 38adc79b..f14144fd 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest import os @@ -24,6 +29,12 @@ ) +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + yield + clean_dir() + + def create_spider(): crawler = Crawler(spidercls=wsp_spider.WorldScientificSpider) return wsp_spider.WorldScientificSpider.from_crawler(crawler) @@ -47,9 +58,10 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - yield (pipeline.process_item(record, spider) for record in records) - - clean_dir() + return ( + pipeline.process_item(record, spider) + for record in records + ) def get_one_record(response_file_name): @@ -72,17 +84,25 @@ def override_generated_fields(record): [ get_one_record('world_scientific/sample_ws_record.xml'), ( - "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have recently been used as light absorbers in hybrid" - " organic-inorganic solid-state solar cells, with efficiencies above 15%. To date, it is essential to" - " add Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the hole transport materials (HTM) to get" - " a higher conductivity. However, the detrimental effect of high LiTFSI concentration on the charge transport" - ", DOS in the conduction band of the TiO$_{2}$ substrate and device stability results in an overall " - "compromise for a satisfactory device. Using a higher mobility hole conductor to avoid lithium salt " - "is an interesting alternative. Herein, we successfully made an efficient perovskite solar cell by " - "applying a hole conductor PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in the absence of" - " LiTFSI. Under AM 1.5 illumination of 100 mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " - "comparable to the efficiency of 12.3% with the addition of 1.3 mM LiTFSI. An unsealed device without " - "Li$^{+}$ shows interestingly a promising stability." + "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have " + "recently been used as light absorbers in hybrid" + " organic-inorganic solid-state solar cells, with " + "efficiencies above 15%. To date, it is essential to add " + "Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the " + "hole transport materials (HTM) to get a higher conductivity. " + "However, the detrimental effect of high LiTFSI concentration " + "on the charge transport, DOS in the conduction band of the " + "TiO$_{2}$ substrate and device stability results in an " + "overall compromise for a satisfactory device. 
Using a higher " + "mobility hole conductor to avoid lithium salt is an " + "interesting alternative. Herein, we successfully made an " + "efficient perovskite solar cell by applying a hole conductor " + "PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in " + "the absence of LiTFSI. Under AM 1.5 illumination of 100 " + "mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " + "comparable to the efficiency of 12.3% with the addition of " + "1.3 mM LiTFSI. An unsealed device without Li$^{+}$ shows " + "interestingly a promising stability." ), ], ], @@ -103,7 +123,10 @@ def test_abstract(generated_record, expected_abstract): get_one_record('world_scientific/sample_ws_record.xml'), [{ 'source': 'WSP', - 'title': 'High-efficient Solid-state Perovskite Solar Cell Without Lithium Salt in the Hole Transport Material', + 'title': ( + 'High-efficient Solid-state Perovskite Solar Cell Without ' + 'Lithium Salt in the Hole Transport Material' + ), }], ], ], @@ -296,12 +319,18 @@ def test_publication_info(generated_record, expected_publication_info): [ get_one_record('world_scientific/sample_ws_record.xml'), { - 'authors': ["BI, DONGQIN", "BOSCHLOO, GERRIT", "HAGFELDT, ANDERS"], + 'authors': [ + "BI, DONGQIN", + "BOSCHLOO, GERRIT", + "HAGFELDT, ANDERS", + ], 'affiliation': ( - 'Department of Chemistry-Angstrom Laboratory, Uppsala University, Box 532, SE 751 20 Uppsala, Sweden' + 'Department of Chemistry-Angstrom Laboratory, Uppsala ' + 'University, Box 532, SE 751 20 Uppsala, Sweden' ), 'xref_affiliation': ( - 'Physics Department, Brookhaven National Laboratory, Upton, NY 11973, USA' + 'Physics Department, Brookhaven National Laboratory, ' + 'Upton, NY 11973, USA' ), }, ], @@ -319,11 +348,14 @@ def test_authors(generated_record, expected_authors): for index, name in enumerate(expected_authors['authors']): assert generated_record['authors'][index]['full_name'] == name assert expected_authors['affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff in generated_record['authors'][index]['affiliations'] ] if index == 1: assert expected_authors['xref_affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff + in generated_record['authors'][index]['affiliations'] ] @@ -418,7 +450,10 @@ def test_pipeline_record(generated_record): 'abstracts': [ { 'source': 'WSP', - 'value': u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa.', + 'value': ( + u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, ' + 'bla blaaa blaa. Bla blaaa blaa.' + ), }, ], 'acquisition_source': { @@ -431,7 +466,10 @@ def test_pipeline_record(generated_record): { 'affiliations': [ { - 'value': u'Department, University, City, City_code 123456, C. R. Country_2', + 'value': ( + u'Department, University, City, City_code 123456, ' + 'C. R. 
Country_2' + ), }, ], 'full_name': u'author_surname_2, author_name_1', From ddda124aa67ad1898f4d861523873cab656fc17f Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:39:27 +0200 Subject: [PATCH 18/26] gitignore: add some test product files Signed-off-by: David Caro --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 01895086..3b606de7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ nosetests.xml coverage.xml twistd.pid .coverage.* +tests/unit/responses/edp/test_gz +tests/unit/responses/edp/test_rich # Translations *.mo @@ -57,6 +59,8 @@ jobs dbs items logs +.scrapy +scrapy_feed_uri # Local settings local_settings.py From 610fce43caf6bacc8d05a42a59f897dd474b761d Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:40:01 +0200 Subject: [PATCH 19/26] docker-compose: swap sleep with healthcheck That allows to properly wait for the services to be up, instead of adding a dummy sleep. Signed-off-by: David Caro --- .travis.yml | 2 +- docker-compose.test.yml | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 91407e6e..53f7dee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: install: - travis_retry docker-compose -f docker-compose.deps.yml run --rm pip - - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd_deploy + - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd-deploy script: - travis_retry docker-compose -f docker-compose.test.yml run --rm ${SUITE} diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 64eff7df..074f50ce 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -66,10 +66,17 @@ services: command: bash -c "rm -f twistd.pid && exec scrapyd" links: - celery + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:6800/listprojects.json" - scrapyd_deploy: + scrapyd-deploy: <<: *service_base - command: bash -c "sleep 8 && scrapyd-deploy" # make sure that the scrapyd is up + command: bash -c "scrapyd-deploy" links: - scrapyd From ff1eb860ebfc058eff60eff724c81f9ef30bb623 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:41:05 +0200 Subject: [PATCH 20/26] utils: nicer ParsedItem string reperesentation Signed-off-by: David Caro --- hepcrawl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 256dd508..caff462d 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -10,6 +10,7 @@ from __future__ import absolute_import, division, print_function import os +import pprint import re from operator import itemgetter from itertools import groupby @@ -467,3 +468,6 @@ def __getattr__(self, key): def __setattr__(self, key, value): self[key] = value + + def __str__(self): + return pprint.pformat(self) From f111993c09a08e30f8d0355a44e74058244bcfd2 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:42:16 +0200 Subject: [PATCH 21/26] celery_monitor: small refactor and default event fix Signed-off-by: David Caro --- hepcrawl/testlib/celery_monitor.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 1347ab22..c8d94949 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -24,7 +24,13 @@ class CeleryMonitor(object): - def __init__(self, app, 
monitor_timeout=3, monitor_iter_limit=100, events_limit=2): + def __init__( + self, + app, + monitor_timeout=3, + monitor_iter_limit=100, + events_limit=2, + ): self.results = [] self.recv = None self.app = app @@ -39,7 +45,13 @@ def __enter__(self): def announce_succeeded_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK SUCCEEDED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK SUCCEEDED: %s[%s] %s' % ( + task.name, + task.uuid, + task.info(), + ) + ) tasks = self.app.AsyncResult(task.id) for task in tasks.result: self.results.append(task) @@ -48,7 +60,9 @@ def announce_succeeded_tasks(event): def announce_failed_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),) + ) self.results.append(task.info()) self.recv.should_stop = True @@ -62,7 +76,11 @@ def announce_failed_tasks(event): return self def __exit__(self, exc_type, exc_val, exc_tb): - events_iter = self.recv.itercapture(limit=None, timeout=self.monitor_timeout, wakeup=True) + events_iter = self.recv.itercapture( + limit=None, + timeout=self.monitor_timeout, + wakeup=True, + ) self._wait_for_results(events_iter) self.connection.__exit__() @@ -84,8 +102,8 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, - events_limit, crawler_instance, + events_limit=2, project='hepcrawl', spider='WSP', settings=None, From bcf8f76f0b7d8eafcff8951b4490ca416c9993a9 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:42:55 +0200 Subject: [PATCH 22/26] tohep: add some useful debug logs Signed-off-by: David Caro --- hepcrawl/tohep.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 8fcbc735..88e5e8d2 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -25,10 +25,14 @@ import os import datetime +import logging from inspire_schemas.api import LiteratureBuilder +LOGGER = logging.getLogger(__name__) + + class UnknownItemFormat(Exception): pass @@ -195,10 +199,12 @@ def hep_to_hep(hep_record, record_files): hepcrawl one (normally, marc-ingesting spiders). """ if record_files: + LOGGER.debug('Updating fft fields from: %s', hep_record['_fft']) hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], record_files=record_files, ) + LOGGER.debug('Updated fft fields to: %s', hep_record['_fft']) return hep_record From 5aacb1ce3982ec9073ed21f99b3c8546f438ff1e Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:43:57 +0200 Subject: [PATCH 23/26] middlewares: use better key for crawl-once That way uses the schema (ftp/http/local...) and the file name as key, instead of just the file name, to avoid downloading a file from ftp and not crawling it after. 
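For readers skimming the diff below, a minimal sketch of the key scheme this message describes: the URL's scheme picks a 'local'/'remote' prefix, which is combined with the file's base name. The example URLs are made up; the authoritative logic is the `_get_key` method added in the diff that follows.

    import os
    from six.moves.urllib.parse import urlparse

    def crawl_once_key(url):
        # Mirror of the scheme described above: scheme prefix + file base name.
        parsed = urlparse(url)
        prefix = 'local' if parsed.scheme == 'file' else 'remote'
        return prefix + '::' + os.path.basename(parsed.path)

    crawl_once_key('ftp://ftp.example.org/DESY/desy_collection.xml')
    # -> 'remote::desy_collection.xml'
    crawl_once_key('file:///tmp/DESY/desy_collection.xml')
    # -> 'local::desy_collection.xml'

With only the base name as key, the second URL would have been treated as already crawled after the first one was fetched over FTP; the prefix keeps the two apart.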
Signed-off-by: David Caro --- hepcrawl/middlewares.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 1044ba0d..99551e93 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -13,6 +13,7 @@ import os import time +import logging from ftplib import FTP from six.moves.urllib.parse import urlparse @@ -23,6 +24,9 @@ from hepcrawl.utils import ftp_connection_info +LOGGER = logging.getLogger(__name__) + + class ErrorHandlingMiddleware(object): @classmethod def from_crawler(cls, crawler): @@ -88,15 +92,26 @@ class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): """ def process_request(self, request, spider): if not request.meta.get('crawl_once', self.default): + if 'crawl_once' in request.meta: + LOGGER.info('Crawl-Once: skipping by explicit crawl_once meta') + else: + LOGGER.info('Crawl-Once: skipping by default crawl_once meta') return - request.meta['crawl_once_key'] = os.path.basename(request.url) + request.meta['crawl_once_key'] = self._get_key(request) request.meta['crawl_once_value'] = self._get_timestamp(request, spider) if not self._has_to_be_crawled(request, spider): + LOGGER.info( + 'Crawl-Once: Skipping due to `has_to_be_crawled`, %s' % request + ) self.stats.inc_value('crawl_once/ignored') raise IgnoreRequest() + LOGGER.info( + 'Crawl-Once: Not skipping: %s' % request + ) + def _has_to_be_crawled(self, request, spider): request_db_key = self._get_key(request) @@ -107,6 +122,16 @@ def _has_to_be_crawled(self, request, spider): old_file_timestamp = self.db.get(key=request_db_key) return new_file_timestamp > old_file_timestamp + def _get_key(self, request): + parsed_url = urlparse(request.url) + fname = os.path.basename(parsed_url.path) + if parsed_url.scheme == 'file': + prefix = 'local' + else: + prefix = 'remote' + + return prefix + '::' + fname + @classmethod def _get_timestamp(cls, request, spider): parsed_url = urlparse(request.url) From 1a53584b22f0fb504f270c65bea36dd3306ecd97 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:45:24 +0200 Subject: [PATCH 24/26] pipelines: added extra debug log Signed-off-by: David Caro --- hepcrawl/pipelines.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 7a51e4cb..d9949338 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -17,6 +17,7 @@ import os import shutil +import pprint import requests @@ -93,10 +94,16 @@ def open_spider(self, spider): def _post_enhance_item(self, item, spider): source = spider.name - return item_to_hep( + enhanced_record = item_to_hep( item=item, source=source, ) + spider.logger.debug( + 'Got post-enhanced hep record:\n%s' % pprint.pformat( + enhanced_record + ) + ) + return enhanced_record def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" From 3110b0f9d1a954cf071b4ffbeec5f75c9f3263f5 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:45:50 +0200 Subject: [PATCH 25/26] desy: adapt to the new middleware As It changes the actual url of the files to download, the hash also changes. 
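Context for the fixture changes below: the expected file paths are derived from the download URL, so switching the spider to full 'file://'/'ftp://' URIs changes every expected hash. A minimal sketch of the naming convention, assuming the files are stored by Scrapy's stock FilesPipeline (which names a download after the SHA-1 of its request URL); the example path is a placeholder.

    import hashlib

    from scrapy.utils.python import to_bytes

    def stored_file_name(url, extension='.txt'):
        # FilesPipeline-style name: 'full/<sha1 of the URL><extension>'.
        return 'full/' + hashlib.sha1(to_bytes(url)).hexdigest() + extension

    stored_file_name('file:///tmp/DESY/FFT/test_fft_1.txt')
    # A different scheme or host in the URL yields a different name for the
    # same file, which is why the expected JSON below is regenerated.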
Signed-off-by: David Caro --- hepcrawl/spiders/desy_spider.py | 15 +++++- .../fixtures/desy_local_records_expected.json | 22 ++++----- tests/functional/desy/test_desy.py | 48 ++++++++++--------- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index ec70ec39..efae063e 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -178,13 +178,18 @@ def start_requests(self): yield request @staticmethod - def _get_full_uri(current_path, base_url, schema, hostname=''): + def _get_full_uri(current_path, base_url, schema, hostname=None): + hostname = hostname or '' if os.path.isabs(current_path): full_path = current_path else: full_path = os.path.join(base_url, current_path) - return '{schema}://{hostname}{full_path}'.format(**vars()) + return '{schema}://{hostname}{full_path}'.format( + schema=schema, + hostname=hostname, + full_path=full_path, + ) def parse(self, response): """Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`. @@ -208,8 +213,12 @@ def parse(self, response): url_schema = 'file' hostname = None + self.log('Getting marc xml records...') marcxml_records = self._get_marcxml_records(response.body) + self.log('Got %d marc xml records' % len(marcxml_records)) + self.log('Getting hep records...') hep_records = self._hep_records_from_marcxml(marcxml_records) + self.log('Got %d hep records' % len(hep_records)) for hep_record in hep_records: list_file_urls = [ @@ -222,12 +231,14 @@ def parse(self, response): for fft_path in hep_record['_fft'] ] + self.log('Got the following fft urls: %s' % list_file_urls) parsed_item = ParsedItem( record=hep_record, file_urls=list_file_urls, ftp_params=ftp_params, record_format='hep', ) + self.log('Got item: %s' % parsed_item) yield parsed_item diff --git a/tests/functional/desy/fixtures/desy_local_records_expected.json b/tests/functional/desy/fixtures/desy_local_records_expected.json index 1dc784b9..dc7baf23 100644 --- a/tests/functional/desy/fixtures/desy_local_records_expected.json +++ b/tests/functional/desy/fixtures/desy_local_records_expected.json @@ -10,7 +10,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -19,7 +19,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -78,7 +78,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -87,7 +87,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -146,7 +146,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -155,7 +155,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -214,7 +214,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -223,7 +223,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -1754,7 +1754,7 @@ "format": ".pdf", "filename": "dummy", "version": 1, - "path": "/tmp/file_urls/full/c011422ef40ef111a72bd72092066dd3c1cc7a39.pdf", + "path": "/tmp/file_urls/full/0df3efe7842cf285ae0eeed845cca003dd755674.pdf", "type": "Main" }, { @@ -1763,7 +1763,7 @@ "format": ".txt", "filename": "test_fft_1", "version": 1, - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main" }, { @@ -1772,7 +1772,7 @@ "format": ".txt", "filename": "test_fft_2", "version": 1, - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main" } ], diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index d7286f7e..c3e7ca4f 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -13,6 +13,7 @@ import copy import hashlib +import os from time import sleep import pytest @@ -76,6 +77,29 @@ def _generate_md5_hash(file_path): assert file_1_hash == file_2_hash +def assert_ffts_content_matches_expected(record): + for fft_field in record.get('_fft', []): + assert_fft_content_matches_expected(fft_field) + + +def assert_fft_content_matches_expected(fft_field): + expected_file_name = get_file_name_from_fft(fft_field) + assert_files_equal(expected_file_name, fft_field['path']) + + +def get_file_name_from_fft(fft_field): + file_path = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + fft_field['filename'] + fft_field['format'], + test_suite='functional', + ) + return file_path + + def get_ftp_settings(): netrc_location = get_test_suite_path( 'desy', @@ -120,6 +144,7 @@ def cleanup(): sleep(10) yield + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) clean_dir('/tmp/file_urls') clean_dir('/tmp/DESY') @@ -180,26 +205,3 @@ def test_desy( for record in gotten_results: assert_ffts_content_matches_expected(record) - - -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) - - -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - 
assert_files_equal(expected_file_name, fft_field['path']) - - -def get_file_name_from_fft(fft_field): - file_path = get_test_suite_path( - 'desy', - 'fixtures', - 'ftp_server', - 'DESY', - 'FFT', - fft_field['filename'] + fft_field['format'], - test_suite='functional', - ) - return file_path From bd72a4fe6366c7d1980a49c8ca981185669246bf Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:59:39 +0200 Subject: [PATCH 26/26] wsp: adapt to new middleware and refactor Signed-off-by: David Caro --- hepcrawl/spiders/wsp_spider.py | 104 +++++++++++++++++++------------ tests/functional/wsp/test_wsp.py | 74 +++++++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3498ca31..1868c9c5 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -49,6 +49,19 @@ class WorldScientificSpider(Jats, XMLFeedSpider): ``WorldScientificSpider.parse_node()``. + Args: + local_package_dir(str): path to the local directory holding the zip + files to parse and extract the records for, if set, will ignore all + the ftp options. + ftp_folder(str): remote folder in the ftp server to get the zip files + from. + ftp_host(str): host name of the ftp server to connect to. + ftp_netrc(str): path to the netrc file containing the authentication + settings for the ftp. + target_folder(str): path to the temporary local directory to download + the files to. + + Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: @@ -80,11 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, - package_path=None, + local_package_dir=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, - tmp_dir=None, + target_folder=None, *args, **kwargs ): @@ -93,53 +106,62 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.tmp_dir = ( - tmp_dir or + self.target_folder = ( + target_folder or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') ) - self.package_path = package_path + self.local_package_dir = local_package_dir - def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" - if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder + def _get_local_requests(self): + new_files_paths = local_list_files( + self.local_package_dir, + self.target_folder + ) + + for file_path in new_files_paths: + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, ) - for file_path in new_files_paths: - yield Request( - "file://{0}".format(file_path), - callback=self.handle_package_file, - ) - else: - ftp_host, ftp_params = ftp_connection_info( - self.ftp_host, - self.ftp_netrc, + def _get_remote_requests(self): + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) + + new_files_paths = ftp_list_files( + self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'] + ) + + for remote_file in new_files_paths: + # Cast to byte-string for scrapy compatibility + remote_file = str(remote_file) + ftp_params["ftp_local_filename"] = os.path.join( + self.target_folder, + os.path.basename(remote_file) ) - new_files_paths = ftp_list_files( - self.ftp_folder, - destination_folder=self.target_folder, - ftp_host=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'] 
+ remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp ) - for remote_file in new_files_paths: - # Cast to byte-string for scrapy compatibility - remote_file = str(remote_file) - ftp_params["ftp_local_filename"] = os.path.join( - self.tmp_dir, - os.path.basename(remote_file) - ) - - remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp - ) + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.local_package_dir: + requests_iter = self._get_local_requests() + else: + requests_iter = self._get_remote_requests() + + for request in requests_iter: + yield request def handle_package_ftp(self, response): """Handle a zip package and yield every XML found.""" @@ -158,7 +180,7 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + xml_files = unzip_xml_files(zip_filepath, self.target_folder) for xml_file in xml_files: yield Request( diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 27f36955..493837ec 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -28,13 +28,15 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record @pytest.fixture(scope="function") -def set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'wsp', 'fixtures', @@ -43,7 +45,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). 
sleep(10) yield { @@ -73,7 +76,7 @@ def set_up_local_environment(): 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'package_path': package_location, + 'local_package_dir': package_location, } } @@ -105,8 +108,10 @@ def remove_generated_files(package_location): 'smoke', ] ) -def test_wsp_ftp(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -114,14 +119,18 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -139,22 +148,29 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp_crawl_twice(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -162,11 +178,12 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] @@ -188,7 +205,9 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): ] ) def test_wsp_local_package_path(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -203,7 +222,9 @@ def 
test_wsp_local_package_path(set_up_local_environment, expected_results): ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -221,8 +242,13 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) +def test_wsp_local_package_path_crawl_twice( + set_up_local_environment, + expected_results, +): + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -236,7 +262,9 @@ def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_r ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results
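For reference, the renamed spider arguments can also be exercised outside the docker/scrapyd setup. A rough sketch using Scrapy's CrawlerProcess; the paths are placeholders, and in this repository the spider is normally scheduled through scrapyd and Celery rather than run this way.

    from scrapy.crawler import CrawlerProcess

    from hepcrawl.spiders.wsp_spider import WorldScientificSpider

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(
        WorldScientificSpider,
        local_package_dir='/tmp/WSP/packages',  # previously `package_path`
        target_folder='/tmp/WSP/extracted',     # previously `tmp_dir`
    )
    process.start()

When `local_package_dir` is set the spider ignores the FTP options entirely, mirroring what the functional tests above do through the `CRAWLER_ARGUMENTS` dict.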