From bd72a4fe6366c7d1980a49c8ca981185669246bf Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:59:39 +0200 Subject: [PATCH] wsp: adapt to new middleware and refactor Signed-off-by: David Caro --- hepcrawl/spiders/wsp_spider.py | 104 +++++++++++++++++++------------ tests/functional/wsp/test_wsp.py | 74 +++++++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3498ca31..1868c9c5 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -49,6 +49,19 @@ class WorldScientificSpider(Jats, XMLFeedSpider): ``WorldScientificSpider.parse_node()``. + Args: + local_package_dir(str): path to the local directory holding the zip + files to parse and extract the records for, if set, will ignore all + the ftp options. + ftp_folder(str): remote folder in the ftp server to get the zip files + from. + ftp_host(str): host name of the ftp server to connect to. + ftp_netrc(str): path to the netrc file containing the authentication + settings for the ftp. + target_folder(str): path to the temporary local directory to download + the files to. + + Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: @@ -80,11 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, - package_path=None, + local_package_dir=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, - tmp_dir=None, + target_folder=None, *args, **kwargs ): @@ -93,53 +106,62 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.tmp_dir = ( - tmp_dir or + self.target_folder = ( + target_folder or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') ) - self.package_path = package_path + self.local_package_dir = local_package_dir - def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" - if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder + def _get_local_requests(self): + new_files_paths = local_list_files( + self.local_package_dir, + self.target_folder + ) + + for file_path in new_files_paths: + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, ) - for file_path in new_files_paths: - yield Request( - "file://{0}".format(file_path), - callback=self.handle_package_file, - ) - else: - ftp_host, ftp_params = ftp_connection_info( - self.ftp_host, - self.ftp_netrc, + def _get_remote_requests(self): + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) + + new_files_paths = ftp_list_files( + self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'] + ) + + for remote_file in new_files_paths: + # Cast to byte-string for scrapy compatibility + remote_file = str(remote_file) + ftp_params["ftp_local_filename"] = os.path.join( + self.target_folder, + os.path.basename(remote_file) ) - new_files_paths = ftp_list_files( - self.ftp_folder, - destination_folder=self.target_folder, - ftp_host=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'] + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp ) - for remote_file in new_files_paths: - # Cast to byte-string for scrapy compatibility - remote_file = str(remote_file) - ftp_params["ftp_local_filename"] = os.path.join( - self.tmp_dir, - os.path.basename(remote_file) - ) - - remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp - ) + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.local_package_dir: + requests_iter = self._get_local_requests() + else: + requests_iter = self._get_remote_requests() + + for request in requests_iter: + yield request def handle_package_ftp(self, response): """Handle a zip package and yield every XML found.""" @@ -158,7 +180,7 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + xml_files = unzip_xml_files(zip_filepath, self.target_folder) for xml_file in xml_files: yield Request( diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 27f36955..493837ec 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -28,13 +28,15 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record @pytest.fixture(scope="function") -def set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'wsp', 'fixtures', @@ -43,7 +45,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). sleep(10) yield { @@ -73,7 +76,7 @@ def set_up_local_environment(): 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'package_path': package_location, + 'local_package_dir': package_location, } } @@ -105,8 +108,10 @@ def remove_generated_files(package_location): 'smoke', ] ) -def test_wsp_ftp(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -114,14 +119,18 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -139,22 +148,29 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp_crawl_twice(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -162,11 +178,12 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] @@ -188,7 +205,9 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): ] ) def test_wsp_local_package_path(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -203,7 +222,9 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -221,8 +242,13 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) +def test_wsp_local_package_path_crawl_twice( + set_up_local_environment, + expected_results, +): + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -236,7 +262,9 @@ def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_r ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results