diff --git a/docker-compose.test.yml b/docker-compose.test.yml
index e188e7f9..64836380 100644
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@@ -23,6 +23,7 @@ services:
       - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
       - ${PWD}:/code/
       - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
+      - /tmp/WSP:/tmp/WSP # the tmp folder that keeps the history of the crawled records
     links:
       - rabbitmq
       - celery
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index d1aff3ff..6a5b02c8 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -23,6 +23,7 @@ from ..utils import (
     ftp_list_files,
     ftp_connection_info,
+    local_list_files,
     get_license,
     unzip_xml_files,
 )
 
@@ -88,11 +89,17 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc
     def start_requests(self):
         """List selected folder on remote FTP and yield new zip files."""
         if self.package_path:
-            yield Request(self.package_path, callback=self.handle_package_file)
+            new_files_paths = local_list_files(
+                self.package_path,
+                self.target_folder
+            )
+
+            for file_path in new_files_paths:
+                yield Request("file://{0}".format(file_path), callback=self.handle_package_file)
         else:
             ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
 
-            dummy, new_files = ftp_list_files(
+            new_files_paths = ftp_list_files(
                 self.ftp_folder,
                 self.target_folder,
                 server=ftp_host,
@@ -100,7 +107,7 @@ def start_requests(self):
                 password=ftp_params['ftp_password']
             )
 
-            for remote_file in new_files:
+            for remote_file in new_files_paths:
                 # Cast to byte-string for scrapy compatibility
                 remote_file = str(remote_file)
                 ftp_params["ftp_local_filename"] = os.path.join(
@@ -116,7 +123,7 @@ def start_requests(self):
 
     def handle_package_ftp(self, response):
         """Handle a zip package and yield every XML found."""
-        self.log("Visited %s" % response.url)
+        self.log("Visited url %s" % response.url)
         zip_filepath = response.body
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
@@ -128,6 +135,7 @@ def handle_package_ftp(self, response):
 
     def handle_package_file(self, response):
         """Handle a local zip package and yield every XML."""
+        self.log("Visited file %s" % response.url)
         zip_filepath = urlparse.urlsplit(response.url).path
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 6940c3c0..6b27a5d0 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -66,16 +66,24 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive
         encrypt_data_channel=True)
 
     with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
-        files = host.listdir(host.curdir + '/' + server_folder)
-        missing_files = []
-        all_files = []
-        for filename in files:
-            destination_file = os.path.join(target_folder, filename)
-            source_file = os.path.join(server_folder, filename)
-            if not os.path.exists(destination_file):
-                missing_files.append(source_file)
-            all_files.append(source_file)
-        return all_files, missing_files
+        file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
+        return list_missing_files(server_folder, target_folder, file_names)
+
+
+def local_list_files(local_folder, target_folder):
+    file_names = os.listdir(local_folder)
+    return list_missing_files(local_folder, target_folder, file_names)
+
+
+def list_missing_files(remote_folder, target_folder, file_names):
+    missing_files = []
+    for file_name in file_names:
+        destination_file = os.path.join(target_folder, file_name)
+        source_file = os.path.join(remote_folder, file_name)
+        if not os.path.exists(destination_file):
+            missing_files.append(source_file)
+
+    return missing_files
 
 
 def get_first(iterable, default=None):
diff --git a/tests/functional/WSP/test_wsp.py b/tests/functional/WSP/test_wsp.py
index 8c75f89d..8df90ab6 100644
--- a/tests/functional/WSP/test_wsp.py
+++ b/tests/functional/WSP/test_wsp.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016, 2017 CERN.
+# Copyright (C) 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -14,6 +14,7 @@
 import pytest
 import json
 import os
+import shutil
 
 from scrapyd_api import ScrapydAPI
 from time import sleep
@@ -50,14 +51,14 @@ def expected_results():
     return expected_data
 
 
-@pytest.fixture(scope="module")
-def set_up_environment():
-    netrc_location = os.path.join(os.path.dirname(
-        os.path.realpath(__file__)),
-        'fixtures/ftp_server/.netrc'
+@pytest.fixture(scope="function")
+def set_up_ftp_environment():
+    netrc_location = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        os.path.join('fixtures', 'ftp_server', '.netrc')
     )
 
-    return {
+    yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
@@ -66,9 +67,44 @@ def set_up_environment():
         }
     }
 
+    clean_dir()
+
+
+@pytest.fixture(scope="function")
+def set_up_local_environment():
+    package_location = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        os.path.join('fixtures', 'ftp_server', 'WSP')
+    )
+
+    yield {
+        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
+        'CRAWLER_PROJECT': 'hepcrawl',
+        'CRAWLER_ARGUMENTS': {
+            'package_path': package_location,
+        }
+    }
+
+    remove_generated_files(package_location)
+
+
+def remove_generated_files(package_location):
+    clean_dir()
+
+    _, dirs, files = next(os.walk(package_location))
+    for dir_name in dirs:
+        clean_dir(os.path.join(package_location, dir_name))
+    for file_name in files:
+        if not file_name.endswith('.zip'):
+            os.unlink(os.path.join(package_location, file_name))
+
+
+def clean_dir(path='/tmp/WSP/'):
+    shutil.rmtree(path, ignore_errors=True)
 
-def test_wsp_normal_set_of_records(set_up_environment, expected_results):
-    crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
+
+def test_wsp_ftp(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
 
     # The test must wait until the docker environment is up (takes about 10 seconds).
     sleep(10)
@@ -78,13 +114,33 @@ def test_wsp_normal_set_of_records(set_up_environment, expected_results):
         monitor_timeout=5,
         monitor_iter_limit=100,
         crawler_instance=crawler,
-        project=set_up_environment.get('CRAWLER_PROJECT'),
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+
+def test_wsp_local_package_path(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=100,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='WSP',
         settings={},
-        **set_up_environment.get('CRAWLER_ARGUMENTS')
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
     )
 
-    gottern_results = [override_generated_fields(result) for result in results]
+    gotten_results = [override_generated_fields(result) for result in results]
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
-    assert gottern_results == expected_results
+    assert gotten_results == expected_results
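
Outside the diff itself, the de-duplication idea behind the new list_missing_files()/local_list_files() helpers can be shown as a minimal standalone sketch. The helper bodies are copied from the change above; the temporary folders in the demo are illustrative stand-ins for the local package folder and for /tmp/WSP, the crawl-history folder mounted in docker-compose.test.yml.

# Minimal sketch, standard library only: a file is considered "new" (and will be
# turned into a crawl request) only if no file of the same name already exists
# in the target folder.
import os
import tempfile


def list_missing_files(remote_folder, target_folder, file_names):
    # Keep only the files that have no counterpart in target_folder yet.
    missing_files = []
    for file_name in file_names:
        destination_file = os.path.join(target_folder, file_name)
        source_file = os.path.join(remote_folder, file_name)
        if not os.path.exists(destination_file):
            missing_files.append(source_file)

    return missing_files


def local_list_files(local_folder, target_folder):
    # Local variant: list the package folder directly instead of an FTP listing.
    return list_missing_files(local_folder, target_folder, os.listdir(local_folder))


if __name__ == '__main__':
    package_path = tempfile.mkdtemp()   # stands in for the local WSP package folder
    target_folder = tempfile.mkdtemp()  # stands in for /tmp/WSP, the crawl history
    for name in ('a.zip', 'b.zip'):
        open(os.path.join(package_path, name), 'w').close()
    open(os.path.join(target_folder, 'a.zip'), 'w').close()  # a.zip was already crawled

    # Only .../b.zip is reported, so already-crawled packages are skipped on re-runs.
    print(local_list_files(package_path, target_folder))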