diff --git a/docker-compose.test.yml b/docker-compose.test.yml
index 6a7c4721..d14d7c87 100644
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@@ -27,7 +27,7 @@ services:
       - ${PWD}:/code/
       - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
       - /tmp/WSP:/tmp/WSP
-      - /tmp/DESY:/tmp/DESY
+      - /tmp/file_urls:/tmp/file_urls
 
   functional_wsp:
     <<: *service_base
diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index 78ac42ec..afbf94b1 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -17,6 +17,43 @@
 
 from inspire_schemas.api import LiteratureBuilder
 
+from hepcrawl.utils import get_file_name_from_url
+
+
+def _update_record_fft_links(record, map_fft_file_paths):
+    def _list_new_fft_links(old_fft_links, map_fft_file_paths):
+        new_fft_links = []
+        for fft_link in old_fft_links:
+            file_name = get_file_name_from_url(fft_link['path'])
+            if file_name in map_fft_file_paths:
+                new_fft_links.append(
+                    {
+                        'path': map_fft_file_paths[file_name],
+                    }
+                )
+
+        return new_fft_links
+
+    old_fft_links = record['_fft']
+    record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
+    return record
+
+
+def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
+    if item_format == 'hep':
+        return hep2hep(item, fft_file_paths)
+    elif item_format == 'hepcrawl':
+        return crawler2hep(dict(item))
+    else:
+        raise Exception('Unknown item_format::{}'.format(item_format))
+
+
+def hep2hep(crawler_record, fft_file_paths):
+    if fft_file_paths:
+        crawler_record = _update_record_fft_links(crawler_record, fft_file_paths)
+
+    return crawler_record
+
 
 def crawler2hep(crawler_record):
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index d8524245..2244d255 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -25,7 +25,7 @@
 
 from inspire_schemas.utils import validate
 
-from .crawler2hep import crawler2hep
+from hepcrawl.crawler2hep import to_hep
 from hepcrawl.settings import FILES_STORE
 from hepcrawl.utils import get_file_name_from_url
 
@@ -95,10 +95,7 @@ def open_spider(self, spider):
         self.results_data = []
 
     def _post_enhance_item(self, item, spider):
-        item = self._generate_record_meta(item, spider)
-        source = spider.name
-
-        if source != 'desy':  # Should be changed to other generic flag like "hep_record"
+        def _normalize_hepcrawl_record(item, source):
             if 'related_article_doi' in item:
                 item['dois'] += item.pop('related_article_doi', [])
 
@@ -155,10 +152,25 @@
                 'pubinfo_material',
             ])
 
-        item = crawler2hep(dict(item))
-        spider.logger.debug('Validated item by Builder.')
+            return item
 
-        return item
+        fft_file_paths = item.get('file_paths')
+        item_format = item.get('format', 'hepcrawl')
+        item = item.get('record_item') if item.get('record_item') else item
+        item = self._generate_record_meta(item, spider)
+        source = spider.name
+
+        if item_format != 'hep':
+            item = _normalize_hepcrawl_record(
+                item=item,
+                source=source,
+            )
+
+        return to_hep(
+            item=item,
+            item_format=item_format,
+            fft_file_paths=fft_file_paths,
+        )
 
     def _generate_record_meta(self, json_record, spider):
         json_record['acquisition_source'] = {
@@ -169,42 +181,18 @@
         }
         return json_record
 
-    def _update_record_fft_links(self, record, map_fft_file_paths):
-        def _list_new_fft_links(old_fft_links, map_fft_file_paths):
-            new_fft_links = []
-            for fft_link in old_fft_links:
-                file_name = get_file_name_from_url(fft_link['path'])
-                if file_name in map_fft_file_paths:
-                    new_fft_links.append(
-                        {
-                            'path': map_fft_file_paths[file_name],
-                        }
-                    )
-
-            return new_fft_links
-
-        old_fft_links = record['_fft']
-
-        # Provides only list of FFT paths, not list of dicts as it is defined in schemas
-        record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
-
-        return record
-
     def process_item(self, item, spider):
         """Convert internal format to INSPIRE data model."""
         self.count += 1
 
-        fft_file_paths = item.get('file_paths')
-        if item.get('hep_record'):
-            item = item.get('record_item')
-            if fft_file_paths:
-                item = self._update_record_fft_links(item, fft_file_paths)
+        hep_item = self._post_enhance_item(item, spider)
 
-        item = self._post_enhance_item(item, spider)
-        validate(item, 'hep')
-        self.results_data.append(item)
+        validate(hep_item, 'hep')
+        spider.logger.debug('Validated item by Inspire Schemas.')
 
-        return item
+        self.results_data.append(hep_item)
+
+        return hep_item
 
     def _prepare_payload(self, spider):
         """Return payload for push."""
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index a5393d4d..76886b23 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -12,7 +12,6 @@ from __future__ import absolute_import, division, print_function
 
 import os
-import urlparse  # must replaced for supporting python 3
 
 from lxml import etree
 from dojson.contrib.marc21.utils import create_record
 
@@ -155,11 +154,10 @@ def parse(self, response):
             ]
 
         yield {
-            'FTP': self.ftp_enabled,
             'record_item': hep_record,
             'file_urls': list_file_urls,
             'ftp_params': ftp_params,
-            'hep_record': True,
+            'format': 'hep',
         }
 
     def handle_package_ftp(self, response):
diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py
index 513b0395..c9d53339 100644
--- a/hepcrawl/testlib/fixtures.py
+++ b/hepcrawl/testlib/fixtures.py
@@ -11,6 +11,7 @@
 
 import os
 import json
+import shutil
 
 from scrapy.http import Request, TextResponse
 from scrapy.selector import Selector
@@ -131,3 +132,18 @@ def expected_json_results_from_file(*path_chunks, **kwargs):
         expected_data = json.load(fd)
 
     return expected_data
+
+
+def clean_dir(path='/tmp/WSP/'):
+    """
+    Recursively deletes the given directory and all of its contents, ignoring errors.
+
+    Args:
+        path: Absolute path of the target directory to be removed.
+
+    Example:
+
+        >>> clean_dir('/dir_1/dir_11/')
+
+    """
+    shutil.rmtree(path, ignore_errors=True)
diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py
index e2a14406..2b4d8c71 100644
--- a/tests/functional/desy/test_desy.py
+++ b/tests/functional/desy/test_desy.py
@@ -20,6 +20,7 @@
 from hepcrawl.testlib.fixtures import (
     get_test_suite_path,
     expected_json_results_from_file,
+    clean_dir,
 )
 from hepcrawl.testlib.tasks import app as celery_app
 from hepcrawl.testlib.utils import get_crawler_instance
@@ -34,16 +35,40 @@ def override_generated_fields(record):
 
 def compare_two_files_using_md5(file_1, file_2):
     """Compares two files calculating the md5 hash."""
-    def _generate_md5_hash(file):
+    def _generate_md5_hash(file_path):
         hasher = hashlib.md5()
-        with open(str(file), 'rb') as f:
-            buf = f.read()
+        with open(str(file_path), 'rb') as fd:
+            buf = fd.read()
         hasher.update(buf)
         return hasher.hexdigest()
 
     return _generate_md5_hash(file_1) == _generate_md5_hash(file_2)
 
 
+@pytest.fixture(scope="function")
+def get_fft_1_path():
+    return get_test_suite_path(
+        'desy',
+        'fixtures',
+        'ftp_server',
+        'FFT',
+        'test_fft_1.txt',
+        test_suite='functional',
+    )
+
+
+@pytest.fixture(scope="function")
+def get_fft_2_path():
+    return get_test_suite_path(
+        'desy',
+        'fixtures',
+        'ftp_server',
+        'FFT',
+        'test_fft_2.txt',
+        test_suite='functional',
+    )
+
+
 @pytest.fixture(scope="function")
 def set_up_ftp_environment():
     netrc_location = get_test_suite_path(
@@ -57,7 +82,7 @@
     # The test must wait until the docker environment is up (takes about 10 seconds).
     sleep(10)
 
-    return {
+    yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
@@ -66,6 +91,9 @@
         }
     }
 
+    clean_dir('/tmp/file_urls')
+    clean_dir('/tmp/DESY')
+
 
 @pytest.fixture(scope="function")
 def set_up_local_environment():
@@ -77,7 +105,7 @@
         test_suite='functional',
     )
 
-    return {
+    yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
@@ -85,6 +113,9 @@
         }
     }
 
+    clean_dir('/tmp/file_urls')
+    clean_dir('/tmp/DESY')
+
 
 @pytest.mark.parametrize(
     'expected_results',
@@ -99,7 +130,12 @@
         'smoke',
     ]
 )
-def test_desy_ftp(set_up_ftp_environment, expected_results, capsys):
+def test_desy_ftp(
+    set_up_ftp_environment,
+    expected_results,
+    get_fft_1_path,
+    get_fft_2_path,
+):
     crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
 
     results = CeleryMonitor.do_crawl(
@@ -119,11 +155,12 @@
         **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
     )
     gotten_results = [override_generated_fields(result) for result in results]
 
     assert sorted(gotten_results) == expected_results
 
-    # Check if downloaded files are there MD5
-    # for record in expected_results:  # WIP
-    # fft_file_paths = sorted(record['_fft'])
-    # assert compare_two_files_using_md5(fft_file_paths[0]['path'], 'file_1 from ftp server')
-    # assert compare_two_files_using_md5(fft_file_paths[1]['path'], 'file_2 from ftp server')
+    # Check, using their MD5 hashes, that the downloaded files match the expected ones.
+    for record in expected_results:
+        fft_file_paths = sorted(record['_fft'])
+
+        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
+        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
@@ -139,7 +176,12 @@
         'smoke',
     ]
 )
-def test_desy_local_package_path(set_up_local_environment, expected_results):
+def test_desy_local_package_path(
+    set_up_local_environment,
+    expected_results,
+    get_fft_1_path,
+    get_fft_2_path,
+):
     crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
 
     results = CeleryMonitor.do_crawl(
@@ -159,6 +201,10 @@
     gotten_results = [override_generated_fields(result) for result in results]
 
     assert sorted(gotten_results) == expected_results
 
-    # Check if downloaded files are there MD5
-    # WIP
+    # Check, using their MD5 hashes, that the downloaded files match the expected ones.
+    for record in expected_results:
+        fft_file_paths = sorted(record['_fft'])
+
+        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
+        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index ea38582a..a0411b8e 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -13,7 +13,6 @@
 
 import pytest
 import os
-import shutil
 
 from time import sleep
 
@@ -21,6 +20,7 @@
 from hepcrawl.testlib.fixtures import (
     get_test_suite_path,
     expected_json_results_from_file,
+    clean_dir,
 )
 from hepcrawl.testlib.tasks import app as celery_app
 from hepcrawl.testlib.utils import get_crawler_instance
@@ -90,10 +90,6 @@ def remove_generated_files(package_location):
             os.unlink(os.path.join(package_location, file_name))
 
 
-def clean_dir(path='/tmp/WSP/'):
-    shutil.rmtree(path, ignore_errors=True)
-
-
 @pytest.mark.parametrize(
     'expected_results',
     [
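Reviewer note (not part of the patch): a minimal sketch of how the new to_hep dispatcher from hepcrawl/crawler2hep.py is expected to be called once this change is applied. The record content, the ftp.example.org URL, and the local path below are made-up illustrations; it is also assumed that hepcrawl with this patch is importable and that get_file_name_from_url returns the file name part of the URL.

    # Illustration only -- made-up data, assumes this patch is installed.
    from hepcrawl.crawler2hep import to_hep

    # A record already in the 'hep' format (as yielded by the DESY spider),
    # whose FFT link still points at the remote file.
    record = {
        'titles': [{'title': 'Example DESY record'}],
        '_fft': [{'path': 'ftp://ftp.example.org/FFT/test_fft_1.txt'}],
    }

    # Map built by the files pipeline: downloaded file name -> local path.
    fft_file_paths = {'test_fft_1.txt': '/tmp/file_urls/full/test_fft_1.txt'}

    # 'hep' items skip the hepcrawl normalization branch; only their '_fft'
    # links are rewritten to point at the locally downloaded copies.
    hep_record = to_hep(record, item_format='hep', fft_file_paths=fft_file_paths)
    print(hep_record['_fft'])  # expected: [{'path': '/tmp/file_urls/full/test_fft_1.txt'}]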