From 07cf5e6712032fcbc54df815c12aaceb12724486 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 6 Jul 2017 17:15:32 +0200 Subject: [PATCH] WIP for desy spider Signed-off-by: Spiros Delviniotis --- .travis.yml | 1 + docker-compose.test.yml | 8 + hepcrawl/pipelines.py | 35 ++++- hepcrawl/spiders/desy_spider.py | 120 +++++++++++++++ hepcrawl/spiders/wsp_spider.py | 2 +- hepcrawl/utils.py | 23 ++- setup.py | 1 + .../desy/fixtures/desy_smoke_records.json | 58 ++++++++ .../desy/fixtures/ftp_server/.netrc | 3 + .../fixtures/ftp_server/DESY/desy_smoke.xml | 56 +++++++ .../desy/fixtures/ftp_server/pureftpd.passwd | 1 + tests/functional/desy/test_desy.py | 139 ++++++++++++++++++ 12 files changed, 439 insertions(+), 8 deletions(-) create mode 100644 hepcrawl/spiders/desy_spider.py create mode 100644 tests/functional/desy/fixtures/desy_smoke_records.json create mode 100644 tests/functional/desy/fixtures/ftp_server/.netrc create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml create mode 100644 tests/functional/desy/fixtures/ftp_server/pureftpd.passwd create mode 100644 tests/functional/desy/test_desy.py diff --git a/.travis.yml b/.travis.yml index f05e2d22..91407e6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ env: - SUITE=unit - SUITE=functional_wsp - SUITE=functional_arxiv + - SUITE=functional_desy matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 7ffe0122..85d58d3d 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -34,6 +34,13 @@ services: - scrapyd - ftp_server + functional_desy: + <<: *service_base + command: py.test -vv tests/functional/desy + links: + - scrapyd + - ftp_server + functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv @@ -68,6 +75,7 @@ services: environment: - PUBLICHOST=localhost volumes: + - ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 62ba867c..22a066e1 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -20,6 +20,12 @@ import requests +from lxml import etree +from dojson.contrib.marc21.utils import create_record + +from inspire_dojson.hep import hep +from inspire_schemas.utils import validate + from .crawler2hep import crawler2hep @@ -50,9 +56,7 @@ def __init__(self): def open_spider(self, spider): self.results_data = [] - def process_item(self, item, spider): - """Convert internal format to INSPIRE data model.""" - self.count += 1 + def _post_enhance_item(self, item, spider): if 'related_article_doi' in item: item['dois'] += item.pop('related_article_doi', []) @@ -115,7 +119,30 @@ def process_item(self, item, spider): ]) item = crawler2hep(dict(item)) - spider.logger.debug('Validated item.') + spider.logger.debug('Validated item by Builder.') + return item + + def _read_item_from_marcxml(self, item, spider): # change names and split + item = etree.XML(item['desy_item']) + record_dojson = create_record(item) + record_valid = hep.do(record_dojson) + record_valid['acquisition_source'] = { + 'source': spider.name, + 'method': 'hepcrawl', + } + validate(record_valid, 'hep') + spider.logger.debug('Validated item by Dojson.') + return record_valid + + def process_item(self, item, spider): + """Convert internal format to INSPIRE data model.""" + self.count += 1 + + if spider.name == 'desy': # change logic for marcxml and spider name + item = self._read_item_from_marcxml(item, spider) + else: + item = self._post_enhance_item(item, spider) + self.results_data.append(item) return item diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py new file mode 100644 index 00000000..3584ab7a --- /dev/null +++ b/hepcrawl/spiders/desy_spider.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Spider for DESY.""" + +from __future__ import absolute_import, division, print_function + +import os +import urlparse + +from scrapy import Request +from scrapy.spiders import Spider + +from ..utils import ( + ftp_list_files, + ftp_connection_info, +) + + +class DesySpider(Spider): + """Desy spider. + + This spider connects to a given FTP hosts and downloads XML files + for extraction into HEP records. + + Examples: + To run a crawl, you need to pass FTP connection information via + ``ftp_host`` and ``ftp_netrc``:: + + $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + + To run a crawl on local folder, you need to pass the absolute ``package_path``:: + + $ scrapy crawl desy -a 'package_path=/path/to/package_dir' + """ + name = 'desy' + custom_settings = {} + start_urls = [] + itertag = 'article' + + def __init__( + self, + package_path=None, + ftp_folder='DESY', + ftp_host=None, + ftp_netrc=None, + *args, + **kwargs + ): + """Constructor of ``Desy`` spider.""" + super(DesySpider, self).__init__(*args, **kwargs) + self.ftp_folder = ftp_folder + self.ftp_host = ftp_host + self.ftp_netrc = ftp_netrc + self.package_path = package_path + self.target_folder = '/tmp/DESY' + if not os.path.exists(self.target_folder): + os.makedirs(self.target_folder) + + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.package_path: + file_names = os.listdir(self.package_path) + + for file_name in file_names: + file_path = os.path.join(self.package_path, file_name) + yield Request( + 'file://{0}'.format(file_path), + callback=self.parse, + ) + else: + ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + + remote_files_paths = ftp_list_files( + self.ftp_folder, + target_folder=self.target_folder, + server=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'], + lst_missing_files=False, + ) + + for remote_file in remote_files_paths: + self.log('Try to crawl file from FTP: {0}'.format(remote_file)) + remote_file = str(remote_file) + ftp_params['ftp_local_filename'] = os.path.join( + self.target_folder, + os.path.basename(remote_file), + ) + remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp, + ) + + def parse(self, response): + """Parse a ``Desy`` XML file into a HEP record.""" + self.log('Got record from url/path: {0}'.format(response.url)) + + item = { + 'desy_item': response.body, + } + return item + + def handle_package_ftp(self, response): + """Handle a zip package and yield every XML found.""" + self.log('Visited url %s' % response.url) + self.log('response.body: %s' % response.body) + filepath = response.body + yield Request( + 'file://{0}'.format(filepath), + meta={'package_path': filepath} + ) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3f68131f..bef40a72 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -97,7 +97,7 @@ def start_requests(self): new_files_paths = ftp_list_files( self.ftp_folder, - self.target_folder, + target_folder=self.target_folder, server=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 4ad9db3c..6f983492 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -57,17 +57,34 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): +def ftp_list_files( + server_folder, + server, + user, + password, + target_folder=None, + passive_mode=False, + lst_missing_files=True, +): """List files from given FTP's server folder to target folder.""" session_factory = ftputil.session.session_factory( base_class=ftplib.FTP, port=21, use_passive_mode=passive_mode, - encrypt_data_channel=True) + encrypt_data_channel=True, + ) with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: file_names = host.listdir(os.path.join(host.curdir, '/', server_folder)) - return list_missing_files(server_folder, target_folder, file_names) + if lst_missing_files: + return list_missing_files(server_folder, target_folder, file_names) + else: + return [ + os.path.join( + server_folder, + file_name + ) for file_name in file_names + ] def local_list_files(local_folder, target_folder): diff --git a/setup.py b/setup.py index 7a2b454a..2c6736a5 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ install_requires = [ 'autosemver~=0.2', 'inspire-schemas~=41.0', + 'inspire-dojson~=41.0', 'Scrapy>=1.1.0', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', diff --git a/tests/functional/desy/fixtures/desy_smoke_records.json b/tests/functional/desy/fixtures/desy_smoke_records.json new file mode 100644 index 00000000..e4418a00 --- /dev/null +++ b/tests/functional/desy/fixtures/desy_smoke_records.json @@ -0,0 +1,58 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "control_number": 1608652, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/1608652" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/.netrc b/tests/functional/desy/fixtures/ftp_server/.netrc new file mode 100644 index 00000000..59a152f7 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/.netrc @@ -0,0 +1,3 @@ +machine ftp_server +login bob +password bob diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml new file mode 100644 index 00000000..17dd16b5 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml @@ -0,0 +1,56 @@ + + + + 1608652 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd new file mode 100644 index 00000000..275a727c --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd @@ -0,0 +1 @@ +bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py new file mode 100644 index 00000000..0e6d8562 --- /dev/null +++ b/tests/functional/desy/test_desy.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for Desy spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.fixture(scope="function") +def set_up_ftp_environment(): + netrc_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + '.netrc', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). + sleep(10) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'ftp_host': 'ftp_server', + 'ftp_netrc': netrc_location, + } + } + + +@pytest.fixture(scope="function") +def set_up_local_environment(): + package_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + test_suite='functional', + ) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'package_path': package_location, + } + } + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_smoke_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_ftp(set_up_ftp_environment, expected_results): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_smoke_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_local_package_path(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results