From 54970bf8a4e50c5f8bdbe08f957405acb0aae100 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 6 Jul 2017 17:15:32 +0200 Subject: [PATCH] WIP for desy spider Signed-off-by: Spiros Delviniotis --- .travis.yml | 1 + docker-compose.test.yml | 8 + hepcrawl/crawler2hep.py | 2 +- hepcrawl/pipelines.py | 61 +++- hepcrawl/spiders/desy_spider.py | 129 +++++++++ hepcrawl/spiders/wsp_spider.py | 2 +- hepcrawl/testlib/celery_monitor.py | 21 +- hepcrawl/utils.py | 23 +- setup.py | 1 + tests/functional/arxiv/test_arxiv.py | 1 + .../desy/fixtures/desy_smoke_records.json | 232 +++++++++++++++ .../desy/fixtures/ftp_server/.netrc | 3 + .../DESY/desy_collection_records.xml | 105 +++++++ .../desy_no_namespace_collection_records.xml | 105 +++++++ .../desy/fixtures/ftp_server/pureftpd.passwd | 1 + tests/functional/desy/test_desy.py | 141 +++++++++ tests/functional/wsp/test_wsp.py | 2 + .../in_generic_crawler_record.yaml | 2 +- .../crawler2hep/in_no_document_type.yaml | 2 +- .../desy/desy_collection_records.xml | 105 +++++++ tests/unit/responses/desy/desy_record.xml | 54 ++++ tests/unit/test_arxiv_all.py | 31 +- tests/unit/test_arxiv_single.py | 11 +- tests/unit/test_desy.py | 274 ++++++++++++++++++ tests/unit/test_pos.py | 3 +- tests/unit/test_world_scientific.py | 3 +- 26 files changed, 1267 insertions(+), 56 deletions(-) create mode 100644 hepcrawl/spiders/desy_spider.py create mode 100644 tests/functional/desy/fixtures/desy_smoke_records.json create mode 100644 tests/functional/desy/fixtures/ftp_server/.netrc create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml create mode 100644 tests/functional/desy/fixtures/ftp_server/pureftpd.passwd create mode 100644 tests/functional/desy/test_desy.py create mode 100644 tests/unit/responses/desy/desy_collection_records.xml create mode 100644 tests/unit/responses/desy/desy_record.xml create mode 100644 tests/unit/test_desy.py diff --git a/.travis.yml b/.travis.yml index f05e2d22..91407e6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ env: - SUITE=unit - SUITE=functional_wsp - SUITE=functional_arxiv + - SUITE=functional_desy matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 7ffe0122..85d58d3d 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -34,6 +34,13 @@ services: - scrapyd - ftp_server + functional_desy: + <<: *service_base + command: py.test -vv tests/functional/desy + links: + - scrapyd + - ftp_server + functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv @@ -68,6 +75,7 @@ services: environment: - PUBLICHOST=localhost volumes: + - ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index d6898022..78ac42ec 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -98,7 +98,7 @@ def _filter_affiliation(affiliations): acquisition_source = crawler_record.get('acquisition_source', {}) builder.add_acquisition_source( method=acquisition_source['method'], - date=acquisition_source['date'], + date=acquisition_source['datetime'], source=acquisition_source['source'], submission_number=acquisition_source['submission_number'], ) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 62ba867c..fcc6eef3 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -20,6 +20,12 @@ import requests +from lxml import etree +from dojson.contrib.marc21.utils import create_record + +from inspire_dojson.hep import hep +from inspire_schemas.utils import validate + from .crawler2hep import crawler2hep @@ -50,20 +56,12 @@ def __init__(self): def open_spider(self, spider): self.results_data = [] - def process_item(self, item, spider): - """Convert internal format to INSPIRE data model.""" - self.count += 1 + def _post_enhance_item(self, item, spider): + item = self._generate_record_meta(item, spider) if 'related_article_doi' in item: item['dois'] += item.pop('related_article_doi', []) source = spider.name - item['acquisition_source'] = { - 'source': source, - 'method': 'hepcrawl', - 'date': datetime.datetime.now().isoformat(), - 'submission_number': os.environ.get('SCRAPY_JOB', ''), - } - item['titles'] = [{ 'title': item.pop('title', ''), 'subtitle': item.pop('subtitle', ''), @@ -115,10 +113,49 @@ def process_item(self, item, spider): ]) item = crawler2hep(dict(item)) - spider.logger.debug('Validated item.') - self.results_data.append(item) + spider.logger.debug('Validated item by Builder.') return item + def _read_item_from_marcxml(self, item, spider): # change names and split + def _create_valid_hep_record(str_xml_record): + object_record = create_record(etree.XML(str_xml_record)) + dojson_record = hep.do(object_record) + dojson_record = self._generate_record_meta(dojson_record, spider) + validate(dojson_record, 'hep') + return dojson_record + + list_hep_records = [] + for str_xml_record in item['marcxml']: + hep_record = _create_valid_hep_record(str_xml_record) + spider.logger.debug('Validated hep-record.') + list_hep_records.append(hep_record) + + return list_hep_records + + def _generate_record_meta(self, item, spider): + item['acquisition_source'] = { + 'source': spider.name, + 'method': 'hepcrawl', + 'datetime': datetime.datetime.now().isoformat(), + 'submission_number': os.environ.get('SCRAPY_JOB', ''), + } + return item + + def process_item(self, item, spider): + """Convert internal format to INSPIRE data model.""" + self.count += 1 + + if item.get('marcxml'): + item = self._read_item_from_marcxml(item, spider) + self.results_data.extend(item) + else: + item = self._post_enhance_item(item, spider) + self.results_data.append(item) + + return { + 'dict': item + } + def _prepare_payload(self, spider): """Return payload for push.""" payload = dict( diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py new file mode 100644 index 00000000..3735b664 --- /dev/null +++ b/hepcrawl/spiders/desy_spider.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Spider for DESY.""" + +from __future__ import absolute_import, division, print_function + +import os + +from lxml import etree + + +from scrapy import Request +from scrapy.spiders import Spider + +from ..utils import ( + ftp_list_files, + ftp_connection_info, +) + + +class DesySpider(Spider): + """Desy spider. + + This spider connects to a given FTP hosts and downloads XML files + for extraction into HEP records. + + Examples: + To run a crawl, you need to pass FTP connection information via + ``ftp_host`` and ``ftp_netrc``:: + + $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + + To run a crawl on local folder, you need to pass the absolute ``package_path``:: + + $ scrapy crawl desy -a 'package_path=/path/to/package_dir' + """ + name = 'desy' + custom_settings = {} + start_urls = [] + + def __init__( + self, + package_path=None, + ftp_folder='DESY', + ftp_host=None, + ftp_netrc=None, + *args, + **kwargs + ): + """Constructor of ``Desy`` spider.""" + super(DesySpider, self).__init__(*args, **kwargs) + self.ftp_folder = ftp_folder + self.ftp_host = ftp_host + self.ftp_netrc = ftp_netrc + self.package_path = package_path + self.target_folder = '/tmp/DESY' + if not os.path.exists(self.target_folder): + os.makedirs(self.target_folder) + + def start_requests(self): + """List selected folder on remote FTP and yield files.""" + if self.package_path: + file_names = os.listdir(self.package_path) + + for file_name in file_names: + file_path = os.path.join(self.package_path, file_name) + yield Request( + 'file://{0}'.format(file_path), + callback=self.parse, + ) + else: + ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + + remote_files_paths = ftp_list_files( + self.ftp_folder, + target_folder=self.target_folder, + server=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'], + lst_missing_files=False, + ) + + for remote_file in remote_files_paths: + self.log('Try to crawl file from FTP: {0}'.format(remote_file)) + remote_file = str(remote_file) + ftp_params['ftp_local_filename'] = os.path.join( + self.target_folder, + os.path.basename(remote_file), + ) + remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp, + ) + + def parse(self, response): + """Parse a ``Desy`` XML file into a HEP record.""" + self.log('Got record from url/path: {0}'.format(response.url)) + + list_marcxml_records = self._get_records(response.body) + + return { + 'marcxml': list_marcxml_records, + } + + def handle_package_ftp(self, response): + """Yield every XML file found.""" + self.log('Visited url {}'.format(response.url)) + file_path = response.body + yield Request( + 'file://{0}'.format(file_path), + meta={'package_path': file_path} + ) + + def _get_records(self, response_body): + root = etree.fromstring(response_body) + list_items = root.findall('.//{http://www.loc.gov/MARC21/slim}record') + if not list_items: + list_items = root.findall('.//record') + + return [etree.tostring(item) for item in list_items] diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3f68131f..bef40a72 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -97,7 +97,7 @@ def start_requests(self): new_files_paths = ftp_list_files( self.ftp_folder, - self.target_folder, + target_folder=self.target_folder, server=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 6c720550..7201b68e 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -19,13 +19,14 @@ class CeleryMonitor(object): - def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100): + def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2): self.results = [] self.recv = None self.app = app self.connection = None self.monitor_timeout = monitor_timeout self.monitor_iter_limit = monitor_iter_limit + self.events_limit = events_limit def __enter__(self): state = self.app.events.State() @@ -61,10 +62,16 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.connection.__exit__() def _wait_for_results(self, events_iter): - any(islice( + generator_events = islice( events_iter, # iterable self.monitor_iter_limit # stop - )) + ) + counter = 0 + for dummy in generator_events: + if dummy: + counter += 1 + if counter == self.events_limit: + break @classmethod def do_crawl( @@ -72,6 +79,7 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, + events_limit, crawler_instance, project='hepcrawl', spider='WSP', @@ -80,7 +88,12 @@ def do_crawl( ): settings = settings or {} - with cls(app, monitor_timeout=monitor_timeout, monitor_iter_limit=monitor_iter_limit) as my_monitor: + with cls( + app, + monitor_timeout=monitor_timeout, + monitor_iter_limit=monitor_iter_limit, + events_limit=events_limit + ) as my_monitor: crawler_instance.schedule( project=project, spider=spider, diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 4ad9db3c..6f983492 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -57,17 +57,34 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): +def ftp_list_files( + server_folder, + server, + user, + password, + target_folder=None, + passive_mode=False, + lst_missing_files=True, +): """List files from given FTP's server folder to target folder.""" session_factory = ftputil.session.session_factory( base_class=ftplib.FTP, port=21, use_passive_mode=passive_mode, - encrypt_data_channel=True) + encrypt_data_channel=True, + ) with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: file_names = host.listdir(os.path.join(host.curdir, '/', server_folder)) - return list_missing_files(server_folder, target_folder, file_names) + if lst_missing_files: + return list_missing_files(server_folder, target_folder, file_names) + else: + return [ + os.path.join( + server_folder, + file_name + ) for file_name in file_names + ] def local_list_files(local_folder, target_folder): diff --git a/setup.py b/setup.py index 7a2b454a..2c6736a5 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ install_requires = [ 'autosemver~=0.2', 'inspire-schemas~=41.0', + 'inspire-dojson~=41.0', 'Scrapy>=1.1.0', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index a9677b89..0f58b17d 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -72,6 +72,7 @@ def test_arxiv(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='arXiv', diff --git a/tests/functional/desy/fixtures/desy_smoke_records.json b/tests/functional/desy/fixtures/desy_smoke_records.json new file mode 100644 index 00000000..2b241e5a --- /dev/null +++ b/tests/functional/desy/fixtures/desy_smoke_records.json @@ -0,0 +1,232 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "control_number": 333333, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/333333" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "control_number": 444444, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/444444" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/.netrc b/tests/functional/desy/fixtures/ftp_server/.netrc new file mode 100644 index 00000000..59a152f7 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/.netrc @@ -0,0 +1,3 @@ +machine ftp_server +login bob +password bob diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml new file mode 100644 index 00000000..5a57f51c --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml @@ -0,0 +1,105 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml new file mode 100644 index 00000000..44266cd4 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml @@ -0,0 +1,105 @@ + + + + 333333 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + + 444444 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd new file mode 100644 index 00000000..275a727c --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd @@ -0,0 +1 @@ +bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py new file mode 100644 index 00000000..6f1c2686 --- /dev/null +++ b/tests/functional/desy/test_desy.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for Desy spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.fixture(scope="function") +def set_up_ftp_environment(): + netrc_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + '.netrc', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). + sleep(10) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'ftp_host': 'ftp_server', + 'ftp_netrc': netrc_location, + } + } + + +@pytest.fixture(scope="function") +def set_up_local_environment(): + package_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + test_suite='functional', + ) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'package_path': package_location, + } + } + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_smoke_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_ftp(set_up_ftp_environment, expected_results): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_smoke_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_local_package_path(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 70996466..ea38582a 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -114,6 +114,7 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', @@ -147,6 +148,7 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='WSP', diff --git a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml b/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml index 4e80ba6b..1ade2b4b 100644 --- a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml +++ b/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml @@ -3,7 +3,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/crawler2hep/in_no_document_type.yaml b/tests/unit/responses/crawler2hep/in_no_document_type.yaml index 22b93fd0..21543c36 100644 --- a/tests/unit/responses/crawler2hep/in_no_document_type.yaml +++ b/tests/unit/responses/crawler2hep/in_no_document_type.yaml @@ -5,7 +5,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/desy/desy_collection_records.xml b/tests/unit/responses/desy/desy_collection_records.xml new file mode 100644 index 00000000..5a57f51c --- /dev/null +++ b/tests/unit/responses/desy/desy_collection_records.xml @@ -0,0 +1,105 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + \ No newline at end of file diff --git a/tests/unit/responses/desy/desy_record.xml b/tests/unit/responses/desy/desy_record.xml new file mode 100644 index 00000000..8219064f --- /dev/null +++ b/tests/unit/responses/desy/desy_record.xml @@ -0,0 +1,54 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + \ No newline at end of file diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index bd75e5a4..063807f8 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -11,7 +11,8 @@ import pytest -from scrapy.crawler import Crawler +from scrapy.crawler import Crawler +from scrapy.http import TextResponse from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider @@ -25,34 +26,14 @@ def spider(): return spider -@pytest.fixture -def one_result(spider): - """Return results generator from the arxiv spider. Tricky fields, one - record. - """ - from scrapy.http import TextResponse - - records = list( - spider.parse( - fake_response_from_file( - 'arxiv/sample_arxiv_record0.xml', - response_type=TextResponse, - ) - ) - ) - - assert records - pipeline = InspireCeleryPushPipeline() - pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] - - @pytest.fixture def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ - from scrapy.http import TextResponse + def _get_processed_item(record, spider): + dict_item = pipeline.process_item(record, spider) + return dict_item['dict'] records = list( spider.parse( @@ -66,7 +47,7 @@ def many_results(spider): assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] + return [_get_processed_item(record, spider) for record in records] def test_page_nr(many_results): diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index a6ed66d6..c9a91125 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -24,6 +24,10 @@ def results(): """Return results generator from the arxiv spider. All fields, one record. """ + def _get_processed_item(record, spider): + dict_item = pipeline.process_item(record, spider) + validate(dict_item['dict'], 'hep') + return dict_item['dict'] crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) @@ -39,13 +43,8 @@ def results(): assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - processed_records = [] - for record in records: - processed_record = pipeline.process_item(record, spider) - validate(processed_record, 'hep') - processed_records.append(processed_record) - return processed_records + return [_get_processed_item(record, spider) for record in records] diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py new file mode 100644 index 00000000..ce46aaf1 --- /dev/null +++ b/tests/unit/test_desy.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from __future__ import absolute_import, division, print_function + +import pytest +import os + +from scrapy.crawler import Crawler +from scrapy.http import TextResponse + +from hepcrawl.pipelines import InspireCeleryPushPipeline +from hepcrawl.spiders import desy_spider + +from hepcrawl.testlib.fixtures import fake_response_from_file + + +def create_spider(): + crawler = Crawler(spidercls=desy_spider.DesySpider) + return desy_spider.DesySpider.from_crawler(crawler) + + +def get_records(response_file_name): + """Return all results generator from the ``Desy`` spider via pipelines.""" + # environmental variables needed for the pipelines payload + os.environ['SCRAPY_JOB'] = 'scrapy_job' + os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri' + os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file' + + spider = create_spider() + records = spider.parse( + fake_response_from_file( + file_name=response_file_name, + response_type=TextResponse + ) + ) + + pipeline = InspireCeleryPushPipeline() + pipeline.open_spider(spider) + + return ( + pipeline.process_item( + { + dummy: record + }, + spider + ) for dummy, record in records.items() + ) + + +def get_one_record(response_file_name): + results = get_records(response_file_name) + dict_results = results.next() + return dict_results['dict'] + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-05-04T17:49:07.975168' + record['acquisition_source']['submission_number'] = '5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.mark.parametrize( + 'generated_record', + [ + get_one_record('desy/desy_record.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_record(generated_record): + expected = [{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + }] + + generated_results = [override_generated_fields(rec) for rec in generated_record] + + assert generated_results == expected + + +@pytest.mark.parametrize( + 'generated_record', + [ + get_one_record('desy/desy_collection_records.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_collection_records(generated_record): + expected = [{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + }, + { + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + } + ] + + generated_results = [override_generated_fields(rec) for rec in generated_record] + + assert generated_results == expected diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 20c872f4..c4c02a4e 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -52,7 +52,8 @@ def record(scrape_pos_page_body): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) record = request.callback(response) - return pipeline.process_item(record, spider) + processed_record = pipeline.process_item(record, spider) + return processed_record['dict'] def test_titles(record): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 36438ab4..2033d192 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -49,7 +49,8 @@ def get_records(response_file_name): def get_one_record(response_file_name): results = get_records(response_file_name) - return results.next() + dict_results = results.next() + return dict_results['dict'] def override_generated_fields(record):