From 840f18a5fe5b9ed81b31e00fd2851105fd4fdf25 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 6 Jul 2017 17:15:32 +0200 Subject: [PATCH] WIP for desy spider Signed-off-by: Spiros Delviniotis --- .travis.yml | 1 + docker-compose.test.yml | 11 + hepcrawl/crawler2hep.py | 143 ++++++- hepcrawl/pipelines.py | 142 +++---- hepcrawl/settings.py | 2 +- hepcrawl/spiders/alpha_spider.py | 12 +- hepcrawl/spiders/aps_spider.py | 15 +- hepcrawl/spiders/arxiv_spider.py | 15 +- hepcrawl/spiders/base_spider.py | 15 +- hepcrawl/spiders/brown_spider.py | 14 +- hepcrawl/spiders/desy_spider.py | 185 +++++++++ hepcrawl/spiders/dnb_spider.py | 15 +- hepcrawl/spiders/edp_spider.py | 17 +- hepcrawl/spiders/elsevier_spider.py | 10 +- hepcrawl/spiders/hindawi_spider.py | 12 +- hepcrawl/spiders/infn_spider.py | 13 +- hepcrawl/spiders/iop_spider.py | 8 +- hepcrawl/spiders/magic_spider.py | 12 +- hepcrawl/spiders/mit_spider.py | 13 +- hepcrawl/spiders/phenix_spider.py | 8 +- hepcrawl/spiders/phil_spider.py | 13 +- hepcrawl/spiders/pos_spider.py | 18 +- hepcrawl/spiders/t2k_spider.py | 12 +- hepcrawl/spiders/wsp_spider.py | 15 +- hepcrawl/testlib/celery_monitor.py | 21 +- hepcrawl/testlib/fixtures.py | 16 + hepcrawl/utils.py | 59 ++- setup.py | 1 + tests/functional/arxiv/test_arxiv.py | 1 + .../desy/fixtures/desy_ftp_records.json | 308 ++++++++++++++ .../desy/fixtures/desy_local_records.json | 308 ++++++++++++++ .../desy/fixtures/ftp_server/.netrc | 3 + .../ftp_server/DESY/FFT/test_fft_1.txt | 1 + .../ftp_server/DESY/FFT/test_fft_2.txt | 1 + .../DESY/desy_collection_records.xml | 149 +++++++ .../desy_no_namespace_collection_records.xml | 149 +++++++ .../ftp_server/DESY/file_not_for_download.txt | 1 + .../desy/fixtures/ftp_server/pureftpd.passwd | 1 + tests/functional/desy/test_desy.py | 212 ++++++++++ tests/functional/wsp/test_wsp.py | 8 +- .../in_generic_crawler_record.yaml | 2 +- .../crawler2hep/in_no_document_type.yaml | 2 +- .../desy/desy_collection_records.xml | 149 +++++++ tests/unit/responses/desy/desy_record.xml | 76 ++++ tests/unit/test_alpha.py | 4 +- tests/unit/test_aps.py | 4 +- tests/unit/test_arxiv_all.py | 35 +- tests/unit/test_arxiv_single.py | 14 +- tests/unit/test_base.py | 18 +- tests/unit/test_brown.py | 12 +- tests/unit/test_desy.py | 381 ++++++++++++++++++ tests/unit/test_dnb.py | 10 +- tests/unit/test_edp.py | 51 ++- tests/unit/test_elsevier.py | 35 +- tests/unit/test_hindawi.py | 7 +- tests/unit/test_infn.py | 13 +- tests/unit/test_iop.py | 15 +- tests/unit/test_magic.py | 19 +- tests/unit/test_mit.py | 20 +- tests/unit/test_phenix.py | 12 +- tests/unit/test_phil.py | 19 +- tests/unit/test_pos.py | 6 +- tests/unit/test_t2k.py | 14 +- tests/unit/test_world_scientific.py | 3 +- 64 files changed, 2631 insertions(+), 250 deletions(-) create mode 100644 hepcrawl/spiders/desy_spider.py create mode 100644 tests/functional/desy/fixtures/desy_ftp_records.json create mode 100644 tests/functional/desy/fixtures/desy_local_records.json create mode 100644 tests/functional/desy/fixtures/ftp_server/.netrc create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt create mode 100644 tests/functional/desy/fixtures/ftp_server/pureftpd.passwd create mode 100644 tests/functional/desy/test_desy.py create mode 100644 tests/unit/responses/desy/desy_collection_records.xml create mode 100644 tests/unit/responses/desy/desy_record.xml create mode 100644 tests/unit/test_desy.py diff --git a/.travis.yml b/.travis.yml index f05e2d22..91407e6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ env: - SUITE=unit - SUITE=functional_wsp - SUITE=functional_arxiv + - SUITE=functional_desy matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 7ffe0122..d14d7c87 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -17,6 +17,7 @@ services: - APP_CELERY_RESULT_BACKEND=amqp://guest:guest@rabbitmq:5672// - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results + - APP_FILES_STORE=/tmp/file_urls - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -26,6 +27,7 @@ services: - ${PWD}:/code/ - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP + - /tmp/file_urls:/tmp/file_urls functional_wsp: <<: *service_base @@ -34,6 +36,13 @@ services: - scrapyd - ftp_server + functional_desy: + <<: *service_base + command: py.test -vv tests/functional/desy + links: + - scrapyd + - ftp_server + functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv @@ -68,6 +77,8 @@ services: environment: - PUBLICHOST=localhost volumes: + - ${PWD}/tests/functional/desy/fixtures/ftp_server/FFT:/home/ftpusers/bob/FFT + - ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index d6898022..add53f70 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -15,8 +15,149 @@ from __future__ import absolute_import, division, print_function +import os +import datetime + from inspire_schemas.api import LiteratureBuilder +from hepcrawl.utils import get_file_name_from_url + + +def _update_record_fft(record, index_fft_file_paths): + def _update_fft_fields(fft_fields, index_fft_file_paths): + new_fft_fields = [] + for fft_field in fft_fields: + file_name = get_file_name_from_url(fft_field['path']) + if file_name in index_fft_file_paths: + fft_field['path'] = index_fft_file_paths[file_name] + new_fft_fields.append(fft_field) + + return new_fft_fields + + record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths) + return record + + +def _has_publication_info(item): + """If any publication info.""" + return item.get('pubinfo_freetext') or item.get('journal_volume') or \ + item.get('journal_title') or \ + item.get('journal_year') or \ + item.get('journal_issue') or \ + item.get('journal_fpage') or \ + item.get('journal_lpage') or \ + item.get('journal_artid') or \ + item.get('journal_doctype') + + +def _filter_fields(item, keys): + """Filter away keys.""" + for key in keys: + item.pop(key, None) + + +def _normalize_hepcrawl_record(item, source): + if 'related_article_doi' in item: + item['dois'] += item.pop('related_article_doi', []) + + item['titles'] = [{ + 'title': item.pop('title', ''), + 'subtitle': item.pop('subtitle', ''), + 'source': source, + }] + + item['abstracts'] = [{ + 'value': item.pop('abstract', ''), + 'source': source, + }] + + item['imprints'] = [{ + 'date': item.pop('date_published', ''), + }] + + item['copyright'] = [{ + 'holder': item.pop('copyright_holder', ''), + 'year': item.pop('copyright_year', ''), + 'statement': item.pop('copyright_statement', ''), + 'material': item.pop('copyright_material', ''), + }] + + if _has_publication_info(item): + item['publication_info'] = [{ + 'journal_title': item.pop('journal_title', ''), + 'journal_volume': item.pop('journal_volume', ''), + 'journal_issue': item.pop('journal_issue', ''), + 'artid': item.pop('journal_artid', ''), + 'page_start': item.pop('journal_fpage', ''), + 'page_end': item.pop('journal_lpage', ''), + 'note': item.pop('journal_doctype', ''), + 'pubinfo_freetext': item.pop('pubinfo_freetext', ''), + 'pubinfo_material': item.pop('pubinfo_material', ''), + }] + if item.get('journal_year'): + item['publication_info'][0]['year'] = int( + item.pop('journal_year') + ) + + # Remove any fields + _filter_fields(item, [ + 'journal_title', + 'journal_volume', + 'journal_year', + 'journal_issue', + 'journal_fpage', + 'journal_lpage', + 'journal_doctype', + 'journal_artid', + 'pubinfo_freetext', + 'pubinfo_material', + ]) + + return item + + +def _generate_acquisition_source(crawler_record, source): + crawler_record['acquisition_source'] = { + 'source': source, + 'method': 'hepcrawl', + 'datetime': datetime.datetime.now().isoformat(), + 'submission_number': os.environ.get('SCRAPY_JOB', ''), + } + return crawler_record + + +def to_hep( + item, + source, + item_format='hepcrawl', + fft_file_paths=None, +): + item = _generate_acquisition_source( + crawler_record=item, + source=source, + ) + + if item_format == 'hep': + return hep2hep( + crawler_record=item, + fft_file_paths=fft_file_paths, + ) + elif item_format == 'hepcrawl': + item = _normalize_hepcrawl_record( + item=item, + source=source, + ) + return crawler2hep(dict(item)) + else: + raise Exception('Unknown item_format::{}'.format(item_format)) + + +def hep2hep(crawler_record, fft_file_paths): + if fft_file_paths: + crawler_record = _update_record_fft(crawler_record, fft_file_paths) + + return crawler_record + def crawler2hep(crawler_record): @@ -98,7 +239,7 @@ def _filter_affiliation(affiliations): acquisition_source = crawler_record.get('acquisition_source', {}) builder.add_acquisition_source( method=acquisition_source['method'], - date=acquisition_source['date'], + date=acquisition_source['datetime'], source=acquisition_source['source'], submission_number=acquisition_source['submission_number'], ) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 62ba867c..05b61361 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -15,30 +15,55 @@ from __future__ import absolute_import, division, print_function -import datetime import os import requests -from .crawler2hep import crawler2hep +from scrapy import Request +from scrapy.pipelines.files import FilesPipeline +from inspire_schemas.utils import validate -def has_publication_info(item): - """If any publication info.""" - return item.get('pubinfo_freetext') or item.get('journal_volume') or \ - item.get('journal_title') or \ - item.get('journal_year') or \ - item.get('journal_issue') or \ - item.get('journal_fpage') or \ - item.get('journal_lpage') or \ - item.get('journal_artid') or \ - item.get('journal_doctype') +from hepcrawl.crawler2hep import to_hep +from hepcrawl.settings import FILES_STORE +from hepcrawl.utils import get_file_name_from_url -def filter_fields(item, keys): - """Filter away keys.""" - for key in keys: - item.pop(key, None) +class FftFilesPipeline(FilesPipeline): + """Download all the FFT files provided by record.""" + + def __init__(self, *args, **kwargs): + super(FftFilesPipeline, self).__init__(FILES_STORE) + + def get_media_requests(self, item, info): + """Download FFT files using FTP.""" + if item.get('file_urls'): + for fft_url in item.file_urls: + yield Request( + url=fft_url, + meta=item.ftp_params, + ) + + def item_completed(self, results, item, info): + """Create a map that connects file names with downloaded files.""" + def _get_absolute_local_file_path(path): + return os.path.abspath( + os.path.join( + FILES_STORE, + path + ) + ) + + map_file_names_paths = {} + for ok, result_data in results: + if ok: + map_file_names_paths[ + get_file_name_from_url(result_data['url']) + ] = _get_absolute_local_file_path(result_data['path']) + + item.file_paths = map_file_names_paths + + return item class InspireAPIPushPipeline(object): @@ -50,74 +75,31 @@ def __init__(self): def open_spider(self, spider): self.results_data = [] + def _post_enhance_item(self, item, spider): + fft_file_paths = item.file_paths + item_format = item.item_format + item = item.item if item.item else item + source = spider.name + + return to_hep( + item=item, + source=source, + item_format=item_format, + fft_file_paths=fft_file_paths, + ) + def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" self.count += 1 - if 'related_article_doi' in item: - item['dois'] += item.pop('related_article_doi', []) - source = spider.name - item['acquisition_source'] = { - 'source': source, - 'method': 'hepcrawl', - 'date': datetime.datetime.now().isoformat(), - 'submission_number': os.environ.get('SCRAPY_JOB', ''), - } - - item['titles'] = [{ - 'title': item.pop('title', ''), - 'subtitle': item.pop('subtitle', ''), - 'source': source, - }] - item['abstracts'] = [{ - 'value': item.pop('abstract', ''), - 'source': source, - }] - item['imprints'] = [{ - 'date': item.pop('date_published', ''), - }] - item['copyright'] = [{ - 'holder': item.pop('copyright_holder', ''), - 'year': item.pop('copyright_year', ''), - 'statement': item.pop('copyright_statement', ''), - 'material': item.pop('copyright_material', ''), - }] - if not item.get('publication_info'): - if has_publication_info(item): - item['publication_info'] = [{ - 'journal_title': item.pop('journal_title', ''), - 'journal_volume': item.pop('journal_volume', ''), - 'journal_issue': item.pop('journal_issue', ''), - 'artid': item.pop('journal_artid', ''), - 'page_start': item.pop('journal_fpage', ''), - 'page_end': item.pop('journal_lpage', ''), - 'note': item.pop('journal_doctype', ''), - 'pubinfo_freetext': item.pop('pubinfo_freetext', ''), - 'pubinfo_material': item.pop('pubinfo_material', ''), - }] - if item.get('journal_year'): - item['publication_info'][0]['year'] = int( - item.pop('journal_year') - ) - - # Remove any fields - filter_fields(item, [ - 'journal_title', - 'journal_volume', - 'journal_year', - 'journal_issue', - 'journal_fpage', - 'journal_lpage', - 'journal_doctype', - 'journal_artid', - 'pubinfo_freetext', - 'pubinfo_material', - ]) - - item = crawler2hep(dict(item)) - spider.logger.debug('Validated item.') - self.results_data.append(item) - return item + hep_item = self._post_enhance_item(item, spider) + + validate(hep_item, 'hep') + spider.logger.debug('Validated item by Inspire Schemas.') + + self.results_data.append(hep_item) + + return hep_item def _prepare_payload(self, spider): """Return payload for push.""" diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 71dcfc75..bd16d8cd 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -85,7 +85,7 @@ # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'scrapy.pipelines.files.FilesPipeline': 1, + 'hepcrawl.pipelines.FftFilesPipeline': 1, 'hepcrawl.pipelines.InspireCeleryPushPipeline': 300, } diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index 2ab883f3..ab151fa6 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -20,7 +20,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import has_numbers +from hepcrawl.utils import ( + has_numbers, + ParsedItem, +) class AlphaSpider(CrawlSpider): @@ -145,4 +148,9 @@ def parse(self, response): record.add_value('source', 'Alpha experiment') record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 496e2e8e..d15c690a 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -20,7 +20,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_licenses, get_nested, build_dict +from hepcrawl.utils import ( + get_licenses, + get_nested, + build_dict, + ParsedItem, +) class APSSpider(Spider): @@ -110,7 +115,13 @@ def parse(self, response): record.add_value('license', license) record.add_value('collections', ['HEP', 'Citeable', 'Published']) - yield record.load_item() + + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + yield parsed_item # Pagination support. Will yield until no more "next" pages are found if 'Link' in response.headers: diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index d82c8318..8ab0af4f 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -17,7 +17,12 @@ from scrapy.spiders import XMLFeedSpider from ..mappings import CONFERENCE_WORDS, THESIS_WORDS -from ..utils import coll_cleanforthe, get_licenses, split_fullname +from hepcrawl.utils import ( + coll_cleanforthe, + get_licenses, + split_fullname, + ParsedItem, +) from ..items import HEPRecord from ..loaders import HEPLoader @@ -110,8 +115,12 @@ def parse_node(self, response, node): ) record.add_value('license', license) - parsed_record = dict(record.load_item()) - return parsed_record + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item def _get_authors_or_collaboration(self, node): """Parse authors, affiliations; extract collaboration""" diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 5eb22eb7..ee3a7d47 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -18,7 +18,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_mime_type, parse_domain, get_node +from hepcrawl.utils import ( + get_mime_type, + parse_domain, + get_node, + ParsedItem, +) class BaseSpider(XMLFeedSpider): @@ -192,7 +197,13 @@ def build_item(self, response): record.add_value("authors", self.get_authors(node)) record.add_value('thesis', {'degree_type': 'PhD'}) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item def scrape_for_pdf(self, response): """Scrape splash page for any links to PDFs. diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index 6c881252..ee22a3eb 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -21,7 +21,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname, parse_domain, get_mime_type +from hepcrawl.utils import ( + split_fullname, + parse_domain, + get_mime_type, + ParsedItem, +) class BrownSpider(CrawlSpider): @@ -219,4 +224,9 @@ def build_item(self, response): record.add_value('thesis', response.meta.get("thesis")) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py new file mode 100644 index 00000000..5ec79da8 --- /dev/null +++ b/hepcrawl/spiders/desy_spider.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Spider for DESY.""" + +from __future__ import absolute_import, division, print_function + +import os + +from lxml import etree +from dojson.contrib.marc21.utils import create_record + +from scrapy import Request +from scrapy.spiders import Spider + +from inspire_dojson.hep import hep + +from hepcrawl.utils import ( + ftp_list_files, + ftp_connection_info, + get_absolute_file_path, + get_file_name_from_url, + ParsedItem, +) + + +class DesySpider(Spider): + """Desy spider. + + This spider connects to a given FTP hosts and downloads XML files + for extraction into HEP records. + + Examples: + To run a crawl, you need to pass FTP connection information via + ``ftp_host`` and ``ftp_netrc``:: + + $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + + To run a crawl on local folder, you need to pass the absolute ``package_path``:: + + $ scrapy crawl desy -a 'package_path=/path/to/package_dir' + """ + name = 'desy' + custom_settings = {} + start_urls = [] + + def __init__( + self, + package_path=None, + ftp_folder='DESY', + ftp_host=None, + ftp_netrc=None, + *args, + **kwargs + ): + """Constructor of ``Desy`` spider.""" + super(DesySpider, self).__init__(*args, **kwargs) + self.ftp_folder = ftp_folder + self.ftp_host = ftp_host + self.ftp_netrc = ftp_netrc + self.package_path = package_path + self.target_folder = '/tmp/DESY' + self.ftp_enabled = True if self.ftp_host else False + if not os.path.exists(self.target_folder): + os.makedirs(self.target_folder) + + def start_requests(self): + """List selected folder on remote FTP and yield files.""" + def _list_xml_files_paths(list_files_paths): + return [ + xml_file + for xml_file in list_files_paths + if xml_file.endswith('.xml') + ] + + if self.package_path: + file_names = os.listdir(self.package_path) + xml_file_names = _list_xml_files_paths(file_names) + + for file_name in xml_file_names: + file_path = os.path.join(self.package_path, file_name) + self.log('Local: Try to crawl local file: {0}'.format(file_path)) + yield Request( + 'file://{0}'.format(file_path), + callback=self.parse, + ) + else: + ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + + remote_files_paths = ftp_list_files( + self.ftp_folder, + target_folder=self.target_folder, + server=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'], + lst_missing_files=False, + ) + + xml_remote_files_paths = _list_xml_files_paths(remote_files_paths) + + for remote_file in xml_remote_files_paths: + self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file)) + remote_file = str(remote_file) + ftp_params['ftp_local_filename'] = os.path.join( + self.target_folder, + os.path.basename(remote_file), + ) + remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp, + ) + + def parse(self, response): + """Parse a ``Desy`` XML file into a HEP record.""" + self.log('Got record from url/path: {0}'.format(response.url)) + self.log('FTP enabled: {0}'.format(self.ftp_enabled)) + ftp_params = None + + if self.ftp_enabled: + ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + prefix_url = '{0}://{1}/'.format('ftp', ftp_host) + else: + prefix_url = '{0}://{1}'.format( + 'file', + '/code/tests/functional/desy/fixtures/ftp_server/', # Temporary - Must be absolute path + ) + + marcxml_records = self._get_marcxml_records(response.body) + hep_records = self._hep_records_from_marcxml(marcxml_records) + + list_fft_old_links = [] + for hep_record in hep_records: + list_fft_old_links.extend(hep_record['_fft']) + + list_file_urls = [ + '{0}{1}'.format(prefix_url, fft_link['path']) + for fft_link in hep_record['_fft'] + ] + + parsed_item = ParsedItem( + item=hep_record, + file_urls=list_file_urls, + ftp_params=ftp_params, + item_format='hep', + ) + + yield parsed_item + + def handle_package_ftp(self, response): + """Yield every XML file found.""" + self.log('Visited url {}'.format(response.url)) + file_path = response.body + yield Request( + 'file://{0}'.format(file_path), + meta={'package_path': file_path} + ) + + def _get_marcxml_records(self, response_body): + root = etree.fromstring(response_body) + list_items = root.findall('.//{http://www.loc.gov/MARC21/slim}record') + if not list_items: + list_items = root.findall('.//record') + + return [etree.tostring(item) for item in list_items] + + def _hep_records_from_marcxml(self, list_marcxml_records): + def _create_json_record(str_xml_record): + object_record = create_record(etree.XML(str_xml_record)) + dojson_record = hep.do(object_record) + return dojson_record + + list_hep_records = [] + for str_xml_record in list_marcxml_records: + json_record = _create_json_record(str_xml_record) + list_hep_records.append(json_record) + + return list_hep_records diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py index 3ac8b901..3dd50b59 100644 --- a/hepcrawl/spiders/dnb_spider.py +++ b/hepcrawl/spiders/dnb_spider.py @@ -16,7 +16,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_mime_type, parse_domain, get_node +from hepcrawl.utils import ( + get_mime_type, + parse_domain, + get_node, + ParsedItem, +) class DNBSpider(XMLFeedSpider): @@ -219,4 +224,10 @@ def build_item(self, response): record.add_value('thesis', {'degree_type': 'PhD'}) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index beea699d..499e3edc 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -22,7 +22,7 @@ from ..extractors.jats import Jats from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ( +from hepcrawl.utils import ( ftp_list_files, ftp_connection_info, get_first, @@ -30,6 +30,7 @@ get_licenses, get_node, parse_domain, + ParsedItem, ) @@ -318,7 +319,12 @@ def build_item_rich(self, response): ) record.add_value("urls", response.meta.get("urls")) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item def build_item_jats(self, response): """Build the final HEPRecord with JATS-format XML ('jp').""" @@ -388,7 +394,12 @@ def build_item_jats(self, response): references = self._get_references(node) record.add_value("references", references) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item def _get_references(self, node): """Get the references.""" diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index c9aacc00..78fdd5fd 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -25,12 +25,13 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ( +from hepcrawl.utils import ( get_first, get_licenses, has_numbers, range_as_string, unzip_xml_files, + ParsedItem, ) from ..dateutils import format_year @@ -1034,4 +1035,9 @@ def build_item(self, response): record.add_value('collections', self.get_collections(doctype)) record.add_value('references', self.get_references(node)) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 941a3674..46fae495 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -16,7 +16,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_licenses +from hepcrawl.utils import ( + get_licenses, + ParsedItem, +) class HindawiSpider(XMLFeedSpider): @@ -222,4 +225,9 @@ def parse_node(self, response, node): record.add_xpath('source', "./datafield[@tag='260']/subfield[@code='b']/text()") - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e970c1c..579ac65b 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -21,8 +21,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_temporary_file - +from hepcrawl.utils import ( + get_temporary_file, + ParsedItem, +) from ..dateutils import format_date @@ -240,4 +242,9 @@ def build_item(self, response): record.add_value('source', 'INFN') record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index 0e3bae65..90c7809f 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -23,6 +23,7 @@ from ..items import HEPRecord from ..loaders import HEPLoader +from hepcrawl.utils import ParsedItem class IOPSpider(XMLFeedSpider, NLM): @@ -222,4 +223,9 @@ def parse_node(self, response, node): record.add_value("additional_files", self.add_fft_file(pdf_file_path, file_access, file_type)) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 77bf7948..1c83c829 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -18,7 +18,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname +from hepcrawl.utils import ( + split_fullname, + ParsedItem, +) class MagicSpider(XMLFeedSpider): @@ -176,4 +179,9 @@ def build_item(self, response): record.add_value("additional_files", response.meta.get("files")) record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index c71234f9..4e099348 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -23,7 +23,11 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_temporary_file, split_fullname +from hepcrawl.utils import ( + get_temporary_file, + split_fullname, + ParsedItem, +) class MITSpider(XMLFeedSpider): @@ -223,4 +227,9 @@ def build_item(self, response): record.add_value('page_nr', self.get_page_nr(node)) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index 7200664e..95bc874a 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -18,6 +18,7 @@ from ..items import HEPRecord from ..loaders import HEPLoader +from hepcrawl.utils import ParsedItem class PhenixSpider(XMLFeedSpider): @@ -128,4 +129,9 @@ def parse_node(self, response, node): record.add_value('source', 'PHENIX') record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 101b1163..8a486292 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -19,7 +19,11 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import parse_domain, get_mime_type +from hepcrawl.utils import ( + parse_domain, + get_mime_type, + ParsedItem, +) class PhilSpider(CrawlSpider): @@ -160,4 +164,9 @@ def build_item(self, response): if not jsonrecord.get('year') == "forthcoming": record.add_value('journal_year', int(jsonrecord['year'])) - return record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 7d3fb87d..a4d68f7d 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -13,10 +13,16 @@ import re +from urlparse import urljoin + from scrapy import Request, Selector from scrapy.spiders import Spider -from urlparse import urljoin -from ..utils import get_licenses, get_first + +from hepcrawl.utils import ( + get_licenses, + get_first, + ParsedItem, +) from ..dateutils import create_valid_date from ..items import HEPRecord from ..loaders import HEPLoader @@ -128,7 +134,13 @@ def build_item(self, response): record.add_value('extra_data', extra_data) record.add_value('collections', ['HEP', 'ConferencePaper']) - return record.load_item() + + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + return parsed_item def _get_ext_systems_number(self, node): return [ diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index 661f0bec..97ae8202 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -18,7 +18,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname +from hepcrawl.utils import ( + split_fullname, + ParsedItem, +) class T2kSpider(XMLFeedSpider): @@ -164,4 +167,9 @@ def build_item(self, response): record.add_value("additional_files", response.meta.get("additional_files")) record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + item=record.load_item(), + item_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3f68131f..22d418a9 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -20,12 +20,13 @@ from ..extractors.jats import Jats from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ( +from hepcrawl.utils import ( ftp_list_files, ftp_connection_info, local_list_files, get_licenses, unzip_xml_files, + ParsedItem, ) @@ -97,7 +98,7 @@ def start_requests(self): new_files_paths = ftp_list_files( self.ftp_folder, - self.target_folder, + target_folder=self.target_folder, server=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] @@ -148,7 +149,7 @@ def parse_node(self, response, node): self.log("Got article_type {0}".format(article_type)) if article_type is None or article_type[0] not in self.allowed_article_types: # Filter out non-interesting article types - return None + return record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', @@ -203,9 +204,13 @@ def parse_node(self, response, node): record.add_value('license', license) record.add_value('collections', self._get_collections(node, article_type, journal_title)) - parsed_record = dict(record.load_item()) - return parsed_record + parsed_item = ParsedItem( + item=dict(record.load_item()), + item_format='hepcrawl', + ) + + return parsed_item def _get_collections(self, node, article_type, current_journal_title): """Return this articles' collection.""" diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 6c720550..7201b68e 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -19,13 +19,14 @@ class CeleryMonitor(object): - def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100): + def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2): self.results = [] self.recv = None self.app = app self.connection = None self.monitor_timeout = monitor_timeout self.monitor_iter_limit = monitor_iter_limit + self.events_limit = events_limit def __enter__(self): state = self.app.events.State() @@ -61,10 +62,16 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.connection.__exit__() def _wait_for_results(self, events_iter): - any(islice( + generator_events = islice( events_iter, # iterable self.monitor_iter_limit # stop - )) + ) + counter = 0 + for dummy in generator_events: + if dummy: + counter += 1 + if counter == self.events_limit: + break @classmethod def do_crawl( @@ -72,6 +79,7 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, + events_limit, crawler_instance, project='hepcrawl', spider='WSP', @@ -80,7 +88,12 @@ def do_crawl( ): settings = settings or {} - with cls(app, monitor_timeout=monitor_timeout, monitor_iter_limit=monitor_iter_limit) as my_monitor: + with cls( + app, + monitor_timeout=monitor_timeout, + monitor_iter_limit=monitor_iter_limit, + events_limit=events_limit + ) as my_monitor: crawler_instance.schedule( project=project, spider=spider, diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 513b0395..c9d53339 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -11,6 +11,7 @@ import os import json +import shutil from scrapy.http import Request, TextResponse from scrapy.selector import Selector @@ -131,3 +132,18 @@ def expected_json_results_from_file(*path_chunks, **kwargs): expected_data = json.load(fd) return expected_data + + +def clean_dir(path='/tmp/WSP/'): + """ + Deletes all contained files of given target directory path. + + Args: + path: Absolute path of target directory to be cleaned. + + Example: + + >>> clean_dir('/dir_1/dir_11/') + + """ + shutil.rmtree(path, ignore_errors=True) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 4ad9db3c..71ff3aa6 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -57,17 +57,34 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): +def ftp_list_files( + server_folder, + server, + user, + password, + target_folder=None, + passive_mode=False, + lst_missing_files=True, +): """List files from given FTP's server folder to target folder.""" session_factory = ftputil.session.session_factory( base_class=ftplib.FTP, port=21, use_passive_mode=passive_mode, - encrypt_data_channel=True) + encrypt_data_channel=True, + ) with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: file_names = host.listdir(os.path.join(host.curdir, '/', server_folder)) - return list_missing_files(server_folder, target_folder, file_names) + if lst_missing_files: + return list_missing_files(server_folder, target_folder, file_names) + else: + return [ + os.path.join( + server_folder, + file_name + ) for file_name in file_names + ] def local_list_files(local_folder, target_folder): @@ -321,3 +338,39 @@ def get_license_by_text(license_text): license = get_license_by_url(license_url=LICENSE_TEXTS[key]) return license + + +def get_file_name_from_url(url): + return url.rsplit('/', 1)[-1] + + +def get_absolute_file_path(file_path): + """Returns the absolute path of a relative path.""" + return os.path.abspath(file_path) + + +class ParsedItem(dict): + """Generate interface to communicate Spider-Pipelines""" + def __init__( + self, + item, + file_urls=None, + item_format=None, + ftp_params=None, + file_paths=None, + **kwargs + ): + super(ParsedItem, self).__init__( + item=item, + file_urls=file_urls, + item_format=item_format, + ftp_params=ftp_params, + file_paths=file_paths, + **kwargs + ) + self.item = item + self.file_urls = file_urls + self.format = item_format + self.ftp_params = ftp_params + self.file_paths = file_paths + self.__dict__ = self diff --git a/setup.py b/setup.py index b19e5f14..a98aeb88 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ install_requires = [ 'autosemver~=0.2', 'inspire-schemas~=42.0', + 'inspire-dojson~=41.0', 'Scrapy>=1.1.0', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index a9677b89..0f58b17d 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -72,6 +72,7 @@ def test_arxiv(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='arXiv', diff --git a/tests/functional/desy/fixtures/desy_ftp_records.json b/tests/functional/desy/fixtures/desy_ftp_records.json new file mode 100644 index 00000000..f685a254 --- /dev/null +++ b/tests/functional/desy/fixtures/desy_ftp_records.json @@ -0,0 +1,308 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 333333, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/333333" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 444444, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/444444" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] \ No newline at end of file diff --git a/tests/functional/desy/fixtures/desy_local_records.json b/tests/functional/desy/fixtures/desy_local_records.json new file mode 100644 index 00000000..6fe2c4d0 --- /dev/null +++ b/tests/functional/desy/fixtures/desy_local_records.json @@ -0,0 +1,308 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 333333, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/333333" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 444444, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/444444" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/.netrc b/tests/functional/desy/fixtures/ftp_server/.netrc new file mode 100644 index 00000000..59a152f7 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/.netrc @@ -0,0 +1,3 @@ +machine ftp_server +login bob +password bob diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt new file mode 100644 index 00000000..bb8e8348 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt @@ -0,0 +1 @@ +sample file fft 1. \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt new file mode 100644 index 00000000..e1b54448 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt @@ -0,0 +1 @@ +sample file fft 2. \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml new file mode 100644 index 00000000..6900d746 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml @@ -0,0 +1,149 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml new file mode 100644 index 00000000..2067b5e7 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml @@ -0,0 +1,149 @@ + + + + 333333 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 444444 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt b/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt new file mode 100644 index 00000000..5254be38 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt @@ -0,0 +1 @@ +This is a file not to download the Desy spider! \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd new file mode 100644 index 00000000..275a727c --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd @@ -0,0 +1 @@ +bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py new file mode 100644 index 00000000..321766f9 --- /dev/null +++ b/tests/functional/desy/test_desy.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for Desy spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep +import hashlib + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, + clean_dir, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +def compare_two_files_using_md5(file_1, file_2): + """Compares two files calculating the md5 hash.""" + def _generate_md5_hash(file_path): + hasher = hashlib.md5() + with open(str(file_path), 'rb') as fd: + buf = fd.read() + hasher.update(buf) + return hasher.hexdigest() + + return _generate_md5_hash(file_1) == _generate_md5_hash(file_2) + + +@pytest.fixture(scope="function") +def get_fft_1_path(): + return get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + 'test_fft_1.txt', + test_suite='functional', + ) + + +@pytest.fixture(scope="function") +def get_fft_2_path(): + return get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + 'test_fft_2.txt', + test_suite='functional', + ) + + +@pytest.fixture(scope="function") +def set_up_ftp_environment(): + netrc_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + '.netrc', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). + sleep(10) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'ftp_host': 'ftp_server', + 'ftp_netrc': netrc_location, + } + } + + clean_dir('/tmp/file_urls') + clean_dir('/tmp/DESY') + + +@pytest.fixture(scope="function") +def set_up_local_environment(): + package_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + test_suite='functional', + ) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'package_path': package_location, + } + } + + clean_dir('/tmp/file_urls') + clean_dir('/tmp/DESY') + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_ftp_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_ftp( + set_up_ftp_environment, + expected_results, + get_fft_1_path, + get_fft_2_path, +): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + # Check using MD5 Hash if downloaded files are there. + for record in expected_results: + fft_file_paths = sorted(record['_fft']) + + assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) + assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_local_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_local_package_path( + set_up_local_environment, + expected_results, + get_fft_1_path, + get_fft_2_path, +): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + # Check using MD5 Hash if downloaded files are there. + for record in expected_results: + fft_file_paths = sorted(record['_fft']) + + assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) + assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) +# diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 70996466..a0411b8e 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -13,7 +13,6 @@ import pytest import os -import shutil from time import sleep @@ -21,6 +20,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) from hepcrawl.testlib.tasks import app as celery_app from hepcrawl.testlib.utils import get_crawler_instance @@ -90,10 +90,6 @@ def remove_generated_files(package_location): os.unlink(os.path.join(package_location, file_name)) -def clean_dir(path='/tmp/WSP/'): - shutil.rmtree(path, ignore_errors=True) - - @pytest.mark.parametrize( 'expected_results', [ @@ -114,6 +110,7 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', @@ -147,6 +144,7 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='WSP', diff --git a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml b/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml index 4e80ba6b..1ade2b4b 100644 --- a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml +++ b/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml @@ -3,7 +3,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/crawler2hep/in_no_document_type.yaml b/tests/unit/responses/crawler2hep/in_no_document_type.yaml index 22b93fd0..21543c36 100644 --- a/tests/unit/responses/crawler2hep/in_no_document_type.yaml +++ b/tests/unit/responses/crawler2hep/in_no_document_type.yaml @@ -5,7 +5,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/desy/desy_collection_records.xml b/tests/unit/responses/desy/desy_collection_records.xml new file mode 100644 index 00000000..d2086694 --- /dev/null +++ b/tests/unit/responses/desy/desy_collection_records.xml @@ -0,0 +1,149 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt;1 + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt;1 + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt;1 + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt;1 + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + \ No newline at end of file diff --git a/tests/unit/responses/desy/desy_record.xml b/tests/unit/responses/desy/desy_record.xml new file mode 100644 index 00000000..437c45b3 --- /dev/null +++ b/tests/unit/responses/desy/desy_record.xml @@ -0,0 +1,76 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + DESY/FFT/test_fft_1.txt;1 + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + DESY/FFT/test_fft_2.txt;1 + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + \ No newline at end of file diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index eef140b1..ad8f3f03 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -20,13 +20,15 @@ def results(): """Return results generator from the Alpha spider.""" spider = alpha_spider.AlphaSpider() - records = list( + parsed_items = list( spider.parse( fake_response_from_file('alpha/test_1.htm') ) ) + records = [parsed_item.item for parsed_item in parsed_items] assert records + return records diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index eb53269d..8bc66033 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -21,7 +21,7 @@ def results(): from scrapy.http import TextResponse spider = aps_spider.APSSpider() - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'aps/aps_single_response.json', @@ -30,6 +30,8 @@ def results(): ) ) + records = [parsed_item.item for parsed_item in parsed_items] + assert records return records diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index bd75e5a4..47ea20db 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -11,7 +11,8 @@ import pytest -from scrapy.crawler import Crawler +from scrapy.crawler import Crawler +from scrapy.http import TextResponse from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider @@ -25,36 +26,16 @@ def spider(): return spider -@pytest.fixture -def one_result(spider): - """Return results generator from the arxiv spider. Tricky fields, one - record. - """ - from scrapy.http import TextResponse - - records = list( - spider.parse( - fake_response_from_file( - 'arxiv/sample_arxiv_record0.xml', - response_type=TextResponse, - ) - ) - ) - - assert records - pipeline = InspireCeleryPushPipeline() - pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] - - @pytest.fixture def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ - from scrapy.http import TextResponse + def _get_processed_item(record, spider): + item = pipeline.process_item(record, spider) + return item - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record.xml', @@ -63,10 +44,10 @@ def many_results(spider): ) ) - assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] + + return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] def test_page_nr(many_results): diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index a6ed66d6..d8e6f9e5 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -24,10 +24,14 @@ def results(): """Return results generator from the arxiv spider. All fields, one record. """ + def _get_processed_item(record, spider): + item = pipeline.process_item(record, spider) + validate(item, 'hep') + return item crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record0.xml', @@ -36,16 +40,10 @@ def results(): ) ) - assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - processed_records = [] - for record in records: - processed_record = pipeline.process_item(record, spider) - validate(processed_record, 'hep') - processed_records.append(processed_record) - return processed_records + return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py index cc6ef093..48551cdf 100644 --- a/tests/unit/test_base.py +++ b/tests/unit/test_base.py @@ -38,9 +38,11 @@ def record(): nodes = selector.xpath('.//%s' % spider.itertag) response.meta["record"] = nodes[0].extract() response.meta["urls"] = ["http://hdl.handle.net/1885/10005"] - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -169,7 +171,10 @@ def splash(): 'Content-Type': 'text/html', }, ) - return spider.scrape_for_pdf(splash_response) + + parsed_item = spider.scrape_for_pdf(splash_response) + + return parsed_item.item def test_splash(splash): @@ -201,7 +206,10 @@ def parsed_node(): response = fake_response_from_string(text=body) node = get_node(spider, 'OAI-PMH:record', text=body) response.meta["record"] = node[0].extract() - return spider.parse_node(response, node[0]) + + parsed_item = spider.parse_node(response, node[0]) + + return parsed_item.item def test_parsed_node(parsed_node): diff --git a/tests/unit/test_brown.py b/tests/unit/test_brown.py index 0b42b4df..b78be316 100644 --- a/tests/unit/test_brown.py +++ b/tests/unit/test_brown.py @@ -41,10 +41,11 @@ def record(): splash_response = fake_response_from_file('brown/test_splash.html') splash_response.meta["jsonrecord"] = jsonrecord - parsed_record = spider.scrape_splash(splash_response) - assert parsed_record - return parsed_record + parsed_item = spider.scrape_splash(splash_response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -200,7 +201,10 @@ def parsed_node_no_splash(): jsonrecord = jsonresponse["items"]["docs"][0] response.meta["jsonrecord"] = jsonrecord - return spider.parse(response).next() + parsed_item = spider.parse(response).next() + assert parsed_item + + return parsed_item.item def test_no_splash(parsed_node_no_splash): diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py new file mode 100644 index 00000000..5b01f7fd --- /dev/null +++ b/tests/unit/test_desy.py @@ -0,0 +1,381 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from __future__ import absolute_import, division, print_function + +import pytest +import os + +from scrapy.crawler import Crawler +from scrapy.http import TextResponse + +from hepcrawl.pipelines import InspireCeleryPushPipeline +from hepcrawl.spiders import desy_spider + +from hepcrawl.testlib.fixtures import fake_response_from_file + + +def create_spider(): + crawler = Crawler(spidercls=desy_spider.DesySpider) + return desy_spider.DesySpider.from_crawler(crawler) + + +def get_records(response_file_name): + """Return all results generator from the ``Desy`` spider via pipelines.""" + # environmental variables needed for the pipelines payload + os.environ['SCRAPY_JOB'] = 'scrapy_job' + os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri' + os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file' + + spider = create_spider() + records = spider.parse( + fake_response_from_file( + file_name=response_file_name, + response_type=TextResponse + ) + ) + + pipeline = InspireCeleryPushPipeline() + pipeline.open_spider(spider) + + return ( + pipeline.process_item( + record, + spider + ) for record in records + ) + + +def get_one_record(response_file_name): + results = get_records(response_file_name) + record = results.next() + return record + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-05-04T17:49:07.975168' + record['acquisition_source']['submission_number'] = '5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.mark.parametrize( + 'generated_record', + [ + get_one_record('desy/desy_record.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_record(generated_record): + expected = { + '$schema': 'hep.json', + '_collections': [ + 'Literature' + ], + '_fft': [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_1.txt;1', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_2.txt;1', + 'type': 'Main', + 'version': 1 + } + ], + 'abstracts': [ + { + 'source': 'Deutsches Elektronen-Synchrotron', + 'value': 'Dielectric laser acceleration of electrons has recently been\n' + ' demonstrated with significantly higher accelerating ' + 'gradients than other\n structure-based linear ' + 'accelerators. Towards the development of an integrated 1 MeV\n ' + ' electron accelerator based on dielectric laser accelerator ' + 'technologies,\n development in several relevant ' + 'technologies is needed. In this work, recent\n ' + 'developments on electron sources, bunching, accelerating, focussing, ' + 'deflecting and\n laser coupling structures are reported. ' + 'With an eye to the near future, components\n ' + 'required for a 1 MeV kinetic energy tabletop accelerator producing ' + 'sub-femtosecond\n electron bunches are outlined.\n ' + ' ' + } + ], + 'acquisition_source': { + 'datetime': '2017-05-04T17:49:07.975168', + 'method': 'hepcrawl', + 'source': 'desy', + 'submission_number': '5652c7f6190f11e79e8000224dabeaad' + }, + 'control_number': 111111, + 'document_type': [ + 'article' + ], + 'dois': [ + { + 'value': '10.18429/JACoW-IPAC2017-WEYB1' + } + ], + 'number_of_pages': 6, + 'public_notes': [ + { + 'value': '*Brief entry*' + } + ], + 'publication_info': [ + { + 'parent_isbn': '9783954501823' + }, + { + 'page_end': '2525', + 'page_start': '2520', + 'year': 2017 + } + ], + 'self': { + '$ref': 'http://inspirehep.net/api/literature/111111' + }, + 'titles': [ + { + 'source': 'JACoW', + 'title': 'Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n ' + ' Acceleration (DLA) From the Source to Relativistic ' + 'Electrons\n ' + } + ], + 'urls': [ + { + 'description': 'Fulltext', + 'value': 'http://inspirehep.net/record/1608652/files/Towards a fully\n ' + ' integrated acc on a chip.pdf\n ' + } + ] + } + + assert override_generated_fields(generated_record) == expected + + +@pytest.mark.parametrize( + 'generated_records', + [ + get_records('desy/desy_collection_records.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_collection_records(generated_records): + expected = [{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_1.txt;1', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_2.txt;1', + 'type': 'Main', + 'version': 1 + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + }, + { + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_1.txt;1', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'DESY/FFT/test_fft_2.txt;1', + 'type': 'Main', + 'version': 1 + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + } + ] + + generated_results = [override_generated_fields(rec) for rec in generated_records] + + assert generated_results == expected diff --git a/tests/unit/test_dnb.py b/tests/unit/test_dnb.py index b00aff3d..a1a22dbd 100644 --- a/tests/unit/test_dnb.py +++ b/tests/unit/test_dnb.py @@ -72,7 +72,11 @@ def record(scrape_pos_page_body): body=scrape_pos_page_body, **{'encoding': 'utf-8'} ) - return request.callback(response) + + parsed_item = request.callback(response) + assert parsed_item + + return parsed_item.item def test_title(record): @@ -241,7 +245,9 @@ def parse_without_splash(): 'Content-Type': 'application/pdf;charset=base64', } ) - return spider.parse_node(response, nodes[0]) + + parsed_item = spider.parse_node(response, nodes[0]) + return parsed_item.item def test_parse_without_splash(parse_without_splash): diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py index cc7885bd..9d88d5ad 100644 --- a/tests/unit/test_edp.py +++ b/tests/unit/test_edp.py @@ -40,6 +40,7 @@ def scrape_pos_page_body(): ) ) + @pytest.fixture def targzfile(): """Path to test tar.gz file with JATS XML file.""" @@ -50,6 +51,7 @@ def targzfile(): 'test_gz.tar.gz' ) + @pytest.fixture def package_jats(targzfile): """Extract tar.gz package with JATS XML file.""" @@ -75,7 +77,11 @@ def record_jats(package_jats, scrape_pos_page_body): body=scrape_pos_page_body, **{'encoding': 'utf-8'} ) - return request.callback(response) + + parsed_item = request.callback(response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -107,7 +113,10 @@ def record_rich(package_rich): fake_resp.meta["rich"] = True node = get_node(spider, "//EDPSArticle", fake_resp)[0] - return spider.parse_node(fake_resp, node) + parsed_item = spider.parse_node(fake_resp, node) + assert parsed_item + + return parsed_item.item def test_title(record_jats): @@ -145,6 +154,7 @@ def test_abstract(record_jats): assert 'abstract' in record_jats assert record_jats['abstract'] == abstract + def test_date_published(record_jats): """Test extracting date_published.""" date_published = "2015-01-01" @@ -179,6 +189,7 @@ def test_doi(record_jats): assert 'dois' in record_jats assert record_jats['dois'][0]['value'] == doi + def test_publication_info(record_jats): """Test extracting publication info.""" assert 'journal_title' in record_jats @@ -206,7 +217,6 @@ def test_keywords(record_jats): assert keyw["value"] in keywords - def test_authors(record_jats): """Test authors.""" authors = ["Arasoglu, Ali", "Ozdemir, Omer Faruk"] @@ -326,7 +336,6 @@ def test_authors_rich(record_rich): assert astr[index]["affiliations"][0]["value"] == affiliations[index] - def test_tarfile(tarbzfile, tmpdir): """Test untarring a tar.bz package with a test XML file. @@ -343,7 +352,6 @@ def test_tarfile(tarbzfile, tmpdir): assert "aas/xml_rich/2000/01" not in xml_files_flat[0] - def test_handle_package_ftp(tarbzfile): """Test getting the target folder name for xml files.""" spider = edp_spider.EDPSpider() @@ -353,6 +361,7 @@ def test_handle_package_ftp(tarbzfile): assert isinstance(request, Request) assert request.meta["package_path"] == tarbzfile + def test_no_dois_jats(): """Test parsing when no DOI in record. JATS format.""" spider = edp_spider.EDPSpider() @@ -370,7 +379,9 @@ def test_no_dois_jats(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item assert "dois" not in record assert "additional_files" not in record @@ -390,7 +401,9 @@ def test_no_dois_rich(): response = fake_response_from_string(body) response.meta["rich"] = True node = get_node(spider, "//EDPSArticle", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item assert "dois" not in record assert "additional_files" not in record @@ -416,7 +429,9 @@ def test_addendum_jats(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item assert "related_article_doi" in record assert record["related_article_doi"][0][ @@ -439,7 +454,9 @@ def test_author_with_email(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item assert 'email' in record['authors'][0] assert record['authors'][0]['email'] == "Fname.Sname@university.org" @@ -472,7 +489,9 @@ def test_aff_with_email(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA" assert 'affiliations' in record['authors'][0] @@ -481,8 +500,6 @@ def test_aff_with_email(): assert record['authors'][0]['email'] is None - - def test_no_valid_article(): """Test parsing when filtering out non-interesting article types.""" spider = edp_spider.EDPSpider() @@ -506,7 +523,9 @@ def test_collections_review(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + record = parsed_item.item assert "collections" in record assert record["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}] @@ -533,7 +552,11 @@ def record_references_only(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - return spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + + return parsed_item.item def test_references(record_references_only): diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py index ca023122..109f3d3f 100644 --- a/tests/unit/test_elsevier.py +++ b/tests/unit/test_elsevier.py @@ -41,9 +41,11 @@ def record(): response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml' tag = '//%s' % spider.itertag nodes = get_node(spider, tag, response) - parsed_record = spider.parse_node(response, nodes) - assert parsed_record - return parsed_record + + parsed_item = spider.parse_node(response, nodes) + assert parsed_item + + return parsed_item.item @pytest.fixture(scope="module") @@ -97,7 +99,11 @@ def parsed_node(): response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml' parse_response = spider.parse_node(response, node) parse_response.status = 404 - return spider.scrape_sciencedirect(parse_response) + + parsed_item = spider.scrape_sciencedirect(parse_response) + assert parsed_item + + return parsed_item.item def test_collection(parsed_node): @@ -164,7 +170,10 @@ def cover_display_date(): node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parse_item = spider.parse_node(response, node) + assert parse_item + + return parse_item.item def test_cover_display_date(cover_display_date): @@ -187,7 +196,10 @@ def cover_display_date_y_m(): """ node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parse_item = spider.parse_node(response, node) + assert parse_item + + return parse_item.item def test_cover_display_date_y_m(cover_display_date_y_m): @@ -210,7 +222,10 @@ def cover_display_date_y(): """ node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parse_item = spider.parse_node(response, node) + assert parse_item + + return parse_item.item def test_cover_display_date_y(cover_display_date_y): @@ -1644,7 +1659,11 @@ def sciencedirect(): ]) response.meta["info"] = {} response.meta["node"] = get_node(spider, '/head', text=body) - return spider.scrape_sciencedirect(response) + + parse_item = spider.scrape_sciencedirect(response) + assert parse_item + + return parse_item.item def test_sciencedirect(sciencedirect): diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py index 37e5e183..bd1da795 100644 --- a/tests/unit/test_hindawi.py +++ b/tests/unit/test_hindawi.py @@ -26,9 +26,10 @@ def record(): response = fake_response_from_file("hindawi/test_1.xml") nodes = get_node(spider, "//marc:record", response) - parsed_record = spider.parse_node(response, nodes[0]) - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, nodes[0]) + assert parsed_item + + return parsed_item.item def test_title(record): diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py index 0c60799a..5cd1e27d 100644 --- a/tests/unit/test_infn.py +++ b/tests/unit/test_infn.py @@ -28,9 +28,11 @@ def record(): """Return scraping results from the INFN spider.""" spider = infn_spider.InfnSpider() response = fake_response_from_file('infn/test_splash.html') - parsed_record = spider.scrape_splash(response) - assert parsed_record - return parsed_record + + parsed_item = spider.scrape_splash(response) + assert parsed_item + + return parsed_item.item def test_title(record): @@ -121,6 +123,7 @@ def test_non_thesis(): assert record is None + def test_parse_node(): """Test parse_node function. This should be a scrapy Request object. @@ -148,6 +151,6 @@ def test_parse_node_nolink(): response = fake_response_from_file('infn/test_1_nolink.html') selector = Selector(response, type='html') node = selector.xpath('//%s' % spider.itertag)[0] - record = spider.parse_node(response, node).next() + parsed_item = spider.parse_node(response, node).next() - assert isinstance(record, hepcrawl.items.HEPRecord) + assert isinstance(parsed_item.item, hepcrawl.items.HEPRecord) diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index b776adfa..fb8d26d2 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -38,9 +38,11 @@ def record(): response = fake_response_from_file('iop/xml/test_standard.xml') node = get_node(spider, "Article", response) spider.pdf_files = TEST_PDF_DIR - parsed_record = spider.parse_node(response, node) - assert parsed_record - return parsed_record + + parsed_item = spider.parse_node(response, node) + assert parsed_item + + return parsed_item.item def test_abstract(record): @@ -182,10 +184,11 @@ def erratum_open_access_record(): 'iop', 'pdf', ) - parsed_record = spider.parse_node(response, node) - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, node) + assert parsed_item + + return parsed_item.item def test_files_erratum_open_access_record(erratum_open_access_record): diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py index eeb574fe..74d9fad4 100644 --- a/tests/unit/test_magic.py +++ b/tests/unit/test_magic.py @@ -23,6 +23,7 @@ get_node, ) + @pytest.fixture def record(): """Return results from the MAGIC spider. First parse node, then scrape, @@ -39,9 +40,10 @@ def record(): splash_response.meta["date"] = parsed_node.meta["date"] splash_response.meta["urls"] = parsed_node.meta["urls"] - parsed_record = spider.scrape_for_pdf(splash_response).next() - assert parsed_record - return parsed_record + parsed_item = spider.scrape_for_pdf(splash_response).next() + assert parsed_item + + return parsed_item.item def test_abstract(record): @@ -102,7 +104,6 @@ def test_abstract(record): assert record["abstract"] == abstract - def test_title(record): """Test extracting title.""" title = "Limits to the violation of Lorentz invariance using the emission of the CRAB pulsar at TeV energies, discovered with archival data from the MAGIC telescopes" @@ -139,6 +140,7 @@ def test_url(record): assert 'urls' in record assert record['urls'][0]['value'] == url + def test_pdf_link(record): """Test pdf link(s)""" files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf" @@ -164,8 +166,9 @@ def test_no_author_no_date_no_url(): """ response = fake_response_from_string(body) node = get_node(spider, spider.itertag, text=body) - record = spider.parse_node(response, node).next() + parsed_item = spider.parse_node(response, node).next() + record = parsed_item.item assert isinstance(record, hepcrawl.items.HEPRecord) assert "date" not in record assert "authors" not in record @@ -184,8 +187,9 @@ def test_no_aff(): """ response = fake_response_from_string(body) - record = spider.scrape_for_pdf(response).next() + parsed_item = spider.scrape_for_pdf(response).next() + record = parsed_item.item assert isinstance(record, hepcrawl.items.HEPRecord) assert "date" not in record assert "affiliations" not in record["authors"] @@ -216,8 +220,9 @@ def test_no_spash_page(): response.status = 404 response.meta["title"] = parsed_node.meta["title"] response.meta["urls"] = parsed_node.meta["urls"] - record = spider.scrape_for_pdf(response).next() + parsed_item = spider.scrape_for_pdf(response).next() + record = parsed_item.item assert isinstance(record, hepcrawl.items.HEPRecord) assert "urls" in record assert "title" in record diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py index 0253d91f..2629dd34 100644 --- a/tests/unit/test_mit.py +++ b/tests/unit/test_mit.py @@ -25,9 +25,11 @@ def record(): """Return scraping results from the MIT spider.""" spider = mit_spider.MITSpider() response = fake_response_from_file('mit/test_splash.html') - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -37,7 +39,11 @@ def parsed_node(): response = fake_response_from_file('mit/test_list.html') tag = spider.itertag node = get_node(spider, tag, response, rtype="html") - return spider.parse_node(response, node).next() + + parsed_item = spider.parse_node(response, node).next() + assert parsed_item + + return parsed_item def test_url(parsed_node): @@ -159,7 +165,11 @@ def supervisors(): """ response = fake_response_from_string(body) - return spider.build_item(response) + + parsed_item = spider.build_item(response) + assert parsed_item + + return parsed_item.item def test_two_supervisors(supervisors): diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py index 75384350..5a6a5f4c 100644 --- a/tests/unit/test_phenix.py +++ b/tests/unit/test_phenix.py @@ -29,9 +29,12 @@ def record(): response = fake_response_from_file('phenix/test_1.html') selector = Selector(response, type='html') nodes = selector.xpath('//%s' % spider.itertag) - parsed_record = spider.parse_node(response, nodes[0]) - assert parsed_record - return parsed_record + + parsed_item = spider.parse_node(response, nodes[0]) + assert parsed_item + + return parsed_item.item + @pytest.fixture def non_thesis(): @@ -49,10 +52,12 @@ def non_thesis(): node = get_node(spider, '//li', text=body) return spider.parse_node(response, node) + def test_non_thesis(non_thesis): """Test MSc thesis skipping.""" assert non_thesis is None + def test_title(record): """Test extracting title.""" title = "MEASUREMENT OF THE DOUBLE HELICITY ASYMMETRY IN INCLUSIVE $\pi^{0}$ PRODUCTION IN POLARIZED PROTON-PROTON COLLISIONS AT $\sqrt{s}$ = 510 GeV" @@ -82,6 +87,7 @@ def test_authors(record): aff['value'] for aff in record['authors'][index]['affiliations'] ] + def test_pdf_link(record): """Test pdf link(s)""" files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf" diff --git a/tests/unit/test_phil.py b/tests/unit/test_phil.py index e99064b2..be0da905 100644 --- a/tests/unit/test_phil.py +++ b/tests/unit/test_phil.py @@ -33,9 +33,11 @@ def record(): "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fanalysis.oxfordjournals.org%2Fcontent%2F66%2F3%2F194.full.pdf%2Bhtml%3Fframe%3Dsidebar", "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fbrogaardb.googlepages.com%2Ftensedrelationsoffprint.pdf" ] - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -48,7 +50,11 @@ def journal(): response = fake_response_from_file('phil/test_journal.json') jsonrecord = json.loads(response.body_as_unicode()) response.meta["jsonrecord"] = jsonrecord[0] - return spider.build_item(response) + + parsed_item = spider.build_item(response) + assert parsed_item + + return parsed_item.item @pytest.fixture @@ -223,7 +229,10 @@ def splash(): ] } - return spider.scrape_for_pdf(response) + parsed_item = spider.scrape_for_pdf(response) + assert parsed_item + + return parsed_item.item def test_scrape(splash): diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 20c872f4..283cb4f7 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -51,8 +51,10 @@ def record(scrape_pos_page_body): assert response pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - record = request.callback(response) - return pipeline.process_item(record, spider) + parsed_item = request.callback(response) + parsed_item = pipeline.process_item(parsed_item, spider) + + return parsed_item def test_titles(record): diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py index 283a02e5..caaaaefc 100644 --- a/tests/unit/test_t2k.py +++ b/tests/unit/test_t2k.py @@ -36,9 +36,10 @@ def record(): splash_response.meta["urls"] = parsed_node.meta["urls"] splash_response.meta["authors"] = parsed_node.meta["authors"] - parsed_record = spider.scrape_for_pdf(splash_response).next() - assert parsed_record - return parsed_record + parsed_item = spider.scrape_for_pdf(splash_response).next() + assert parsed_item + + return parsed_item.item def test_abstact(record): @@ -125,9 +126,10 @@ def non_url(): selector = Selector(response, type='html') nodes = selector.xpath('//%s' % spider.itertag) - parsed_record = spider.parse_node(response, nodes[0]).next() - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, nodes[0]).next() + assert parsed_item + + return parsed_item.item def test_non_url(non_url): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 36438ab4..9476a0a1 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -49,7 +49,8 @@ def get_records(response_file_name): def get_one_record(response_file_name): results = get_records(response_file_name) - return results.next() + record = results.next() + return record def override_generated_fields(record):