diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index afbf94b1..add53f70 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -15,34 +15,138 @@
 from __future__ import absolute_import, division, print_function
 
+import os
+import datetime
+
 from inspire_schemas.api import LiteratureBuilder
 
 from hepcrawl.utils import get_file_name_from_url
 
 
-def _update_record_fft_links(record, map_fft_file_paths):
-    def _list_new_fft_links(old_fft_links, map_fft_file_paths):
-        new_fft_links = []
-        for fft_link in old_fft_links:
-            file_name = get_file_name_from_url(fft_link['path'])
-            if file_name in map_fft_file_paths:
-                new_fft_links.append(
-                    {
-                        'path': map_fft_file_paths[file_name],
-                    }
-                )
+def _update_record_fft(record, index_fft_file_paths):
+    def _update_fft_fields(fft_fields, index_fft_file_paths):
+        new_fft_fields = []
+        for fft_field in fft_fields:
+            file_name = get_file_name_from_url(fft_field['path'])
+            if file_name in index_fft_file_paths:
+                fft_field['path'] = index_fft_file_paths[file_name]
+                new_fft_fields.append(fft_field)
 
-        return new_fft_links
+        return new_fft_fields
 
-    old_fft_links = record['_fft']
-    record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
+    record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
 
     return record
 
 
-def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
+def _has_publication_info(item):
+    """Check whether the item carries any publication info."""
+    return item.get('pubinfo_freetext') or item.get('journal_volume') or \
+        item.get('journal_title') or \
+        item.get('journal_year') or \
+        item.get('journal_issue') or \
+        item.get('journal_fpage') or \
+        item.get('journal_lpage') or \
+        item.get('journal_artid') or \
+        item.get('journal_doctype')
+
+
+def _filter_fields(item, keys):
+    """Filter away keys."""
+    for key in keys:
+        item.pop(key, None)
+
+
+def _normalize_hepcrawl_record(item, source):
+    if 'related_article_doi' in item:
+        item['dois'] += item.pop('related_article_doi', [])
+
+    item['titles'] = [{
+        'title': item.pop('title', ''),
+        'subtitle': item.pop('subtitle', ''),
+        'source': source,
+    }]
+
+    item['abstracts'] = [{
+        'value': item.pop('abstract', ''),
+        'source': source,
+    }]
+
+    item['imprints'] = [{
+        'date': item.pop('date_published', ''),
+    }]
+
+    item['copyright'] = [{
+        'holder': item.pop('copyright_holder', ''),
+        'year': item.pop('copyright_year', ''),
+        'statement': item.pop('copyright_statement', ''),
+        'material': item.pop('copyright_material', ''),
+    }]
+
+    if _has_publication_info(item):
+        item['publication_info'] = [{
+            'journal_title': item.pop('journal_title', ''),
+            'journal_volume': item.pop('journal_volume', ''),
+            'journal_issue': item.pop('journal_issue', ''),
+            'artid': item.pop('journal_artid', ''),
+            'page_start': item.pop('journal_fpage', ''),
+            'page_end': item.pop('journal_lpage', ''),
+            'note': item.pop('journal_doctype', ''),
+            'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
+            'pubinfo_material': item.pop('pubinfo_material', ''),
+        }]
+        if item.get('journal_year'):
+            item['publication_info'][0]['year'] = int(
+                item.pop('journal_year')
+            )
+
+    # Remove any remaining journal fields.
+    _filter_fields(item, [
+        'journal_title',
+        'journal_volume',
+        'journal_year',
+        'journal_issue',
+        'journal_fpage',
+        'journal_lpage',
+        'journal_doctype',
+        'journal_artid',
+        'pubinfo_freetext',
+        'pubinfo_material',
+    ])
+
+    return item
+
+
+def _generate_acquisition_source(crawler_record, source):
+    crawler_record['acquisition_source'] = {
+        'source': source,
+        'method': 'hepcrawl',
+        'datetime': datetime.datetime.now().isoformat(),
+        'submission_number': os.environ.get('SCRAPY_JOB', ''),
+    }
+    return crawler_record
+
+
+def to_hep(
+    item,
+    source,
+    item_format='hepcrawl',
+    fft_file_paths=None,
+):
+    item = _generate_acquisition_source(
+        crawler_record=item,
+        source=source,
+    )
+
     if item_format == 'hep':
-        return hep2hep(item, fft_file_paths)
+        return hep2hep(
+            crawler_record=item,
+            fft_file_paths=fft_file_paths,
+        )
     elif item_format == 'hepcrawl':
+        item = _normalize_hepcrawl_record(
+            item=item,
+            source=source,
+        )
         return crawler2hep(dict(item))
     else:
         raise Exception('Unknown item_format::{}'.format(item_format))
@@ -50,7 +154,7 @@ def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
 
 def hep2hep(crawler_record, fft_file_paths):
     if fft_file_paths:
-        crawler_record = _update_record_fft_links(crawler_record, fft_file_paths)
+        crawler_record = _update_record_fft(crawler_record, fft_file_paths)
 
     return crawler_record
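Editorial note: `to_hep` now requires the spider name as an explicit `source` argument and stamps the `acquisition_source` block itself before dispatching on `item_format`. A minimal sketch of the new call contract, with a hypothetical record dict that is not taken from this patch:

    from hepcrawl.crawler2hep import to_hep

    # Hypothetical 'hepcrawl'-format item, as loaded by a spider.
    item = {
        'title': 'A measurement',
        'abstract': 'An abstract.',
        'journal_title': 'Phys. Rev. D',
        'journal_year': '2017',
    }

    # 'source', 'method', 'datetime' and 'submission_number' are filled in
    # by _generate_acquisition_source(); hepcrawl-format items are then
    # normalized inside to_hep() instead of in the pipeline.
    record = to_hep(item=item, source='APS', item_format='hepcrawl')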
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 2244d255..05b61361 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -15,7 +15,6 @@
 from __future__ import absolute_import, division, print_function
 
-import datetime
 import os
 
 import requests
 
@@ -30,24 +29,6 @@
 from hepcrawl.utils import get_file_name_from_url
 
 
-def has_publication_info(item):
-    """If any publication info."""
-    return item.get('pubinfo_freetext') or item.get('journal_volume') or \
-        item.get('journal_title') or \
-        item.get('journal_year') or \
-        item.get('journal_issue') or \
-        item.get('journal_fpage') or \
-        item.get('journal_lpage') or \
-        item.get('journal_artid') or \
-        item.get('journal_doctype')
-
-
-def filter_fields(item, keys):
-    """Filter away keys."""
-    for key in keys:
-        item.pop(key, None)
-
-
 class FftFilesPipeline(FilesPipeline):
     """Download all the FFT files provided by record."""
 
@@ -57,10 +38,10 @@ def __init__(self, *args, **kwargs):
     def get_media_requests(self, item, info):
         """Download FFT files using FTP."""
         if item.get('file_urls'):
-            for fft_url in item.get('file_urls'):
+            for fft_url in item.file_urls:
                 yield Request(
                     url=fft_url,
-                    meta=item['ftp_params'],
+                    meta=item.ftp_params,
                 )
 
     def item_completed(self, results, item, info):
@@ -80,7 +61,7 @@ def _get_absolute_local_file_path(path):
                 get_file_name_from_url(result_data['url'])
             ] = _get_absolute_local_file_path(result_data['path'])
 
-        item['file_paths'] = map_file_names_paths
+        item.file_paths = map_file_names_paths
 
         return item
 
@@ -95,92 +76,18 @@ def open_spider(self, spider):
         self.results_data = []
 
     def _post_enhance_item(self, item, spider):
-        def _normalize_hepcrawl_record(item, source):
-            if 'related_article_doi' in item:
-                item['dois'] += item.pop('related_article_doi', [])
-
-            item['titles'] = [{
-                'title': item.pop('title', ''),
-                'subtitle': item.pop('subtitle', ''),
-                'source': source,
-            }]
-
-            item['abstracts'] = [{
-                'value': item.pop('abstract', ''),
-                'source': source,
-            }]
-
-            item['imprints'] = [{
-                'date': item.pop('date_published', ''),
-            }]
-
-            item['copyright'] = [{
-                'holder': item.pop('copyright_holder', ''),
-                'year': item.pop('copyright_year', ''),
-                'statement': item.pop('copyright_statement', ''),
-                'material': item.pop('copyright_material', ''),
-            }]
-
-            if has_publication_info(item):
-                item['publication_info'] = [{
-                    'journal_title': item.pop('journal_title', ''),
-                    'journal_volume': item.pop('journal_volume', ''),
-                    'journal_issue': item.pop('journal_issue', ''),
-                    'artid': item.pop('journal_artid', ''),
-                    'page_start': item.pop('journal_fpage', ''),
-                    'page_end': item.pop('journal_lpage', ''),
-                    'note': item.pop('journal_doctype', ''),
-                    'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
-                    'pubinfo_material': item.pop('pubinfo_material', ''),
-                }]
-                if item.get('journal_year'):
-                    item['publication_info'][0]['year'] = int(
-                        item.pop('journal_year')
-                    )
-
-            # Remove any fields
-            filter_fields(item, [
-                'journal_title',
-                'journal_volume',
-                'journal_year',
-                'journal_issue',
-                'journal_fpage',
-                'journal_lpage',
-                'journal_doctype',
-                'journal_artid',
-                'pubinfo_freetext',
-                'pubinfo_material',
-            ])
-
-            return item
-
-        fft_file_paths = item.get('file_paths')
-        item_format = item.get('format', 'hepcrawl')
-        item = item.get('record_item') if item.get('record_item') else item
-        item = self._generate_record_meta(item, spider)
+        fft_file_paths = item.file_paths
+        item_format = item.item_format
+        item = item.item if item.item else item
         source = spider.name
 
-        if item_format != 'hep':
-            item = _normalize_hepcrawl_record(
-                item=item,
-                source=source,
-            )
-
         return to_hep(
             item=item,
+            source=source,
             item_format=item_format,
             fft_file_paths=fft_file_paths,
         )
 
-    def _generate_record_meta(self, json_record, spider):
-        json_record['acquisition_source'] = {
-            'source': spider.name,
-            'method': 'hepcrawl',
-            'datetime': datetime.datetime.now().isoformat(),
-            'submission_number': os.environ.get('SCRAPY_JOB', ''),
-        }
-        return json_record
-
     def process_item(self, item, spider):
         """Convert internal format to INSPIRE data model."""
         self.count += 1
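Editorial note: the two pipelines now communicate through `ParsedItem` attributes (`item.file_urls`, `item.ftp_params`, `item.file_paths`, `item.item_format`) instead of plain dict keys. A rough sketch of the hand-off, with hypothetical values:

    from hepcrawl.crawler2hep import to_hep
    from hepcrawl.utils import ParsedItem

    parsed_item = ParsedItem(
        item={'_fft': [{'path': 'DESY/FFT/test_fft_1.txt'}]},  # hypothetical
        file_urls=['ftp://example.org/DESY/FFT/test_fft_1.txt'],
        item_format='hep',
    )

    # FftFilesPipeline.item_completed() fills this file-name -> local-path
    # index once the downloads finish.
    parsed_item.file_paths = {'test_fft_1.txt': '/tmp/file_urls/full/abc.txt'}

    # InspireCeleryPushPipeline._post_enhance_item() then reduces to:
    record = to_hep(
        item=parsed_item.item,
        source='desy',  # spider.name
        item_format=parsed_item.item_format,
        fft_file_paths=parsed_item.file_paths,
    )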
diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py
index 2ab883f3..ab151fa6 100644
--- a/hepcrawl/spiders/alpha_spider.py
+++ b/hepcrawl/spiders/alpha_spider.py
@@ -20,7 +20,10 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import has_numbers
+from hepcrawl.utils import (
+    has_numbers,
+    ParsedItem,
+)
 
 
 class AlphaSpider(CrawlSpider):
@@ -145,4 +148,9 @@ def parse(self, response):
             record.add_value('source', 'Alpha experiment')
             record.add_value('collections', ['HEP', 'THESIS'])
 
-            yield record.load_item()
+            parsed_item = ParsedItem(
+                item=record.load_item(),
+                item_format='hepcrawl',
+            )
+
+            yield parsed_item
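Editorial note: every spider below makes the same mechanical change: instead of handing back the bare loaded item, it wraps it in a `ParsedItem` tagged `item_format='hepcrawl'` (the DESY spider uses `'hep'`, since its records already leave dojson in the final schema). The flag is what `to_hep` dispatches on:

    from hepcrawl.utils import ParsedItem

    # Hypothetical payloads, for illustration only.
    crawl_item = ParsedItem(item={'title': 'A thesis'}, item_format='hepcrawl')
    desy_item = ParsedItem(item={'_fft': []}, file_urls=[], item_format='hep')

    assert crawl_item.item_format == 'hepcrawl'  # normalized via crawler2hep
    assert desy_item.item_format == 'hep'        # passed through hep2hep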
diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py
index 496e2e8e..d15c690a 100644
--- a/hepcrawl/spiders/aps_spider.py
+++ b/hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,12 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_licenses, get_nested, build_dict
+from hepcrawl.utils import (
+    get_licenses,
+    get_nested,
+    build_dict,
+    ParsedItem,
+)
 
 
 class APSSpider(Spider):
@@ -110,7 +115,13 @@ def parse(self, response):
             record.add_value('license', license)
             record.add_value('collections', ['HEP', 'Citeable', 'Published'])
-            yield record.load_item()
+
+            parsed_item = ParsedItem(
+                item=record.load_item(),
+                item_format='hepcrawl',
+            )
+
+            yield parsed_item
 
         # Pagination support. Will yield until no more "next" pages are found
         if 'Link' in response.headers:
diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py
index d82c8318..8ab0af4f 100644
--- a/hepcrawl/spiders/arxiv_spider.py
+++ b/hepcrawl/spiders/arxiv_spider.py
@@ -17,7 +17,12 @@
 from scrapy.spiders import XMLFeedSpider
 
 from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
-from ..utils import coll_cleanforthe, get_licenses, split_fullname
+from hepcrawl.utils import (
+    coll_cleanforthe,
+    get_licenses,
+    split_fullname,
+    ParsedItem,
+)
 from ..items import HEPRecord
 from ..loaders import HEPLoader
 
@@ -110,8 +115,12 @@ def parse_node(self, response, node):
             )
             record.add_value('license', license)
 
-        parsed_record = dict(record.load_item())
-        return parsed_record
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
 
     def _get_authors_or_collaboration(self, node):
         """Parse authors, affiliations; extract collaboration"""
diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py
index 5eb22eb7..ee3a7d47 100644
--- a/hepcrawl/spiders/base_spider.py
+++ b/hepcrawl/spiders/base_spider.py
@@ -18,7 +18,12 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from hepcrawl.utils import (
+    get_mime_type,
+    parse_domain,
+    get_node,
+    ParsedItem,
+)
 
 
 class BaseSpider(XMLFeedSpider):
@@ -192,7 +197,13 @@ def build_item(self, response):
         record.add_value("authors", self.get_authors(node))
         record.add_value('thesis', {'degree_type': 'PhD'})
         record.add_value('collections', ['HEP', 'THESIS'])
-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
 
     def scrape_for_pdf(self, response):
         """Scrape splash page for any links to PDFs.
diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py
index 6c881252..3581ee1f 100644
--- a/hepcrawl/spiders/brown_spider.py
+++ b/hepcrawl/spiders/brown_spider.py
@@ -21,7 +21,12 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname, parse_domain, get_mime_type
+from hepcrawl.utils import (
+    split_fullname,
+    parse_domain,
+    get_mime_type,
+    ParsedItem,
+)
 
 
 class BrownSpider(CrawlSpider):
@@ -219,4 +224,9 @@ def build_item(self, response):
         record.add_value('thesis', response.meta.get("thesis"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 76886b23..5ec79da8 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -24,6 +24,9 @@
 from hepcrawl.utils import (
     ftp_list_files,
     ftp_connection_info,
+    get_absolute_file_path,
+    get_file_name_from_url,
+    ParsedItem,
 )
 
 
@@ -127,38 +130,29 @@ def parse(self, response):
         else:
             prefix_url = '{0}://{1}'.format(
                 'file',
-                '/code/tests/functional/desy/fixtures/ftp_server/',
-                # Temporary - normally the absolute path of fft_link upgrade schemas
+                '/code/tests/functional/desy/fixtures/ftp_server/',  # Temporary - Must be absolute path
             )
-            # prefix_url = '{0}://'.format('file')
 
         marcxml_records = self._get_marcxml_records(response.body)
-        hep_records = self._json_records_from_marcxml(marcxml_records)
-
-        # list_fft_old_links = []  # Enable after supporting FFT 2 dojson
-        list_fft_old_links = [
-            {
-                'path': 'FFT/test_fft_1.txt',
-            },
-            {
-                'path': 'FFT/test_fft_2.txt',
-            },
-        ]  # Temporary
+        hep_records = self._hep_records_from_marcxml(marcxml_records)
+
+        list_fft_old_links = []
 
         for hep_record in hep_records:
-            hep_record['_fft'] = list_fft_old_links  # Temporary
-            # list_fft_old_links.extend(json_record['_fft'])  # Enable after supporting FFT 2 dojson
+            list_fft_old_links.extend(hep_record['_fft'])
             list_file_urls = [
                 '{0}{1}'.format(prefix_url, fft_link['path'])
                 for fft_link in hep_record['_fft']
            ]
 
-            yield {
-                'record_item': hep_record,
-                'file_urls': list_file_urls,
-                'ftp_params': ftp_params,
-                'format': 'hep',
-            }
+            parsed_item = ParsedItem(
+                item=hep_record,
+                file_urls=list_file_urls,
+                ftp_params=ftp_params,
+                item_format='hep',
+            )
+
+            yield parsed_item
 
     def handle_package_ftp(self, response):
         """Yield every XML file found."""
@@ -177,7 +171,7 @@ def _get_marcxml_records(self, response_body):
 
         return [etree.tostring(item) for item in list_items]
 
-    def _json_records_from_marcxml(self, list_marcxml_records):
+    def _hep_records_from_marcxml(self, list_marcxml_records):
         def _create_json_record(str_xml_record):
             object_record = create_record(etree.XML(str_xml_record))
             dojson_record = hep.do(object_record)
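Editorial note: the DESY spider no longer hardcodes the two test FFT links. It collects `_fft` from each dojson-converted record and derives download URLs by simple prefixing. A sketch with hypothetical values:

    prefix_url = 'file:///code/tests/functional/desy/fixtures/ftp_server/'
    hep_record = {'_fft': [{'path': 'DESY/FFT/test_fft_1.txt'}]}  # from dojson

    list_file_urls = [
        '{0}{1}'.format(prefix_url, fft_link['path'])
        for fft_link in hep_record['_fft']
    ]
    # ['file:///code/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt']

After download, `_update_record_fft` in crawler2hep.py matches each `_fft` entry back to its local copy via `get_file_name_from_url` (here `'test_fft_1.txt'`) and rewrites only `path`, keeping description, type, version, and the other fields intact.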
diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py
index 3ac8b901..3dd50b59 100644
--- a/hepcrawl/spiders/dnb_spider.py
+++ b/hepcrawl/spiders/dnb_spider.py
@@ -16,7 +16,12 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from hepcrawl.utils import (
+    get_mime_type,
+    parse_domain,
+    get_node,
+    ParsedItem,
+)
 
 
 class DNBSpider(XMLFeedSpider):
@@ -219,4 +224,10 @@ def build_item(self, response):
         record.add_value('thesis', {'degree_type': 'PhD'})
         record.add_value('collections', ['HEP', 'THESIS'])
-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index beea699d..cfb59af5 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -22,7 +22,7 @@
 from ..extractors.jats import Jats
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
     ftp_list_files,
     ftp_connection_info,
     get_first,
@@ -30,6 +30,7 @@
     get_licenses,
     get_node,
     parse_domain,
+    ParsedItem,
 )
 
 
@@ -318,7 +319,12 @@ def build_item_rich(self, response):
         )
         record.add_value("urls", response.meta.get("urls"))
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
 
     def build_item_jats(self, response):
         """Build the final HEPRecord with JATS-format XML ('jp')."""
@@ -388,7 +394,12 @@ def build_item_jats(self, response):
             references = self._get_references(node)
             record.add_value("references", references)
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
 
     def _get_references(self, node):
         """Get the references."""
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index c9aacc00..78fdd5fd 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -25,12 +25,13 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
     get_first,
     get_licenses,
     has_numbers,
     range_as_string,
     unzip_xml_files,
+    ParsedItem,
 )
 from ..dateutils import format_year
 
@@ -1034,4 +1035,9 @@ def build_item(self, response):
         record.add_value('collections', self.get_collections(doctype))
         record.add_value('references', self.get_references(node))
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py
index 941a3674..37871f3a 100644
--- a/hepcrawl/spiders/hindawi_spider.py
+++ b/hepcrawl/spiders/hindawi_spider.py
@@ -16,7 +16,10 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_licenses
+from hepcrawl.utils import (
+    get_licenses,
+    ParsedItem,
+)
 
 
 class HindawiSpider(XMLFeedSpider):
@@ -222,4 +225,9 @@ def parse_node(self, response, node):
         record.add_xpath('source',
                          "./datafield[@tag='260']/subfield[@code='b']/text()")
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py
index 2e970c1c..579ac65b 100644
--- a/hepcrawl/spiders/infn_spider.py
+++ b/hepcrawl/spiders/infn_spider.py
@@ -21,8 +21,10 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_temporary_file
-
+from hepcrawl.utils import (
+    get_temporary_file,
+    ParsedItem,
+)
 from ..dateutils import format_date
 
 
@@ -240,4 +242,9 @@ def build_item(self, response):
         record.add_value('source', 'INFN')
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py
index 0e3bae65..90c7809f 100644
--- a/hepcrawl/spiders/iop_spider.py
+++ b/hepcrawl/spiders/iop_spider.py
@@ -23,6 +23,7 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from hepcrawl.utils import ParsedItem
 
 
 class IOPSpider(XMLFeedSpider, NLM):
@@ -222,4 +223,9 @@ def parse_node(self, response, node):
             record.add_value("additional_files",
                              self.add_fft_file(pdf_file_path, file_access, file_type))
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
index 77bf7948..1c83c829 100644
--- a/hepcrawl/spiders/magic_spider.py
+++ b/hepcrawl/spiders/magic_spider.py
@@ -18,7 +18,10 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname
+from hepcrawl.utils import (
+    split_fullname,
+    ParsedItem,
+)
 
 
 class MagicSpider(XMLFeedSpider):
@@ -176,4 +179,9 @@ def build_item(self, response):
         record.add_value("additional_files", response.meta.get("files"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        yield record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
index c71234f9..4e099348 100644
--- a/hepcrawl/spiders/mit_spider.py
+++ b/hepcrawl/spiders/mit_spider.py
@@ -23,7 +23,11 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_temporary_file, split_fullname
+from hepcrawl.utils import (
+    get_temporary_file,
+    split_fullname,
+    ParsedItem,
+)
 
 
 class MITSpider(XMLFeedSpider):
@@ -223,4 +227,9 @@ def build_item(self, response):
         record.add_value('page_nr', self.get_page_nr(node))
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py
index 7200664e..95bc874a 100644
--- a/hepcrawl/spiders/phenix_spider.py
+++ b/hepcrawl/spiders/phenix_spider.py
@@ -18,6 +18,7 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from hepcrawl.utils import ParsedItem
 
 
 class PhenixSpider(XMLFeedSpider):
@@ -128,4 +129,9 @@ def parse_node(self, response, node):
         record.add_value('source', 'PHENIX')
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py
index 101b1163..8a486292 100644
--- a/hepcrawl/spiders/phil_spider.py
+++ b/hepcrawl/spiders/phil_spider.py
@@ -19,7 +19,11 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import parse_domain, get_mime_type
+from hepcrawl.utils import (
+    parse_domain,
+    get_mime_type,
+    ParsedItem,
+)
 
 
 class PhilSpider(CrawlSpider):
@@ -160,4 +164,9 @@ def build_item(self, response):
         if not jsonrecord.get('year') == "forthcoming":
             record.add_value('journal_year', int(jsonrecord['year']))
 
-        return record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item
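Editorial note: not every callback changed shape the same way. Some still `return` a single `ParsedItem` (arxiv, base, dnb, elsevier, infn, iop, mit, phenix, phil), while others became generators that `yield` it (alpha, aps, brown, edp, hindawi, magic, pos, t2k, wsp). The updated unit tests below account for this by calling `.next()` on generator results; a small helper like the following (hypothetical, not part of this patch) would consume both shapes under Python 2:

    def get_parsed_item(callback_result):
        # Generators expose next(); a plain ParsedItem does not.
        if hasattr(callback_result, 'next'):
            return callback_result.next()
        return callback_result

    # e.g. get_parsed_item(spider.build_item(response)).item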
diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py
index 7d3fb87d..875dbb5e 100644
--- a/hepcrawl/spiders/pos_spider.py
+++ b/hepcrawl/spiders/pos_spider.py
@@ -13,10 +13,16 @@
 
 import re
 
+from urlparse import urljoin
+
 from scrapy import Request, Selector
 from scrapy.spiders import Spider
-from urlparse import urljoin
-from ..utils import get_licenses, get_first
+
+from hepcrawl.utils import (
+    get_licenses,
+    get_first,
+    ParsedItem,
+)
 from ..dateutils import create_valid_date
 from ..items import HEPRecord
 from ..loaders import HEPLoader
@@ -128,7 +134,13 @@ def build_item(self, response):
         record.add_value('extra_data', extra_data)
         record.add_value('collections', ['HEP', 'ConferencePaper'])
-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
 
     def _get_ext_systems_number(self, node):
         return [
diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py
index 661f0bec..97ae8202 100644
--- a/hepcrawl/spiders/t2k_spider.py
+++ b/hepcrawl/spiders/t2k_spider.py
@@ -18,7 +18,10 @@
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname
+from hepcrawl.utils import (
+    split_fullname,
+    ParsedItem,
+)
 
 
 class T2kSpider(XMLFeedSpider):
@@ -164,4 +167,9 @@ def build_item(self, response):
         record.add_value("additional_files", response.meta.get("additional_files"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
-        yield record.load_item()
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index bef40a72..49b5b725 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -20,12 +20,13 @@
 from ..extractors.jats import Jats
 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
     ftp_list_files,
     ftp_connection_info,
     local_list_files,
     get_licenses,
     unzip_xml_files,
+    ParsedItem,
 )
 
 
@@ -148,7 +149,7 @@ def parse_node(self, response, node):
         self.log("Got article_type {0}".format(article_type))
         if article_type is None or article_type[0] not in self.allowed_article_types:
             # Filter out non-interesting article types
-            return None
+            return
 
         record = HEPLoader(item=HEPRecord(), selector=node, response=response)
         if article_type in ['correction',
@@ -203,9 +204,13 @@ def parse_node(self, response, node):
             record.add_value('license', license)
 
         record.add_value('collections', self._get_collections(node, article_type, journal_title))
-        parsed_record = dict(record.load_item())
-        return parsed_record
+
+        parsed_item = ParsedItem(
+            item=dict(record.load_item()),
+            item_format='hepcrawl',
+        )
+
+        yield parsed_item
 
     def _get_collections(self, node, article_type, current_journal_title):
         """Return this articles' collection."""
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index a212593e..71ff3aa6 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -342,3 +342,35 @@ def get_license_by_text(license_text):
 
 def get_file_name_from_url(url):
     return url.rsplit('/', 1)[-1]
+
+
+def get_absolute_file_path(file_path):
+    """Return the absolute path for a relative file path."""
+    return os.path.abspath(file_path)
+
+
+class ParsedItem(dict):
+    """Item interface used to communicate between the spiders and the pipelines."""
+    def __init__(
+        self,
+        item,
+        file_urls=None,
+        item_format=None,
+        ftp_params=None,
+        file_paths=None,
+        **kwargs
+    ):
+        super(ParsedItem, self).__init__(
+            item=item,
+            file_urls=file_urls,
+            item_format=item_format,
+            ftp_params=ftp_params,
+            file_paths=file_paths,
+            **kwargs
+        )
+        self.item = item
+        self.file_urls = file_urls
+        self.item_format = item_format
+        self.ftp_params = ftp_params
+        self.file_paths = file_paths
+        self.__dict__ = self
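Editorial note: because `ParsedItem` subclasses `dict` and ends its `__init__` with `self.__dict__ = self`, key access and attribute access share the same storage. That is what lets the spiders build it with keyword arguments while the pipelines read it as attributes. A quick sketch of that contract, with hypothetical values:

    from hepcrawl.utils import ParsedItem

    parsed_item = ParsedItem(item={'title': 'A record'}, item_format='hepcrawl')

    assert parsed_item['item_format'] == parsed_item.item_format

    # Later attribute assignments, e.g. by FftFilesPipeline.item_completed(),
    # land in the dict as well.
    parsed_item.file_paths = {'test_fft_1.txt': '/tmp/full/abc.txt'}
    assert parsed_item['file_paths'] == parsed_item.file_paths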
diff --git a/tests/functional/desy/fixtures/desy_ftp_records.json b/tests/functional/desy/fixtures/desy_ftp_records.json
index 6a9b6c62..f685a254 100644
--- a/tests/functional/desy/fixtures/desy_ftp_records.json
+++ b/tests/functional/desy/fixtures/desy_ftp_records.json
@@ -10,10 +10,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
         },
         {
-            "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
         }
     ],
     "control_number": 111111,
@@ -76,10 +87,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
         },
         {
-            "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
         }
     ],
     "control_number": 222222,
@@ -142,10 +164,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 333333,
@@ -208,10 +241,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 444444,
diff --git a/tests/functional/desy/fixtures/desy_local_records.json b/tests/functional/desy/fixtures/desy_local_records.json
index 57d780e9..6fe2c4d0 100644
--- a/tests/functional/desy/fixtures/desy_local_records.json
+++ b/tests/functional/desy/fixtures/desy_local_records.json
@@ -10,10 +10,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 111111,
@@ -76,10 +87,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 222222,
@@ -142,10 +164,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 333333,
@@ -208,10 +241,21 @@
     ],
     "_fft": [
         {
-            "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:17",
+            "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+            "type": "Main",
+            "filename": "cNFW_rogue_curves"
        },
        {
-            "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+            "version": 1,
+            "creation_datetime": "2017-06-27T09:43:16",
+            "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+            "format": ".txt",
+            "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+            "type": "Main",
+            "filename": "scalingRelations_DutBeh_DC14_all_Oh"
        }
     ],
     "control_number": 444444,
diff --git a/tests/functional/desy/fixtures/ftp_server/FFT/test_fft_1.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
similarity index 100%
rename from tests/functional/desy/fixtures/ftp_server/FFT/test_fft_1.txt
rename to tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
diff --git a/tests/functional/desy/fixtures/ftp_server/FFT/test_fft_2.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt
similarity index 100%
rename from tests/functional/desy/fixtures/ftp_server/FFT/test_fft_2.txt
rename to tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
index 5a57f51c..6900d746 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
@@ -50,6 +50,28 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
 
   222222
@@ -101,5 +123,27 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
\ No newline at end of file
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
index 44266cd4..2067b5e7 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
@@ -50,6 +50,28 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
 
   444444
@@ -101,5 +123,27 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
\ No newline at end of file
diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py
index 2b4d8c71..321766f9 100644
--- a/tests/functional/desy/test_desy.py
+++ b/tests/functional/desy/test_desy.py
@@ -51,6 +51,7 @@ def get_fft_1_path():
         'desy',
         'fixtures',
         'ftp_server',
+        'DESY',
         'FFT',
         'test_fft_1.txt',
         test_suite='functional',
@@ -63,6 +64,7 @@ def get_fft_2_path():
         'desy',
         'fixtures',
         'ftp_server',
+        'DESY',
         'FFT',
         'test_fft_2.txt',
         test_suite='functional',
@@ -159,8 +161,8 @@ def test_desy_ftp(
     for record in expected_results:
         fft_file_paths = sorted(record['_fft'])
 
-        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
-        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
+        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path)
+        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path)
 
 
 @pytest.mark.parametrize(
@@ -205,6 +207,6 @@ def test_desy_local_package_path(
     for record in expected_results:
         fft_file_paths = sorted(record['_fft'])
 
-        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
-        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
-
+        assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path)
+        assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path)
diff --git a/tests/unit/responses/desy/desy_collection_records.xml b/tests/unit/responses/desy/desy_collection_records.xml
index 5a57f51c..d2086694 100644
--- a/tests/unit/responses/desy/desy_collection_records.xml
+++ b/tests/unit/responses/desy/desy_collection_records.xml
@@ -50,6 +50,28 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt;1
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt;1
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
 
   222222
@@ -100,6 +122,28 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+
+    DESY/FFT/test_fft_1.txt;1
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt;1
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
\ No newline at end of file
diff --git a/tests/unit/responses/desy/desy_record.xml b/tests/unit/responses/desy/desy_record.xml
index 8219064f..437c45b3 100644
--- a/tests/unit/responses/desy/desy_record.xml
+++ b/tests/unit/responses/desy/desy_record.xml
@@ -50,5 +50,27 @@
     oai:inspirehep.net:1608652
     INSPIRE:HEP
+
+    DESY/FFT/test_fft_1.txt;1
+    00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
+    .txt
+    cNFW_rogue_curves
+
+    2017-06-27 09:43:17
+    Main
+    1
+
+
+    DESY/FFT/test_fft_2.txt;1
+    00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
+    .txt
+    scalingRelations_DutBeh_DC14_all_Oh
+
+    2017-06-27 09:43:16
+    Main
+    1
+
\ No newline at end of file
diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py
index eef140b1..ad8f3f03 100644
--- a/tests/unit/test_alpha.py
+++ b/tests/unit/test_alpha.py
@@ -20,13 +20,15 @@
 def results():
     """Return results generator from the Alpha spider."""
     spider = alpha_spider.AlphaSpider()
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file('alpha/test_1.htm')
         )
     )
 
+    records = [parsed_item.item for parsed_item in parsed_items]
+
     assert records
+
     return records
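Editorial note: the remaining unit-test updates all follow the pattern just shown for test_alpha.py. Since spiders now emit `ParsedItem` objects (or generators of them), the fixtures unwrap `parsed_item.item` before asserting on record fields:

    # fake_response is a placeholder for the fixture response in each test.
    parsed_items = list(spider.parse(fake_response))
    records = [parsed_item.item for parsed_item in parsed_items]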
diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py
index eb53269d..8bc66033 100644
--- a/tests/unit/test_aps.py
+++ b/tests/unit/test_aps.py
@@ -21,7 +21,7 @@ def results():
     from scrapy.http import TextResponse
 
     spider = aps_spider.APSSpider()
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'aps/aps_single_response.json',
@@ -30,6 +30,8 @@
             )
         )
 
+    records = [parsed_item.item for parsed_item in parsed_items]
+
     assert records
     return records
diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py
index d395c494..47ea20db 100644
--- a/tests/unit/test_arxiv_all.py
+++ b/tests/unit/test_arxiv_all.py
@@ -35,7 +35,7 @@ def _get_processed_item(record, spider):
         item = pipeline.process_item(record, spider)
         return item
 
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'arxiv/sample_arxiv_record.xml',
@@ -44,10 +44,10 @@
             )
         )
 
-    assert records
 
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    return [_get_processed_item(record, spider) for record in records]
+
+    return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items]
 
 
 def test_page_nr(many_results):
diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py
index b7ca410d..d8e6f9e5 100644
--- a/tests/unit/test_arxiv_single.py
+++ b/tests/unit/test_arxiv_single.py
@@ -31,7 +31,7 @@ def _get_processed_item(record, spider):
     crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
     spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'arxiv/sample_arxiv_record0.xml',
@@ -40,11 +40,10 @@
             )
         )
 
-    assert records
 
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    return [_get_processed_item(record, spider) for record in records]
+    return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items]
diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py
index cc6ef093..48551cdf 100644
--- a/tests/unit/test_base.py
+++ b/tests/unit/test_base.py
@@ -38,9 +38,11 @@ def record():
     nodes = selector.xpath('.//%s' % spider.itertag)
     response.meta["record"] = nodes[0].extract()
     response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -169,7 +171,10 @@ def splash():
             'Content-Type': 'text/html',
         },
     )
-    return spider.scrape_for_pdf(splash_response)
+
+    parsed_item = spider.scrape_for_pdf(splash_response)
+
+    return parsed_item.item
 
 
 def test_splash(splash):
@@ -201,7 +206,10 @@ def parsed_node():
     response = fake_response_from_string(text=body)
     node = get_node(spider, 'OAI-PMH:record', text=body)
     response.meta["record"] = node[0].extract()
-    return spider.parse_node(response, node[0])
+
+    parsed_item = spider.parse_node(response, node[0])
+
+    return parsed_item.item
 
 
 def test_parsed_node(parsed_node):
diff --git a/tests/unit/test_brown.py b/tests/unit/test_brown.py
index 0b42b4df..41e3902d 100644
--- a/tests/unit/test_brown.py
+++ b/tests/unit/test_brown.py
@@ -41,10 +41,12 @@ def record():
     splash_response = fake_response_from_file('brown/test_splash.html')
     splash_response.meta["jsonrecord"] = jsonrecord
-    parsed_record = spider.scrape_splash(splash_response)
+    iter_item = spider.scrape_splash(splash_response)
 
-    assert parsed_record
-    return parsed_record
+    parsed_item = iter_item.next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -200,7 +202,12 @@ def parsed_node_no_splash():
     jsonrecord = jsonresponse["items"]["docs"][0]
     response.meta["jsonrecord"] = jsonrecord
 
-    return spider.parse(response).next()
+    iter_item = spider.parse(response).next()
+
+    parsed_item = iter_item.next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_no_splash(parsed_node_no_splash):
diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py
index 73999356..5b01f7fd 100644
--- a/tests/unit/test_desy.py
+++ b/tests/unit/test_desy.py
@@ -82,10 +82,34 @@ def test_pipeline_record(generated_record):
         ],
         '_fft': [
             {
-                'path': 'FFT/test_fft_1.txt'
+                'creation_datetime': '2017-06-27T09:43:17',
+                'description': '00013 Decomposition of the problematic rotation curves in our '
+                               'sample according to the best-fit \\textsc{core}NFW models. '
+                               'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+                'filename': 'cNFW_rogue_curves',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_1.txt;1',
+                'type': 'Main',
+                'version': 1,
             },
             {
-                'path': 'FFT/test_fft_2.txt'
+                'creation_datetime': '2017-06-27T09:43:16',
+                'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+                               'the cosmological halo mass-concentration relation from \\'
+                               'cite{dutton14} (left) and the stellar mass-halo mass relation '
+                               'from \\cite{behroozi13} (right). The error bars correspond to the '
+                               'extremal values of the multidimensional 68\\% confidence region '
+                               'for each fit. The theoretical relations are shown as red lines '
+                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+                               'the dark and light grey bands, respectively. The '
+                               'mass-concentration relation from \\cite{maccio08} and the stellar'
+                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
+                               'as the black dashed lines.',
+                'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_2.txt;1',
+                'type': 'Main',
+                'version': 1
            }
         ],
         'abstracts': [
@@ -182,11 +206,35 @@ def test_pipeline_collection_records(generated_records):
         ],
         "_fft": [
             {
-                "path": "FFT/test_fft_1.txt"
+                'creation_datetime': '2017-06-27T09:43:17',
+                'description': '00013 Decomposition of the problematic rotation curves in our '
+                               'sample according to the best-fit \\textsc{core}NFW models. '
+                               'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+                'filename': 'cNFW_rogue_curves',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_1.txt;1',
+                'type': 'Main',
+                'version': 1,
             },
             {
-                "path": "FFT/test_fft_2.txt"
-            },
+                'creation_datetime': '2017-06-27T09:43:16',
+                'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+                               'the cosmological halo mass-concentration relation from \\'
+                               'cite{dutton14} (left) and the stellar mass-halo mass relation '
+                               'from \\cite{behroozi13} (right). The error bars correspond to the '
+                               'extremal values of the multidimensional 68\\% confidence region '
+                               'for each fit. The theoretical relations are shown as red lines '
+                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+                               'the dark and light grey bands, respectively. The '
+                               'mass-concentration relation from \\cite{maccio08} and the stellar'
+                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
+                               'as the black dashed lines.',
+                'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_2.txt;1',
+                'type': 'Main',
+                'version': 1
+            }
         ],
         "control_number": 111111,
         "public_notes": [
@@ -248,11 +296,35 @@
         ],
         "_fft": [
             {
-                "path": "FFT/test_fft_1.txt"
+                'creation_datetime': '2017-06-27T09:43:17',
+                'description': '00013 Decomposition of the problematic rotation curves in our '
+                               'sample according to the best-fit \\textsc{core}NFW models. '
+                               'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+                'filename': 'cNFW_rogue_curves',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_1.txt;1',
+                'type': 'Main',
+                'version': 1,
            },
            {
-                "path": "FFT/test_fft_2.txt"
-            },
+                'creation_datetime': '2017-06-27T09:43:16',
+                'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+                               'the cosmological halo mass-concentration relation from \\'
+                               'cite{dutton14} (left) and the stellar mass-halo mass relation '
+                               'from \\cite{behroozi13} (right). The error bars correspond to the '
+                               'extremal values of the multidimensional 68\\% confidence region '
+                               'for each fit. The theoretical relations are shown as red lines '
+                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+                               'the dark and light grey bands, respectively. The '
+                               'mass-concentration relation from \\cite{maccio08} and the stellar'
+                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
+                               'as the black dashed lines.',
+                'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_2.txt;1',
+                'type': 'Main',
+                'version': 1
+            }
         ],
         "control_number": 222222,
         "public_notes": [
+                               'from \\cite{behroozi13} (right). The error bars correspond to the '
+                               'extremal values of the multidimensional 68\\% confidence region '
+                               'for each fit. The theoretical relations are shown as red lines '
+                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+                               'the dark and light grey bands, respectively. The '
+                               'mass-concentration relation from \\cite{maccio08} and the stellar'
+                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
+                               'as the black dashed lines.',
+                'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+                'format': '.txt',
+                'path': 'DESY/FFT/test_fft_2.txt;1',
+                'type': 'Main',
+                'version': 1
+            }
         ],
         "control_number": 222222,
         "public_notes": [
diff --git a/tests/unit/test_dnb.py b/tests/unit/test_dnb.py
index b00aff3d..a1a22dbd 100644
--- a/tests/unit/test_dnb.py
+++ b/tests/unit/test_dnb.py
@@ -72,7 +72,11 @@ def record(scrape_pos_page_body):
         body=scrape_pos_page_body,
         **{'encoding': 'utf-8'}
     )
-    return request.callback(response)
+
+    parsed_item = request.callback(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_title(record):
@@ -241,7 +245,9 @@ def parse_without_splash():
             'Content-Type': 'application/pdf;charset=base64',
         }
     )
-    return spider.parse_node(response, nodes[0])
+
+    parsed_item = spider.parse_node(response, nodes[0])
+    return parsed_item.item
 
 
 def test_parse_without_splash(parse_without_splash):
diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
index cc7885bd..7e841015 100644
--- a/tests/unit/test_edp.py
+++ b/tests/unit/test_edp.py
@@ -40,6 +40,7 @@ def scrape_pos_page_body():
         )
     )
 
+
 @pytest.fixture
 def targzfile():
     """Path to test tar.gz file with JATS XML file."""
@@ -50,6 +51,7 @@ def targzfile():
         'test_gz.tar.gz'
     )
 
+
 @pytest.fixture
 def package_jats(targzfile):
     """Extract tar.gz package with JATS XML file."""
@@ -75,7 +77,12 @@ def record_jats(package_jats, scrape_pos_page_body):
         body=scrape_pos_page_body,
         **{'encoding': 'utf-8'}
     )
-    return request.callback(response)
+
+    iter_item = request.callback(response)
+    parsed_item = iter_item.next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -107,7 +114,11 @@ def record_rich(package_rich):
     fake_resp.meta["rich"] = True
     node = get_node(spider, "//EDPSArticle", fake_resp)[0]
 
-    return spider.parse_node(fake_resp, node)
+    iter_item = spider.parse_node(fake_resp, node)
+    parsed_item = iter_item.next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_title(record_jats):
@@ -145,6 +156,7 @@ def test_abstract(record_jats):
     assert 'abstract' in record_jats
     assert record_jats['abstract'] == abstract
 
+
 def test_date_published(record_jats):
     """Test extracting date_published."""
     date_published = "2015-01-01"
@@ -179,6 +191,7 @@ def test_doi(record_jats):
     assert 'dois' in record_jats
     assert record_jats['dois'][0]['value'] == doi
 
+
 def test_publication_info(record_jats):
     """Test extracting publication info."""
     assert 'journal_title' in record_jats
@@ -206,7 +219,6 @@ def test_keywords(record_jats):
         assert keyw["value"] in keywords
 
 
-
 def test_authors(record_jats):
     """Test authors."""
     authors = ["Arasoglu, Ali", "Ozdemir, Omer Faruk"]
@@ -326,7 +338,6 @@ def test_authors_rich(record_rich):
         assert astr[index]["affiliations"][0]["value"] == affiliations[index]
 
 
-
 def test_tarfile(tarbzfile, tmpdir):
     """Test untarring a tar.bz package with a test XML file.
@@ -343,7 +354,6 @@ def test_tarfile(tarbzfile, tmpdir):
     assert "aas/xml_rich/2000/01" not in xml_files_flat[0]
 
 
-
 def test_handle_package_ftp(tarbzfile):
     """Test getting the target folder name for xml files."""
     spider = edp_spider.EDPSpider()
@@ -353,6 +363,7 @@ def test_handle_package_ftp(tarbzfile):
     assert isinstance(request, Request)
     assert request.meta["package_path"] == tarbzfile
 
+
 def test_no_dois_jats():
     """Test parsing when no DOI in record. JATS format."""
     spider = edp_spider.EDPSpider()
@@ -370,7 +381,10 @@ def test_no_dois_jats():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     assert "dois" not in record
     assert "additional_files" not in record
@@ -390,7 +404,10 @@ def test_no_dois_rich():
     response = fake_response_from_string(body)
     response.meta["rich"] = True
     node = get_node(spider, "//EDPSArticle", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     assert "dois" not in record
     assert "additional_files" not in record
@@ -416,7 +433,10 @@ def test_addendum_jats():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     assert "related_article_doi" in record
     assert record["related_article_doi"][0][
@@ -439,7 +459,10 @@ def test_author_with_email():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     assert 'email' in record['authors'][0]
     assert record['authors'][0]['email'] == "Fname.Sname@university.org"
@@ -472,7 +495,10 @@ def test_aff_with_email():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
     assert 'affiliations' in record['authors'][0]
@@ -481,8 +507,6 @@ def test_aff_with_email():
     assert record['authors'][0]['email'] is None
 
 
-
-
 def test_no_valid_article():
     """Test parsing when filtering out non-interesting article types."""
     spider = edp_spider.EDPSpider()
@@ -506,7 +530,10 @@ def test_collections_review():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+    record = parsed_item.item
 
     assert "collections" in record
     assert record["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}]
@@ -533,7 +560,11 @@ def record_references_only():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    return spider.parse_node(response, node)
+
+    iter_item = spider.parse_node(response, node)
+    parsed_item = iter_item.next()
+
+    return parsed_item.item
 
 
 def test_references(record_references_only):
diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
index ca023122..109f3d3f 100644
--- a/tests/unit/test_elsevier.py
+++ b/tests/unit/test_elsevier.py
@@ -41,9 +41,11 @@ def record():
     response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
     tag = '//%s' % spider.itertag
     nodes = get_node(spider, tag, response)
-    parsed_record = spider.parse_node(response, nodes)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, nodes)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture(scope="module")
@@ -97,7 +99,11 @@ def parsed_node():
     response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
     parse_response = spider.parse_node(response, node)
     parse_response.status = 404
-    return spider.scrape_sciencedirect(parse_response)
+
+    parsed_item = spider.scrape_sciencedirect(parse_response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_collection(parsed_node):
@@ -164,7 +170,10 @@ def cover_display_date():
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_cover_display_date(cover_display_date):
@@ -187,7 +196,10 @@ def cover_display_date_y_m():
     """
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_cover_display_date_y_m(cover_display_date_y_m):
@@ -210,7 +222,10 @@ def cover_display_date_y():
     """
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_cover_display_date_y(cover_display_date_y):
@@ -1644,7 +1659,11 @@ def sciencedirect():
     ])
     response.meta["info"] = {}
     response.meta["node"] = get_node(spider, '/head', text=body)
-    return spider.scrape_sciencedirect(response)
+
+    parsed_item = spider.scrape_sciencedirect(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_sciencedirect(sciencedirect):
diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py
index 37e5e183..84ebd06d 100644
--- a/tests/unit/test_hindawi.py
+++ b/tests/unit/test_hindawi.py
@@ -26,9 +26,11 @@ def record():
     response = fake_response_from_file("hindawi/test_1.xml")
     nodes = get_node(spider, "//marc:record", response)
 
-    parsed_record = spider.parse_node(response, nodes[0])
-    assert parsed_record
-    return parsed_record
+    iter_item = spider.parse_node(response, nodes[0])
+    parsed_item = iter_item.next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_title(record):
diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py
index 0c60799a..5cd1e27d 100644
--- a/tests/unit/test_infn.py
+++ b/tests/unit/test_infn.py
@@ -28,9 +28,11 @@ def record():
     """Return scraping results from the INFN spider."""
     spider = infn_spider.InfnSpider()
     response = fake_response_from_file('infn/test_splash.html')
-    parsed_record = spider.scrape_splash(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.scrape_splash(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_title(record):
@@ -121,6 +123,7 @@ def test_non_thesis():
 
     assert record is None
 
+
 def test_parse_node():
     """Test parse_node function. This should be a scrapy Request object.
@@ -148,6 +151,6 @@ def test_parse_node_nolink():
     response = fake_response_from_file('infn/test_1_nolink.html')
     selector = Selector(response, type='html')
     node = selector.xpath('//%s' % spider.itertag)[0]
-    record = spider.parse_node(response, node).next()
+    parsed_item = spider.parse_node(response, node).next()
 
-    assert isinstance(record, hepcrawl.items.HEPRecord)
+    assert isinstance(parsed_item.item, hepcrawl.items.HEPRecord)
diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py
index b776adfa..fb8d26d2 100644
--- a/tests/unit/test_iop.py
+++ b/tests/unit/test_iop.py
@@ -38,9 +38,11 @@ def record():
     response = fake_response_from_file('iop/xml/test_standard.xml')
     node = get_node(spider, "Article", response)
     spider.pdf_files = TEST_PDF_DIR
-    parsed_record = spider.parse_node(response, node)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_abstract(record):
@@ -182,10 +184,11 @@ def erratum_open_access_record():
         'iop',
         'pdf',
     )
-    parsed_record = spider.parse_node(response, node)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_files_erratum_open_access_record(erratum_open_access_record):
diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py
index eeb574fe..74d9fad4 100644
--- a/tests/unit/test_magic.py
+++ b/tests/unit/test_magic.py
@@ -23,6 +23,7 @@
     get_node,
 )
 
+
 @pytest.fixture
 def record():
     """Return results from the MAGIC spider. First parse node, then scrape,
@@ -39,9 +40,10 @@ def record():
     splash_response.meta["date"] = parsed_node.meta["date"]
     splash_response.meta["urls"] = parsed_node.meta["urls"]
 
-    parsed_record = spider.scrape_for_pdf(splash_response).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.scrape_for_pdf(splash_response).next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_abstract(record):
@@ -102,7 +104,6 @@ def test_abstract(record):
     assert record["abstract"] == abstract
 
 
-
 def test_title(record):
     """Test extracting title."""
     title = "Limits to the violation of Lorentz invariance using the emission of the CRAB pulsar at TeV energies, discovered with archival data from the MAGIC telescopes"
@@ -139,6 +140,7 @@ def test_url(record):
     assert 'urls' in record
     assert record['urls'][0]['value'] == url
 
+
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf"
@@ -164,8 +166,9 @@ def test_no_author_no_date_no_url():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, spider.itertag, text=body)
-    record = spider.parse_node(response, node).next()
+    parsed_item = spider.parse_node(response, node).next()
+    record = parsed_item.item
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "date" not in record
     assert "authors" not in record
@@ -184,8 +187,9 @@ def test_no_aff():
     """
     response = fake_response_from_string(body)
-    record = spider.scrape_for_pdf(response).next()
+    parsed_item = spider.scrape_for_pdf(response).next()
+    record = parsed_item.item
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "date" not in record
     assert "affiliations" not in record["authors"]
@@ -216,8 +220,9 @@ def test_no_spash_page():
     response.status = 404
     response.meta["title"] = parsed_node.meta["title"]
     response.meta["urls"] = parsed_node.meta["urls"]
-    record = spider.scrape_for_pdf(response).next()
+    parsed_item = spider.scrape_for_pdf(response).next()
+    record = parsed_item.item
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "urls" in record
     assert "title" in record
diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py
index 0253d91f..2629dd34 100644
--- a/tests/unit/test_mit.py
+++ b/tests/unit/test_mit.py
@@ -25,9 +25,11 @@ def record():
     """Return scraping results from the MIT spider."""
     spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_splash.html')
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -37,7 +39,11 @@ def parsed_node():
     response = fake_response_from_file('mit/test_list.html')
     tag = spider.itertag
     node = get_node(spider, tag, response, rtype="html")
-    return spider.parse_node(response, node).next()
+
+    parsed_item = spider.parse_node(response, node).next()
+    assert parsed_item
+
+    return parsed_item
 
 
 def test_url(parsed_node):
@@ -159,7 +165,11 @@ def supervisors():
     """
     response = fake_response_from_string(body)
-    return spider.build_item(response)
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_two_supervisors(supervisors):
diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py
index 75384350..5a6a5f4c 100644
--- a/tests/unit/test_phenix.py
+++ b/tests/unit/test_phenix.py
@@ -29,9 +29,12 @@ def record():
     response = fake_response_from_file('phenix/test_1.html')
     selector = Selector(response, type='html')
     nodes = selector.xpath('//%s' % spider.itertag)
-    parsed_record = spider.parse_node(response, nodes[0])
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, nodes[0])
+    assert parsed_item
+
+    return parsed_item.item
+
 
 @pytest.fixture
 def non_thesis():
@@ -49,10 +52,12 @@ def non_thesis():
     node = get_node(spider, '//li', text=body)
     return spider.parse_node(response, node)
 
+
 def test_non_thesis(non_thesis):
     """Test MSc thesis skipping."""
     assert non_thesis is None
 
+
 def test_title(record):
     """Test extracting title."""
     title = "MEASUREMENT OF THE DOUBLE HELICITY ASYMMETRY IN INCLUSIVE $\pi^{0}$ PRODUCTION IN POLARIZED PROTON-PROTON COLLISIONS AT $\sqrt{s}$ = 510 GeV"
@@ -82,6 +87,7 @@ def test_authors(record):
             aff['value'] for aff in record['authors'][index]['affiliations']
         ]
 
+
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf"
diff --git a/tests/unit/test_phil.py b/tests/unit/test_phil.py
index e99064b2..be0da905 100644
--- a/tests/unit/test_phil.py
+++ b/tests/unit/test_phil.py
@@ -33,9 +33,11 @@ def record():
         "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fanalysis.oxfordjournals.org%2Fcontent%2F66%2F3%2F194.full.pdf%2Bhtml%3Fframe%3Dsidebar",
         "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fbrogaardb.googlepages.com%2Ftensedrelationsoffprint.pdf"
     ]
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -48,7 +50,11 @@ def journal():
     response = fake_response_from_file('phil/test_journal.json')
     jsonrecord = json.loads(response.body_as_unicode())
     response.meta["jsonrecord"] = jsonrecord[0]
-    return spider.build_item(response)
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 @pytest.fixture
@@ -223,7 +229,10 @@ def splash():
         ]
     }
 
-    return spider.scrape_for_pdf(response)
+    parsed_item = spider.scrape_for_pdf(response)
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_scrape(splash):
diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py
index 897a0112..8c9c8e59 100644
--- a/tests/unit/test_pos.py
+++ b/tests/unit/test_pos.py
@@ -51,9 +51,12 @@ def record(scrape_pos_page_body):
     assert response
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    record = request.callback(response)
-    processed_record = pipeline.process_item(record, spider)
-    return processed_record
+    iter_record = request.callback(response)
+
+    item = iter_record.next()
+    parsed_item = pipeline.process_item(item, spider)
+
+    return parsed_item
 
 
 def test_titles(record):
diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py
index 283a02e5..caaaaefc 100644
--- a/tests/unit/test_t2k.py
+++ b/tests/unit/test_t2k.py
@@ -36,9 +36,10 @@ def record():
     splash_response.meta["urls"] = parsed_node.meta["urls"]
     splash_response.meta["authors"] = parsed_node.meta["authors"]
 
-    parsed_record = spider.scrape_for_pdf(splash_response).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.scrape_for_pdf(splash_response).next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_abstact(record):
@@ -125,9 +126,10 @@ def non_url():
     selector = Selector(response, type='html')
     nodes = selector.xpath('//%s' % spider.itertag)
 
-    parsed_record = spider.parse_node(response, nodes[0]).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.parse_node(response, nodes[0]).next()
+    assert parsed_item
+
+    return parsed_item.item
 
 
 def test_non_url(non_url):
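
Note on the recurring change above: spider callbacks no longer hand back a bare
record. Every fixture now drains the callback's iterator (the Python 2
`.next()` calls) and unwraps the result via `parsed_item.item` before any field
assertions run. Below is a minimal, self-contained sketch of the shape these
tests assume; the `ParsedItem` class and `fake_parse_node` helper are
illustrative only, since the patch shows just the `.item` attribute and not the
wrapper's actual definition in hepcrawl.

    # Hypothetical stand-in for the wrapper these tests unwrap; the real
    # class lives in hepcrawl, only its `.item` attribute is visible here.
    class ParsedItem(object):
        def __init__(self, item):
            self.item = item  # the scraped record, e.g. a HEPRecord


    def fake_parse_node():
        # Mimics a spider callback that yields wrapped records.
        yield ParsedItem({'titles': [{'title': 'Example'}]})


    iter_item = fake_parse_node()
    parsed_item = iter_item.next()  # Python 2 generator protocol, as in the tests
    assert parsed_item

    record = parsed_item.item       # fixtures return this unwrapped record
    assert 'titles' in record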