diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index afbf94b1..add53f70 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -15,34 +15,138 @@
from __future__ import absolute_import, division, print_function
+import datetime
+import os
+
from inspire_schemas.api import LiteratureBuilder
from hepcrawl.utils import get_file_name_from_url
-def _update_record_fft_links(record, map_fft_file_paths):
- def _list_new_fft_links(old_fft_links, map_fft_file_paths):
- new_fft_links = []
- for fft_link in old_fft_links:
- file_name = get_file_name_from_url(fft_link['path'])
- if file_name in map_fft_file_paths:
- new_fft_links.append(
- {
- 'path': map_fft_file_paths[file_name],
- }
- )
+def _update_record_fft(record, index_fft_file_paths):
+ def _update_fft_fields(fft_fields, index_fft_file_paths):
+ new_fft_fields = []
+ for fft_field in fft_fields:
+ file_name = get_file_name_from_url(fft_field['path'])
+ if file_name in index_fft_file_paths:
+ fft_field['path'] = index_fft_file_paths[file_name]
+ new_fft_fields.append(fft_field)
- return new_fft_links
+ return new_fft_fields
- old_fft_links = record['_fft']
- record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
+ record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
return record
-def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
+def _has_publication_info(item):
+ """If any publication info."""
+ return item.get('pubinfo_freetext') or item.get('journal_volume') or \
+ item.get('journal_title') or \
+ item.get('journal_year') or \
+ item.get('journal_issue') or \
+ item.get('journal_fpage') or \
+ item.get('journal_lpage') or \
+ item.get('journal_artid') or \
+ item.get('journal_doctype')
+
+
+def _filter_fields(item, keys):
+ """Filter away keys."""
+ for key in keys:
+ item.pop(key, None)
+
+
+def _normalize_hepcrawl_record(item, source):
+ if 'related_article_doi' in item:
+ item['dois'] += item.pop('related_article_doi', [])
+
+ item['titles'] = [{
+ 'title': item.pop('title', ''),
+ 'subtitle': item.pop('subtitle', ''),
+ 'source': source,
+ }]
+
+ item['abstracts'] = [{
+ 'value': item.pop('abstract', ''),
+ 'source': source,
+ }]
+
+ item['imprints'] = [{
+ 'date': item.pop('date_published', ''),
+ }]
+
+ item['copyright'] = [{
+ 'holder': item.pop('copyright_holder', ''),
+ 'year': item.pop('copyright_year', ''),
+ 'statement': item.pop('copyright_statement', ''),
+ 'material': item.pop('copyright_material', ''),
+ }]
+
+ if _has_publication_info(item):
+ item['publication_info'] = [{
+ 'journal_title': item.pop('journal_title', ''),
+ 'journal_volume': item.pop('journal_volume', ''),
+ 'journal_issue': item.pop('journal_issue', ''),
+ 'artid': item.pop('journal_artid', ''),
+ 'page_start': item.pop('journal_fpage', ''),
+ 'page_end': item.pop('journal_lpage', ''),
+ 'note': item.pop('journal_doctype', ''),
+ 'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
+ 'pubinfo_material': item.pop('pubinfo_material', ''),
+ }]
+ if item.get('journal_year'):
+ item['publication_info'][0]['year'] = int(
+ item.pop('journal_year')
+ )
+
+    # Remove any journal/pubinfo fields left over from the mapping above
+ _filter_fields(item, [
+ 'journal_title',
+ 'journal_volume',
+ 'journal_year',
+ 'journal_issue',
+ 'journal_fpage',
+ 'journal_lpage',
+ 'journal_doctype',
+ 'journal_artid',
+ 'pubinfo_freetext',
+ 'pubinfo_material',
+ ])
+
+ return item
+
+
+def _generate_acquisition_source(crawler_record, source):
+ crawler_record['acquisition_source'] = {
+ 'source': source,
+ 'method': 'hepcrawl',
+ 'datetime': datetime.datetime.now().isoformat(),
+ 'submission_number': os.environ.get('SCRAPY_JOB', ''),
+ }
+ return crawler_record
+
+
+def to_hep(
+ item,
+ source,
+ item_format='hepcrawl',
+ fft_file_paths=None,
+):
+ item = _generate_acquisition_source(
+ crawler_record=item,
+ source=source,
+ )
+
if item_format == 'hep':
- return hep2hep(item, fft_file_paths)
+ return hep2hep(
+ crawler_record=item,
+ fft_file_paths=fft_file_paths,
+ )
elif item_format == 'hepcrawl':
+ item = _normalize_hepcrawl_record(
+ item=item,
+ source=source,
+ )
return crawler2hep(dict(item))
else:
raise Exception('Unknown item_format::{}'.format(item_format))
@@ -50,7 +154,7 @@ def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
def hep2hep(crawler_record, fft_file_paths):
if fft_file_paths:
- crawler_record = _update_record_fft_links(crawler_record, fft_file_paths)
+ crawler_record = _update_record_fft(crawler_record, fft_file_paths)
return crawler_record
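For clarity, here is a minimal, self-contained sketch of the path rewrite that `_update_record_fft` performs; the helper mirrors `hepcrawl.utils.get_file_name_from_url`, and the sample record and index values are illustrative only. Note that FFT entries whose file name is missing from the index are dropped rather than kept with their old path:

```python
def get_file_name_from_url(url):
    # Same logic as hepcrawl.utils.get_file_name_from_url.
    return url.rsplit('/', 1)[-1]


def update_fft_fields(fft_fields, index_fft_file_paths):
    # Rewrite each FFT path to the locally downloaded file; entries
    # with no downloaded counterpart are filtered out of the record.
    new_fft_fields = []
    for fft_field in fft_fields:
        file_name = get_file_name_from_url(fft_field['path'])
        if file_name in index_fft_file_paths:
            fft_field['path'] = index_fft_file_paths[file_name]
            new_fft_fields.append(fft_field)
    return new_fft_fields


fft = [{'path': 'DESY/FFT/test_fft_1.txt', 'type': 'Main'}]
index = {'test_fft_1.txt': '/tmp/file_urls/full/809d9d2b.txt'}
print(update_fft_fields(fft, index))
# [{'path': '/tmp/file_urls/full/809d9d2b.txt', 'type': 'Main'}]
```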
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 2244d255..05b61361 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -15,7 +15,6 @@
from __future__ import absolute_import, division, print_function
-import datetime
import os
import requests
@@ -30,24 +29,6 @@
from hepcrawl.utils import get_file_name_from_url
-def has_publication_info(item):
- """If any publication info."""
- return item.get('pubinfo_freetext') or item.get('journal_volume') or \
- item.get('journal_title') or \
- item.get('journal_year') or \
- item.get('journal_issue') or \
- item.get('journal_fpage') or \
- item.get('journal_lpage') or \
- item.get('journal_artid') or \
- item.get('journal_doctype')
-
-
-def filter_fields(item, keys):
- """Filter away keys."""
- for key in keys:
- item.pop(key, None)
-
-
class FftFilesPipeline(FilesPipeline):
"""Download all the FFT files provided by record."""
@@ -57,10 +38,10 @@ def __init__(self, *args, **kwargs):
def get_media_requests(self, item, info):
"""Download FFT files using FTP."""
if item.get('file_urls'):
- for fft_url in item.get('file_urls'):
+ for fft_url in item.file_urls:
yield Request(
url=fft_url,
- meta=item['ftp_params'],
+ meta=item.ftp_params,
)
def item_completed(self, results, item, info):
@@ -80,7 +61,7 @@ def _get_absolute_local_file_path(path):
get_file_name_from_url(result_data['url'])
] = _get_absolute_local_file_path(result_data['path'])
- item['file_paths'] = map_file_names_paths
+ item.file_paths = map_file_names_paths
return item
@@ -95,92 +76,18 @@ def open_spider(self, spider):
self.results_data = []
def _post_enhance_item(self, item, spider):
- def _normalize_hepcrawl_record(item, source):
- if 'related_article_doi' in item:
- item['dois'] += item.pop('related_article_doi', [])
-
- item['titles'] = [{
- 'title': item.pop('title', ''),
- 'subtitle': item.pop('subtitle', ''),
- 'source': source,
- }]
-
- item['abstracts'] = [{
- 'value': item.pop('abstract', ''),
- 'source': source,
- }]
-
- item['imprints'] = [{
- 'date': item.pop('date_published', ''),
- }]
-
- item['copyright'] = [{
- 'holder': item.pop('copyright_holder', ''),
- 'year': item.pop('copyright_year', ''),
- 'statement': item.pop('copyright_statement', ''),
- 'material': item.pop('copyright_material', ''),
- }]
-
- if has_publication_info(item):
- item['publication_info'] = [{
- 'journal_title': item.pop('journal_title', ''),
- 'journal_volume': item.pop('journal_volume', ''),
- 'journal_issue': item.pop('journal_issue', ''),
- 'artid': item.pop('journal_artid', ''),
- 'page_start': item.pop('journal_fpage', ''),
- 'page_end': item.pop('journal_lpage', ''),
- 'note': item.pop('journal_doctype', ''),
- 'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
- 'pubinfo_material': item.pop('pubinfo_material', ''),
- }]
- if item.get('journal_year'):
- item['publication_info'][0]['year'] = int(
- item.pop('journal_year')
- )
-
- # Remove any fields
- filter_fields(item, [
- 'journal_title',
- 'journal_volume',
- 'journal_year',
- 'journal_issue',
- 'journal_fpage',
- 'journal_lpage',
- 'journal_doctype',
- 'journal_artid',
- 'pubinfo_freetext',
- 'pubinfo_material',
- ])
-
- return item
-
- fft_file_paths = item.get('file_paths')
- item_format = item.get('format', 'hepcrawl')
- item = item.get('record_item') if item.get('record_item') else item
- item = self._generate_record_meta(item, spider)
+ fft_file_paths = item.file_paths
+ item_format = item.item_format
+ item = item.item if item.item else item
source = spider.name
- if item_format != 'hep':
- item = _normalize_hepcrawl_record(
- item=item,
- source=source,
- )
-
return to_hep(
item=item,
+ source=source,
item_format=item_format,
fft_file_paths=fft_file_paths,
)
- def _generate_record_meta(self, json_record, spider):
- json_record['acquisition_source'] = {
- 'source': spider.name,
- 'method': 'hepcrawl',
- 'datetime': datetime.datetime.now().isoformat(),
- 'submission_number': os.environ.get('SCRAPY_JOB', ''),
- }
- return json_record
-
def process_item(self, item, spider):
"""Convert internal format to INSPIRE data model."""
self.count += 1
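And a sketch of the name-to-path index that `FftFilesPipeline.item_completed` builds before the item reaches `to_hep`. The `(ok, result_data)` tuples follow the shape Scrapy's `FilesPipeline` reports; the URLs, store path, and `files_store` value here are made up:

```python
import os


def get_file_name_from_url(url):
    return url.rsplit('/', 1)[-1]


# (ok, result_data) pairs in the shape Scrapy's FilesPipeline reports.
results = [
    (True, {'url': 'ftp://example.org/DESY/FFT/test_fft_1.txt',
            'path': 'full/809d9d2b.txt'}),
    (False, {'url': 'ftp://example.org/DESY/FFT/broken.txt',
             'path': None}),
]
files_store = '/tmp/file_urls'  # illustrative FILES_STORE setting

map_file_names_paths = {}
for ok, result_data in results:
    if ok:  # failed downloads never enter the index
        map_file_names_paths[
            get_file_name_from_url(result_data['url'])
        ] = os.path.abspath(os.path.join(files_store, result_data['path']))

print(map_file_names_paths)
# {'test_fft_1.txt': '/tmp/file_urls/full/809d9d2b.txt'}
```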
diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py
index 2ab883f3..ab151fa6 100644
--- a/hepcrawl/spiders/alpha_spider.py
+++ b/hepcrawl/spiders/alpha_spider.py
@@ -20,7 +20,10 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import has_numbers
+from hepcrawl.utils import (
+ has_numbers,
+ ParsedItem,
+)
class AlphaSpider(CrawlSpider):
@@ -145,4 +148,9 @@ def parse(self, response):
record.add_value('source', 'Alpha experiment')
record.add_value('collections', ['HEP', 'THESIS'])
- yield record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py
index 496e2e8e..d15c690a 100644
--- a/hepcrawl/spiders/aps_spider.py
+++ b/hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,12 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_licenses, get_nested, build_dict
+from hepcrawl.utils import (
+ get_licenses,
+ get_nested,
+ build_dict,
+ ParsedItem,
+)
class APSSpider(Spider):
@@ -110,7 +115,13 @@ def parse(self, response):
record.add_value('license', license)
record.add_value('collections', ['HEP', 'Citeable', 'Published'])
- yield record.load_item()
+
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
# Pagination support. Will yield until no more "next" pages are found
if 'Link' in response.headers:
diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py
index d82c8318..8ab0af4f 100644
--- a/hepcrawl/spiders/arxiv_spider.py
+++ b/hepcrawl/spiders/arxiv_spider.py
@@ -17,7 +17,12 @@
from scrapy.spiders import XMLFeedSpider
from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
-from ..utils import coll_cleanforthe, get_licenses, split_fullname
+from hepcrawl.utils import (
+ coll_cleanforthe,
+ get_licenses,
+ split_fullname,
+ ParsedItem,
+)
from ..items import HEPRecord
from ..loaders import HEPLoader
@@ -110,8 +115,12 @@ def parse_node(self, response, node):
)
record.add_value('license', license)
- parsed_record = dict(record.load_item())
- return parsed_record
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
def _get_authors_or_collaboration(self, node):
"""Parse authors, affiliations; extract collaboration"""
diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py
index 5eb22eb7..ee3a7d47 100644
--- a/hepcrawl/spiders/base_spider.py
+++ b/hepcrawl/spiders/base_spider.py
@@ -18,7 +18,12 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from hepcrawl.utils import (
+ get_mime_type,
+ parse_domain,
+ get_node,
+ ParsedItem,
+)
class BaseSpider(XMLFeedSpider):
@@ -192,7 +197,13 @@ def build_item(self, response):
record.add_value("authors", self.get_authors(node))
record.add_value('thesis', {'degree_type': 'PhD'})
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
def scrape_for_pdf(self, response):
"""Scrape splash page for any links to PDFs.
diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py
index 6c881252..3581ee1f 100644
--- a/hepcrawl/spiders/brown_spider.py
+++ b/hepcrawl/spiders/brown_spider.py
@@ -21,7 +21,12 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import split_fullname, parse_domain, get_mime_type
+from hepcrawl.utils import (
+ split_fullname,
+ parse_domain,
+ get_mime_type,
+ ParsedItem,
+)
class BrownSpider(CrawlSpider):
@@ -219,4 +224,9 @@ def build_item(self, response):
record.add_value('thesis', response.meta.get("thesis"))
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 76886b23..5ec79da8 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -24,6 +24,9 @@
from hepcrawl.utils import (
ftp_list_files,
ftp_connection_info,
+ get_absolute_file_path,
+ get_file_name_from_url,
+ ParsedItem,
)
@@ -127,38 +130,29 @@ def parse(self, response):
else:
prefix_url = '{0}://{1}'.format(
'file',
- '/code/tests/functional/desy/fixtures/ftp_server/',
- # Temporary - normally the absolute path of fft_link upgrade schemas
+ '/code/tests/functional/desy/fixtures/ftp_server/', # Temporary - Must be absolute path
)
- # prefix_url = '{0}://'.format('file')
marcxml_records = self._get_marcxml_records(response.body)
- hep_records = self._json_records_from_marcxml(marcxml_records)
-
- # list_fft_old_links = [] # Enable after supporting FFT 2 dojson
- list_fft_old_links = [
- {
- 'path': 'FFT/test_fft_1.txt',
- },
- {
- 'path': 'FFT/test_fft_2.txt',
- },
- ] # Temporary
+ hep_records = self._hep_records_from_marcxml(marcxml_records)
+
+ list_fft_old_links = []
for hep_record in hep_records:
- hep_record['_fft'] = list_fft_old_links # Temporary
- # list_fft_old_links.extend(json_record['_fft']) # Enable after supporting FFT 2 dojson
+ list_fft_old_links.extend(hep_record['_fft'])
list_file_urls = [
'{0}{1}'.format(prefix_url, fft_link['path'])
for fft_link in hep_record['_fft']
]
- yield {
- 'record_item': hep_record,
- 'file_urls': list_file_urls,
- 'ftp_params': ftp_params,
- 'format': 'hep',
- }
+ parsed_item = ParsedItem(
+ item=hep_record,
+ file_urls=list_file_urls,
+ ftp_params=ftp_params,
+ item_format='hep',
+ )
+
+ yield parsed_item
def handle_package_ftp(self, response):
"""Yield every XML file found."""
@@ -177,7 +171,7 @@ def _get_marcxml_records(self, response_body):
return [etree.tostring(item) for item in list_items]
- def _json_records_from_marcxml(self, list_marcxml_records):
+ def _hep_records_from_marcxml(self, list_marcxml_records):
def _create_json_record(str_xml_record):
object_record = create_record(etree.XML(str_xml_record))
dojson_record = hep.do(object_record)
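For reference, the URL assembly in the rewritten `parse` boils down to the following; the record content is illustrative, and the prefix is the temporary `file://` fallback shown above (the FTP branch builds an `ftp://` prefix instead):

```python
hep_record = {
    '_fft': [
        {'path': 'DESY/FFT/test_fft_1.txt'},
        {'path': 'DESY/FFT/test_fft_2.txt'},
    ],
}
prefix_url = '{0}://{1}'.format(
    'file',
    '/code/tests/functional/desy/fixtures/ftp_server/',
)

# One download URL per FFT entry of the record.
list_file_urls = [
    '{0}{1}'.format(prefix_url, fft_link['path'])
    for fft_link in hep_record['_fft']
]
print(list_file_urls[0])
# file:///code/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
```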
diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py
index 3ac8b901..3dd50b59 100644
--- a/hepcrawl/spiders/dnb_spider.py
+++ b/hepcrawl/spiders/dnb_spider.py
@@ -16,7 +16,12 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from hepcrawl.utils import (
+ get_mime_type,
+ parse_domain,
+ get_node,
+ ParsedItem,
+)
class DNBSpider(XMLFeedSpider):
@@ -219,4 +224,10 @@ def build_item(self, response):
record.add_value('thesis', {'degree_type': 'PhD'})
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index beea699d..cfb59af5 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -22,7 +22,7 @@
from ..extractors.jats import Jats
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
ftp_list_files,
ftp_connection_info,
get_first,
@@ -30,6 +30,7 @@
get_licenses,
get_node,
parse_domain,
+ ParsedItem,
)
@@ -318,7 +319,12 @@ def build_item_rich(self, response):
)
record.add_value("urls", response.meta.get("urls"))
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
def build_item_jats(self, response):
"""Build the final HEPRecord with JATS-format XML ('jp')."""
@@ -388,7 +394,12 @@ def build_item_jats(self, response):
references = self._get_references(node)
record.add_value("references", references)
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
def _get_references(self, node):
"""Get the references."""
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index c9aacc00..78fdd5fd 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -25,12 +25,13 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
get_first,
get_licenses,
has_numbers,
range_as_string,
unzip_xml_files,
+ ParsedItem,
)
from ..dateutils import format_year
@@ -1034,4 +1035,9 @@ def build_item(self, response):
record.add_value('collections', self.get_collections(doctype))
record.add_value('references', self.get_references(node))
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py
index 941a3674..37871f3a 100644
--- a/hepcrawl/spiders/hindawi_spider.py
+++ b/hepcrawl/spiders/hindawi_spider.py
@@ -16,7 +16,10 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_licenses
+from hepcrawl.utils import (
+ get_licenses,
+ ParsedItem,
+)
class HindawiSpider(XMLFeedSpider):
@@ -222,4 +225,9 @@ def parse_node(self, response, node):
record.add_xpath('source',
"./datafield[@tag='260']/subfield[@code='b']/text()")
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py
index 2e970c1c..579ac65b 100644
--- a/hepcrawl/spiders/infn_spider.py
+++ b/hepcrawl/spiders/infn_spider.py
@@ -21,8 +21,10 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_temporary_file
-
+from hepcrawl.utils import (
+ get_temporary_file,
+ ParsedItem,
+)
from ..dateutils import format_date
@@ -240,4 +242,9 @@ def build_item(self, response):
record.add_value('source', 'INFN')
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py
index 0e3bae65..90c7809f 100644
--- a/hepcrawl/spiders/iop_spider.py
+++ b/hepcrawl/spiders/iop_spider.py
@@ -23,6 +23,7 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
+from hepcrawl.utils import ParsedItem
class IOPSpider(XMLFeedSpider, NLM):
@@ -222,4 +223,9 @@ def parse_node(self, response, node):
record.add_value("additional_files",
self.add_fft_file(pdf_file_path, file_access, file_type))
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
index 77bf7948..1c83c829 100644
--- a/hepcrawl/spiders/magic_spider.py
+++ b/hepcrawl/spiders/magic_spider.py
@@ -18,7 +18,10 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import split_fullname
+from hepcrawl.utils import (
+ split_fullname,
+ ParsedItem,
+)
class MagicSpider(XMLFeedSpider):
@@ -176,4 +179,9 @@ def build_item(self, response):
record.add_value("additional_files", response.meta.get("files"))
record.add_value('collections', ['HEP', 'THESIS'])
- yield record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
index c71234f9..4e099348 100644
--- a/hepcrawl/spiders/mit_spider.py
+++ b/hepcrawl/spiders/mit_spider.py
@@ -23,7 +23,11 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_temporary_file, split_fullname
+from hepcrawl.utils import (
+ get_temporary_file,
+ split_fullname,
+ ParsedItem,
+)
class MITSpider(XMLFeedSpider):
@@ -223,4 +227,9 @@ def build_item(self, response):
record.add_value('page_nr', self.get_page_nr(node))
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py
index 7200664e..95bc874a 100644
--- a/hepcrawl/spiders/phenix_spider.py
+++ b/hepcrawl/spiders/phenix_spider.py
@@ -18,6 +18,7 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
+from hepcrawl.utils import ParsedItem
class PhenixSpider(XMLFeedSpider):
@@ -128,4 +129,9 @@ def parse_node(self, response, node):
record.add_value('source', 'PHENIX')
record.add_value('collections', ['HEP', 'THESIS'])
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py
index 101b1163..8a486292 100644
--- a/hepcrawl/spiders/phil_spider.py
+++ b/hepcrawl/spiders/phil_spider.py
@@ -19,7 +19,11 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import parse_domain, get_mime_type
+from hepcrawl.utils import (
+ parse_domain,
+ get_mime_type,
+ ParsedItem,
+)
class PhilSpider(CrawlSpider):
@@ -160,4 +164,9 @@ def build_item(self, response):
if not jsonrecord.get('year') == "forthcoming":
record.add_value('journal_year', int(jsonrecord['year']))
- return record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ return parsed_item
diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py
index 7d3fb87d..875dbb5e 100644
--- a/hepcrawl/spiders/pos_spider.py
+++ b/hepcrawl/spiders/pos_spider.py
@@ -13,10 +13,16 @@
import re
+from urlparse import urljoin
+
from scrapy import Request, Selector
from scrapy.spiders import Spider
-from urlparse import urljoin
-from ..utils import get_licenses, get_first
+
+from hepcrawl.utils import (
+ get_licenses,
+ get_first,
+ ParsedItem,
+)
from ..dateutils import create_valid_date
from ..items import HEPRecord
from ..loaders import HEPLoader
@@ -128,7 +134,13 @@ def build_item(self, response):
record.add_value('extra_data', extra_data)
record.add_value('collections', ['HEP', 'ConferencePaper'])
- return record.load_item()
+
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
def _get_ext_systems_number(self, node):
return [
diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py
index 661f0bec..97ae8202 100644
--- a/hepcrawl/spiders/t2k_spider.py
+++ b/hepcrawl/spiders/t2k_spider.py
@@ -18,7 +18,10 @@
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import split_fullname
+from hepcrawl.utils import (
+ split_fullname,
+ ParsedItem,
+)
class T2kSpider(XMLFeedSpider):
@@ -164,4 +167,9 @@ def build_item(self, response):
record.add_value("additional_files", response.meta.get("additional_files"))
record.add_value('collections', ['HEP', 'THESIS'])
- yield record.load_item()
+ parsed_item = ParsedItem(
+ item=record.load_item(),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index bef40a72..49b5b725 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -20,12 +20,13 @@
from ..extractors.jats import Jats
from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import (
+from hepcrawl.utils import (
ftp_list_files,
ftp_connection_info,
local_list_files,
get_licenses,
unzip_xml_files,
+ ParsedItem,
)
@@ -148,7 +149,7 @@ def parse_node(self, response, node):
self.log("Got article_type {0}".format(article_type))
if article_type is None or article_type[0] not in self.allowed_article_types:
# Filter out non-interesting article types
- return None
+ return
record = HEPLoader(item=HEPRecord(), selector=node, response=response)
if article_type in ['correction',
@@ -203,9 +204,13 @@ def parse_node(self, response, node):
record.add_value('license', license)
record.add_value('collections', self._get_collections(node, article_type, journal_title))
- parsed_record = dict(record.load_item())
- return parsed_record
+ parsed_item = ParsedItem(
+ item=dict(record.load_item()),
+ item_format='hepcrawl',
+ )
+
+ yield parsed_item
def _get_collections(self, node, article_type, current_journal_title):
"""Return this articles' collection."""
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index a212593e..71ff3aa6 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -342,3 +342,35 @@ def get_license_by_text(license_text):
def get_file_name_from_url(url):
return url.rsplit('/', 1)[-1]
+
+
+def get_absolute_file_path(file_path):
+ """Returns the absolute path of a relative path."""
+ return os.path.abspath(file_path)
+
+
+class ParsedItem(dict):
+ """Generate interface to communicate Spider-Pipelines"""
+ def __init__(
+ self,
+ item,
+ file_urls=None,
+ item_format=None,
+ ftp_params=None,
+ file_paths=None,
+ **kwargs
+ ):
+ super(ParsedItem, self).__init__(
+ item=item,
+ file_urls=file_urls,
+ item_format=item_format,
+ ftp_params=ftp_params,
+ file_paths=file_paths,
+ **kwargs
+ )
+        # Attr-dict pattern: point ``__dict__`` at the dict itself so
+        # the keys set in ``__init__`` (item, file_urls, item_format,
+        # ftp_params, file_paths) are also readable and writable as
+        # attributes, e.g. ``item.item_format`` or
+        # ``item.file_paths = ...``.
+        self.__dict__ = self
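A quick self-contained check of the attr-dict behaviour the pipelines rely on (`item.item_format`, `item.file_urls`, `item.file_paths = ...`), using a stand-in class with the same `__init__` as above:

```python
class ParsedItem(dict):
    """Stand-in matching the class above: dict keys double as attributes."""
    def __init__(self, item, file_urls=None, item_format=None,
                 ftp_params=None, file_paths=None, **kwargs):
        super(ParsedItem, self).__init__(
            item=item, file_urls=file_urls, item_format=item_format,
            ftp_params=ftp_params, file_paths=file_paths, **kwargs)
        # Keys set above become attributes; attribute writes land
        # back in the dict.
        self.__dict__ = self


parsed = ParsedItem(item={'titles': []}, item_format='hep')
assert parsed.item_format == parsed['item_format'] == 'hep'
parsed.file_paths = {'test_fft_1.txt': '/tmp/f.txt'}
assert parsed['file_paths'] == parsed.file_paths
```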
diff --git a/tests/functional/desy/fixtures/desy_ftp_records.json b/tests/functional/desy/fixtures/desy_ftp_records.json
index 6a9b6c62..f685a254 100644
--- a/tests/functional/desy/fixtures/desy_ftp_records.json
+++ b/tests/functional/desy/fixtures/desy_ftp_records.json
@@ -10,10 +10,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 111111,
@@ -76,10 +87,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 222222,
@@ -142,10 +164,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 333333,
@@ -208,10 +241,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/589091f319277bfc3316338b4123b215cee402db.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d7b1ef2d316488d23a4d66865eca3f686e29a27b.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 444444,
diff --git a/tests/functional/desy/fixtures/desy_local_records.json b/tests/functional/desy/fixtures/desy_local_records.json
index 57d780e9..6fe2c4d0 100644
--- a/tests/functional/desy/fixtures/desy_local_records.json
+++ b/tests/functional/desy/fixtures/desy_local_records.json
@@ -10,10 +10,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 111111,
@@ -76,10 +87,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 222222,
@@ -142,10 +164,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 333333,
@@ -208,10 +241,21 @@
],
"_fft": [
{
- "path": "/tmp/file_urls/full/bc8e08681ec71885835e07aab1243b0dccf08f1d.txt"
+ "version": 1,
+      "creation_datetime": "2017-06-27T09:43:17",
+      "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt",
+ "type": "Main",
+ "filename": "cNFW_rogue_curves"
},
{
- "path": "/tmp/file_urls/full/d341fc9296aafc16c169492c9cd2f80c19df6d9c.txt"
+ "version": 1,
+ "creation_datetime": "2017-06-27T09:43:16",
+ "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.",
+ "format": ".txt",
+ "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt",
+ "type": "Main",
+ "filename": "scalingRelations_DutBeh_DC14_all_Oh"
}
],
"control_number": 444444,
diff --git a/tests/functional/desy/fixtures/ftp_server/FFT/test_fft_1.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
similarity index 100%
rename from tests/functional/desy/fixtures/ftp_server/FFT/test_fft_1.txt
rename to tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
diff --git a/tests/functional/desy/fixtures/ftp_server/FFT/test_fft_2.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt
similarity index 100%
rename from tests/functional/desy/fixtures/ftp_server/FFT/test_fft_2.txt
rename to tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
index 5a57f51c..6900d746 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
@@ -50,6 +50,28 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
222222
@@ -101,5 +123,27 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
\ No newline at end of file
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
index 44266cd4..2067b5e7 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
@@ -50,6 +50,28 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
444444
@@ -101,5 +123,27 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
\ No newline at end of file
diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py
index 2b4d8c71..321766f9 100644
--- a/tests/functional/desy/test_desy.py
+++ b/tests/functional/desy/test_desy.py
@@ -51,6 +51,7 @@ def get_fft_1_path():
'desy',
'fixtures',
'ftp_server',
+ 'DESY',
'FFT',
'test_fft_1.txt',
test_suite='functional',
@@ -63,6 +64,7 @@ def get_fft_2_path():
'desy',
'fixtures',
'ftp_server',
+ 'DESY',
'FFT',
'test_fft_2.txt',
test_suite='functional',
@@ -159,8 +161,8 @@ def test_desy_ftp(
for record in expected_results:
fft_file_paths = sorted(record['_fft'])
- assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
- assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
+ assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path)
+ assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path)
@pytest.mark.parametrize(
@@ -205,6 +207,6 @@ def test_desy_local_package_path(
for record in expected_results:
fft_file_paths = sorted(record['_fft'])
- assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
- assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
-
+ assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path)
+ assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path)
diff --git a/tests/unit/responses/desy/desy_collection_records.xml b/tests/unit/responses/desy/desy_collection_records.xml
index 5a57f51c..d2086694 100644
--- a/tests/unit/responses/desy/desy_collection_records.xml
+++ b/tests/unit/responses/desy/desy_collection_records.xml
@@ -50,6 +50,28 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt;1</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt;1</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
222222
@@ -100,6 +122,28 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt;1</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt;1</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
\ No newline at end of file
diff --git a/tests/unit/responses/desy/desy_record.xml b/tests/unit/responses/desy/desy_record.xml
index 8219064f..437c45b3 100644
--- a/tests/unit/responses/desy/desy_record.xml
+++ b/tests/unit/responses/desy/desy_record.xml
@@ -50,5 +50,27 @@
oai:inspirehep.net:1608652
INSPIRE:HEP
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_1.txt;1</subfield>
+        <subfield code="d">00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">cNFW_rogue_curves</subfield>
+        <subfield code="s">2017-06-27 09:43:17</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
+      <datafield tag="FFT" ind1=" " ind2=" ">
+        <subfield code="a">DESY/FFT/test_fft_2.txt;1</subfield>
+        <subfield code="d">00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.</subfield>
+        <subfield code="f">.txt</subfield>
+        <subfield code="n">scalingRelations_DutBeh_DC14_all_Oh</subfield>
+        <subfield code="s">2017-06-27 09:43:16</subfield>
+        <subfield code="t">Main</subfield>
+        <subfield code="v">1</subfield>
+      </datafield>
\ No newline at end of file
diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py
index eef140b1..ad8f3f03 100644
--- a/tests/unit/test_alpha.py
+++ b/tests/unit/test_alpha.py
@@ -20,13 +20,15 @@
def results():
"""Return results generator from the Alpha spider."""
spider = alpha_spider.AlphaSpider()
- records = list(
+ parsed_items = list(
spider.parse(
fake_response_from_file('alpha/test_1.htm')
)
)
+ records = [parsed_item.item for parsed_item in parsed_items]
assert records
+
return records
diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py
index eb53269d..8bc66033 100644
--- a/tests/unit/test_aps.py
+++ b/tests/unit/test_aps.py
@@ -21,7 +21,7 @@ def results():
from scrapy.http import TextResponse
spider = aps_spider.APSSpider()
- records = list(
+ parsed_items = list(
spider.parse(
fake_response_from_file(
'aps/aps_single_response.json',
@@ -30,6 +30,8 @@ def results():
)
)
+ records = [parsed_item.item for parsed_item in parsed_items]
+
assert records
return records
diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py
index d395c494..47ea20db 100644
--- a/tests/unit/test_arxiv_all.py
+++ b/tests/unit/test_arxiv_all.py
@@ -35,7 +35,7 @@ def _get_processed_item(record, spider):
item = pipeline.process_item(record, spider)
return item
- records = list(
+ parsed_items = list(
spider.parse(
fake_response_from_file(
'arxiv/sample_arxiv_record.xml',
@@ -44,10 +44,10 @@ def _get_processed_item(record, spider):
)
)
- assert records
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
- return [_get_processed_item(record, spider) for record in records]
+
+ return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items]
def test_page_nr(many_results):
diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py
index b7ca410d..d8e6f9e5 100644
--- a/tests/unit/test_arxiv_single.py
+++ b/tests/unit/test_arxiv_single.py
@@ -31,7 +31,7 @@ def _get_processed_item(record, spider):
crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
- records = list(
+ parsed_items = list(
spider.parse(
fake_response_from_file(
'arxiv/sample_arxiv_record0.xml',
@@ -40,11 +40,10 @@ def _get_processed_item(record, spider):
)
)
- assert records
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
- return [_get_processed_item(record, spider) for record in records]
+ return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items]
diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py
index cc6ef093..48551cdf 100644
--- a/tests/unit/test_base.py
+++ b/tests/unit/test_base.py
@@ -38,9 +38,11 @@ def record():
nodes = selector.xpath('.//%s' % spider.itertag)
response.meta["record"] = nodes[0].extract()
response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
- parsed_record = spider.build_item(response)
- assert parsed_record
- return parsed_record
+
+ parsed_item = spider.build_item(response)
+ assert parsed_item
+
+ return parsed_item.item
@pytest.fixture
@@ -169,7 +171,10 @@ def splash():
'Content-Type': 'text/html',
},
)
- return spider.scrape_for_pdf(splash_response)
+
+ parsed_item = spider.scrape_for_pdf(splash_response)
+
+ return parsed_item.item
def test_splash(splash):
@@ -201,7 +206,10 @@ def parsed_node():
response = fake_response_from_string(text=body)
node = get_node(spider, 'OAI-PMH:record', text=body)
response.meta["record"] = node[0].extract()
- return spider.parse_node(response, node[0])
+
+ parsed_item = spider.parse_node(response, node[0])
+
+ return parsed_item.item
def test_parsed_node(parsed_node):
diff --git a/tests/unit/test_brown.py b/tests/unit/test_brown.py
index 0b42b4df..41e3902d 100644
--- a/tests/unit/test_brown.py
+++ b/tests/unit/test_brown.py
@@ -41,10 +41,12 @@ def record():
splash_response = fake_response_from_file('brown/test_splash.html')
splash_response.meta["jsonrecord"] = jsonrecord
- parsed_record = spider.scrape_splash(splash_response)
+ iter_item = spider.scrape_splash(splash_response)
- assert parsed_record
- return parsed_record
+ parsed_item = iter_item.next()
+ assert parsed_item
+
+ return parsed_item.item
@pytest.fixture
@@ -200,7 +202,12 @@ def parsed_node_no_splash():
jsonrecord = jsonresponse["items"]["docs"][0]
response.meta["jsonrecord"] = jsonrecord
- return spider.parse(response).next()
+ iter_item = spider.parse(response).next()
+
+ parsed_item = iter_item.next()
+ assert parsed_item
+
+ return parsed_item.item
def test_no_splash(parsed_node_no_splash):
diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py
index 73999356..5b01f7fd 100644
--- a/tests/unit/test_desy.py
+++ b/tests/unit/test_desy.py
@@ -82,10 +82,34 @@ def test_pipeline_record(generated_record):
],
'_fft': [
{
- 'path': 'FFT/test_fft_1.txt'
+ 'creation_datetime': '2017-06-27T09:43:17',
+ 'description': '00013 Decomposition of the problematic rotation curves in our '
+ 'sample according to the best-fit \\textsc{core}NFW models. '
+ 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+ 'filename': 'cNFW_rogue_curves',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_1.txt;1',
+ 'type': 'Main',
+ 'version': 1,
},
{
- 'path': 'FFT/test_fft_2.txt'
+ 'creation_datetime': '2017-06-27T09:43:16',
+ 'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+ 'the cosmological halo mass-concentration relation from \\'
+ 'cite{dutton14} (left) and the stellar mass-halo mass relation '
+ 'from \\cite{behroozi13} (right). The error bars correspond to the '
+ 'extremal values of the multidimensional 68\\% confidence region '
+ 'for each fit. The theoretical relations are shown as red lines '
+ 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+ 'the dark and light grey bands, respectively. The '
+ 'mass-concentration relation from \\cite{maccio08} and the stellar'
+ ' mass-halo mass relation from \\cite{behroozi13} are also shown '
+ 'as the black dashed lines.',
+ 'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_2.txt;1',
+ 'type': 'Main',
+ 'version': 1
}
],
'abstracts': [
@@ -182,11 +206,35 @@ def test_pipeline_collection_records(generated_records):
],
"_fft": [
{
- "path": "FFT/test_fft_1.txt"
+ 'creation_datetime': '2017-06-27T09:43:17',
+ 'description': '00013 Decomposition of the problematic rotation curves in our '
+ 'sample according to the best-fit \\textsc{core}NFW models. '
+ 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+ 'filename': 'cNFW_rogue_curves',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_1.txt;1',
+ 'type': 'Main',
+ 'version': 1,
},
{
- "path": "FFT/test_fft_2.txt"
- },
+ 'creation_datetime': '2017-06-27T09:43:16',
+ 'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+ 'the cosmological halo mass-concentration relation from '
+ '\\cite{dutton14} (left) and the stellar mass-halo mass relation '
+ 'from \\cite{behroozi13} (right). The error bars correspond to the '
+ 'extremal values of the multidimensional 68\\% confidence region '
+ 'for each fit. The theoretical relations are shown as red lines '
+ 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+ 'the dark and light grey bands, respectively. The '
+ 'mass-concentration relation from \\cite{maccio08} and the stellar '
+ 'mass-halo mass relation from \\cite{behroozi13} are also shown '
+ 'as the black dashed lines.',
+ 'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_2.txt;1',
+ 'type': 'Main',
+ 'version': 1,
+ }
],
"control_number": 111111,
"public_notes": [
@@ -248,11 +296,35 @@ def test_pipeline_collection_records(generated_records):
],
"_fft": [
{
- "path": "FFT/test_fft_1.txt"
+ 'creation_datetime': '2017-06-27T09:43:17',
+ 'description': '00013 Decomposition of the problematic rotation curves in our '
+ 'sample according to the best-fit \\textsc{core}NFW models. '
+ 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+ 'filename': 'cNFW_rogue_curves',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_1.txt;1',
+ 'type': 'Main',
+ 'version': 1,
},
{
- "path": "FFT/test_fft_2.txt"
- },
+ 'creation_datetime': '2017-06-27T09:43:16',
+ 'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
+ 'the cosmological halo mass-concentration relation from '
+ '\\cite{dutton14} (left) and the stellar mass-halo mass relation '
+ 'from \\cite{behroozi13} (right). The error bars correspond to the '
+ 'extremal values of the multidimensional 68\\% confidence region '
+ 'for each fit. The theoretical relations are shown as red lines '
+ 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
+ 'the dark and light grey bands, respectively. The '
+ 'mass-concentration relation from \\cite{maccio08} and the stellar '
+ 'mass-halo mass relation from \\cite{behroozi13} are also shown '
+ 'as the black dashed lines.',
+ 'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
+ 'format': '.txt',
+ 'path': 'DESY/FFT/test_fft_2.txt;1',
+ 'type': 'Main',
+ 'version': 1,
+ }
],
"control_number": 222222,
"public_notes": [
diff --git a/tests/unit/test_dnb.py b/tests/unit/test_dnb.py
index b00aff3d..a1a22dbd 100644
--- a/tests/unit/test_dnb.py
+++ b/tests/unit/test_dnb.py
@@ -72,7 +72,11 @@ def record(scrape_pos_page_body):
body=scrape_pos_page_body,
**{'encoding': 'utf-8'}
)
- return request.callback(response)
+
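+ # this callback returns the parsed wrapper directly; .item holds the record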
+ parsed_item = request.callback(response)
+ assert parsed_item
+
+ return parsed_item.item
def test_title(record):
@@ -241,7 +245,9 @@ def parse_without_splash():
'Content-Type': 'application/pdf;charset=base64',
}
)
- return spider.parse_node(response, nodes[0])
+
+ parsed_item = spider.parse_node(response, nodes[0])
+ return parsed_item.item
def test_parse_without_splash(parse_without_splash):
diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
index cc7885bd..7e841015 100644
--- a/tests/unit/test_edp.py
+++ b/tests/unit/test_edp.py
@@ -40,6 +40,7 @@ def scrape_pos_page_body():
)
)
+
@pytest.fixture
def targzfile():
"""Path to test tar.gz file with JATS XML file."""
@@ -50,6 +51,7 @@ def targzfile():
'test_gz.tar.gz'
)
+
@pytest.fixture
def package_jats(targzfile):
"""Extract tar.gz package with JATS XML file."""
@@ -75,7 +77,12 @@ def record_jats(package_jats, scrape_pos_page_body):
body=scrape_pos_page_body,
**{'encoding': 'utf-8'}
)
- return request.callback(response)
+
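+ # this callback yields items lazily, so advance the iterator before unwrapping .item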
+ iter_item = request.callback(response)
+ parsed_item = iter_item.next()
+ assert parsed_item
+
+ return parsed_item.item
@pytest.fixture
@@ -107,7 +114,11 @@ def record_rich(package_rich):
fake_resp.meta["rich"] = True
node = get_node(spider, "//EDPSArticle", fake_resp)[0]
- return spider.parse_node(fake_resp, node)
+ iter_item = spider.parse_node(fake_resp, node)
+ parsed_item = iter_item.next()
+ assert parsed_item
+
+ return parsed_item.item
def test_title(record_jats):
@@ -145,6 +156,7 @@ def test_abstract(record_jats):
assert 'abstract' in record_jats
assert record_jats['abstract'] == abstract
+
def test_date_published(record_jats):
"""Test extracting date_published."""
date_published = "2015-01-01"
@@ -179,6 +191,7 @@ def test_doi(record_jats):
assert 'dois' in record_jats
assert record_jats['dois'][0]['value'] == doi
+
def test_publication_info(record_jats):
"""Test extracting publication info."""
assert 'journal_title' in record_jats
@@ -206,7 +219,6 @@ def test_keywords(record_jats):
assert keyw["value"] in keywords
-
def test_authors(record_jats):
"""Test authors."""
authors = ["Arasoglu, Ali", "Ozdemir, Omer Faruk"]
@@ -326,7 +338,6 @@ def test_authors_rich(record_rich):
assert astr[index]["affiliations"][0]["value"] == affiliations[index]
-
def test_tarfile(tarbzfile, tmpdir):
"""Test untarring a tar.bz package with a test XML file.
@@ -343,7 +354,6 @@ def test_tarfile(tarbzfile, tmpdir):
assert "aas/xml_rich/2000/01" not in xml_files_flat[0]
-
def test_handle_package_ftp(tarbzfile):
"""Test getting the target folder name for xml files."""
spider = edp_spider.EDPSpider()
@@ -353,6 +363,7 @@ def test_handle_package_ftp(tarbzfile):
assert isinstance(request, Request)
assert request.meta["package_path"] == tarbzfile
+
def test_no_dois_jats():
"""Test parsing when no DOI in record. JATS format."""
spider = edp_spider.EDPSpider()
@@ -370,7 +381,10 @@ def test_no_dois_jats():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
assert "dois" not in record
assert "additional_files" not in record
@@ -390,7 +404,10 @@ def test_no_dois_rich():
response = fake_response_from_string(body)
response.meta["rich"] = True
node = get_node(spider, "//EDPSArticle", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
assert "dois" not in record
assert "additional_files" not in record
@@ -416,7 +433,10 @@ def test_addendum_jats():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
assert "related_article_doi" in record
assert record["related_article_doi"][0][
@@ -439,7 +459,10 @@ def test_author_with_email():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
assert 'email' in record['authors'][0]
assert record['authors'][0]['email'] == "Fname.Sname@university.org"
@@ -472,7 +495,10 @@ def test_aff_with_email():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
assert 'affiliations' in record['authors'][0]
@@ -481,8 +507,6 @@ def test_aff_with_email():
assert record['authors'][0]['email'] is None
-
-
def test_no_valid_article():
"""Test parsing when filtering out non-interesting article types."""
spider = edp_spider.EDPSpider()
@@ -506,7 +530,10 @@ def test_collections_review():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- record = spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+ record = parsed_item.item
assert "collections" in record
assert record["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}]
@@ -533,7 +560,11 @@ def record_references_only():
"""
response = fake_response_from_string(body)
node = get_node(spider, "//article", response)[0]
- return spider.parse_node(response, node)
+
+ iter_item = spider.parse_node(response, node)
+ parsed_item = iter_item.next()
+
+ return parsed_item.item
def test_references(record_references_only):
diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
index ca023122..109f3d3f 100644
--- a/tests/unit/test_elsevier.py
+++ b/tests/unit/test_elsevier.py
@@ -41,9 +41,11 @@ def record():
response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
tag = '//%s' % spider.itertag
nodes = get_node(spider, tag, response)
- parsed_record = spider.parse_node(response, nodes)
- assert parsed_record
- return parsed_record
+
+ parsed_item = spider.parse_node(response, nodes)
+ assert parsed_item
+
+ return parsed_item.item
@pytest.fixture(scope="module")
@@ -97,7 +99,11 @@ def parsed_node():
response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
parse_response = spider.parse_node(response, node)
parse_response.status = 404
- return spider.scrape_sciencedirect(parse_response)
+
+ parsed_item = spider.scrape_sciencedirect(parse_response)
+ assert parsed_item
+
+ return parsed_item.item
def test_collection(parsed_node):
@@ -164,7 +170,10 @@ def cover_display_date():
node = get_node(spider, '/doc', text=body)
response = fake_response_from_string(body)
- return spider.parse_node(response, node)
+ parsed_item = spider.parse_node(response, node)
+ assert parsed_item
+
+ return parsed_item.item
def test_cover_display_date(cover_display_date):
@@ -187,7 +196,10 @@ def cover_display_date_y_m():
"""
node = get_node(spider, '/doc', text=body)
response = fake_response_from_string(body)
- return spider.parse_node(response, node)
+ parsed_item = spider.parse_node(response, node)
+ assert parsed_item
+
+ return parsed_item.item
def test_cover_display_date_y_m(cover_display_date_y_m):
@@ -210,7 +222,10 @@ def cover_display_date_y():
"""
node = get_node(spider, '/doc', text=body)
response = fake_response_from_string(body)
- return spider.parse_node(response, node)
+ parsed_item = spider.parse_node(response, node)
+ assert parsed_item
+
+ return parsed_item.item
def test_cover_display_date_y(cover_display_date_y):
@@ -1644,7 +1659,11 @@ def sciencedirect():
])
response.meta["info"] = {}
response.meta["node"] = get_node(spider, '/head', text=body)
- return spider.scrape_sciencedirect(response)
+
+ parsed_item = spider.scrape_sciencedirect(response)
+ assert parsed_item
+
+ return parsed_item.item
def test_sciencedirect(sciencedirect):
diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py
index 37e5e183..84ebd06d 100644
--- a/tests/unit/test_hindawi.py
+++ b/tests/unit/test_hindawi.py
@@ -26,9 +26,11 @@ def record():
response = fake_response_from_file("hindawi/test_1.xml")
nodes = get_node(spider, "//marc:record", response)
- parsed_record = spider.parse_node(response, nodes[0])
- assert parsed_record
- return parsed_record
+ iter_item = spider.parse_node(response, nodes[0])
+ parsed_item = iter_item.next()
+ assert parsed_item
+
+ return parsed_item.item
def test_title(record):
diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py
index 0c60799a..5cd1e27d 100644
--- a/tests/unit/test_infn.py
+++ b/tests/unit/test_infn.py
@@ -28,9 +28,11 @@ def record():
"""Return scraping results from the INFN spider."""
spider = infn_spider.InfnSpider()
response = fake_response_from_file('infn/test_splash.html')
- parsed_record = spider.scrape_splash(response)
- assert parsed_record
- return parsed_record
+
+ parsed_item = spider.scrape_splash(response)
+ assert parsed_item
+
+ return parsed_item.item
def test_title(record):
@@ -121,6 +123,7 @@ def test_non_thesis():
assert record is None
+
def test_parse_node():
"""Test parse_node function. This should be a scrapy Request object.
@@ -148,6 +151,6 @@ def test_parse_node_nolink():
response = fake_response_from_file('infn/test_1_nolink.html')
selector = Selector(response, type='html')
node = selector.xpath('//%s' % spider.itertag)[0]
- record = spider.parse_node(response, node).next()
+ parsed_item = spider.parse_node(response, node).next()
- assert isinstance(record, hepcrawl.items.HEPRecord)
+ assert isinstance(parsed_item.item, hepcrawl.items.HEPRecord)
diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py
index b776adfa..fb8d26d2 100644
--- a/tests/unit/test_iop.py
+++ b/tests/unit/test_iop.py
@@ -38,9 +38,11 @@ def record():
response = fake_response_from_file('iop/xml/test_standard.xml')
node = get_node(spider, "Article", response)
spider.pdf_files = TEST_PDF_DIR
- parsed_record = spider.parse_node(response, node)
- assert parsed_record
- return parsed_record
+
+ parsed_item = spider.parse_node(response, node)
+ assert parsed_item
+
+ return parsed_item.item
def test_abstract(record):
@@ -182,10 +184,11 @@ def erratum_open_access_record():
'iop',
'pdf',
)
- parsed_record = spider.parse_node(response, node)
- assert parsed_record
- return parsed_record
+ parsed_item = spider.parse_node(response, node)
+ assert parsed_item
+
+ return parsed_item.item
def test_files_erratum_open_access_record(erratum_open_access_record):
diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py
index eeb574fe..74d9fad4 100644
--- a/tests/unit/test_magic.py
+++ b/tests/unit/test_magic.py
@@ -23,6 +23,7 @@
get_node,
)
+
@pytest.fixture
def record():
"""Return results from the MAGIC spider. First parse node, then scrape,
@@ -39,9 +40,10 @@ def record():
splash_response.meta["date"] = parsed_node.meta["date"]
splash_response.meta["urls"] = parsed_node.meta["urls"]
- parsed_record = spider.scrape_for_pdf(splash_response).next()
- assert parsed_record
- return parsed_record
+ parsed_item = spider.scrape_for_pdf(splash_response).next()
+ assert parsed_item
+
+ return parsed_item.item
def test_abstract(record):
@@ -102,7 +104,6 @@ def test_abstract(record):
assert record["abstract"] == abstract
-
def test_title(record):
"""Test extracting title."""
title = "Limits to the violation of Lorentz invariance using the emission of the CRAB pulsar at TeV energies, discovered with archival data from the MAGIC telescopes"
@@ -139,6 +140,7 @@ def test_url(record):
assert 'urls' in record
assert record['urls'][0]['value'] == url
+
def test_pdf_link(record):
"""Test pdf link(s)"""
files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf"
@@ -164,8 +166,9 @@ def test_no_author_no_date_no_url():
"""
response = fake_response_from_string(body)
node = get_node(spider, spider.itertag, text=body)
- record = spider.parse_node(response, node).next()
+ parsed_item = spider.parse_node(response, node).next()
+ record = parsed_item.item
assert isinstance(record, hepcrawl.items.HEPRecord)
assert "date" not in record
assert "authors" not in record
@@ -184,8 +187,9 @@ def test_no_aff():