diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index add53f70..3a8d3ba8 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -20,22 +20,34 @@
from inspire_schemas.api import LiteratureBuilder
-from hepcrawl.utils import get_file_name_from_url
-
-def _update_record_fft(record, index_fft_file_paths):
- def _update_fft_fields(fft_fields, index_fft_file_paths):
- new_fft_fields = []
- for fft_field in fft_fields:
- file_name = get_file_name_from_url(fft_field['path'])
- if file_name in index_fft_file_paths:
- fft_field['path'] = index_fft_file_paths[file_name]
- new_fft_fields.append(fft_field)
-
- return new_fft_fields
-
- record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
- return record
+def _get_updated_fft_fields(current_fft_fields, record_files):
+    """Replace the FFT fields' paths with the paths of the locally downloaded files.
+
+    Params:
+        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
+            expect each of them to have, at least, a key named ``path``.
+        record_files(list(RecordFile)): files attached to the record as populated by
+            ``FftFilesPipeline``.
+    """
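+    # Index the attached files by name so that each FFT ``path`` can be
+    # rewritten to point at the locally downloaded copy.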
+ record_files_index = {
+ record_file.name: record_file.path
+ for record_file in record_files
+ }
+ new_fft_fields = []
+ for fft_field in current_fft_fields:
+ file_name = os.path.basename(fft_field['path'])
+ if file_name in record_files_index:
+ fft_field['path'] = record_files_index[file_name]
+ new_fft_fields.append(fft_field)
+
+ return new_fft_fields
def _has_publication_info(item):
@@ -116,50 +128,47 @@ def _normalize_hepcrawl_record(item, source):
return item
-def _generate_acquisition_source(crawler_record, source):
- crawler_record['acquisition_source'] = {
+def _generate_acquisition_source(source):
+ acquisition_source = {
'source': source,
'method': 'hepcrawl',
'datetime': datetime.datetime.now().isoformat(),
'submission_number': os.environ.get('SCRAPY_JOB', ''),
}
- return crawler_record
+ return acquisition_source
-def to_hep(
- item,
- source,
- item_format='hepcrawl',
- fft_file_paths=None,
+def item_to_hep(
+ item,
+ source,
):
- item = _generate_acquisition_source(
- crawler_record=item,
- source=source,
- )
+ item.record['acquisition_source'] = _generate_acquisition_source(source=source)
- if item_format == 'hep':
- return hep2hep(
- crawler_record=item,
- fft_file_paths=fft_file_paths,
+ if item.record_format == 'hep':
+ return hep_to_hep(
+ hep_record=item.record,
+ record_files=item.record_files,
)
- elif item_format == 'hepcrawl':
+ elif item.record_format == 'hepcrawl':
-        item = _normalize_hepcrawl_record(
-            item=item,
+        record = _normalize_hepcrawl_record(
+            item=item.record,
             source=source,
         )
-        return crawler2hep(dict(item))
+        return hepcrawl_to_hep(dict(record))
else:
- raise Exception('Unknown item_format::{}'.format(item_format))
+ raise Exception('Unknown item_format::{}'.format(item.record_format))
-def hep2hep(crawler_record, fft_file_paths):
- if fft_file_paths:
- crawler_record = _update_record_fft(crawler_record, fft_file_paths)
+def hep_to_hep(hep_record, record_files):
+ hep_record['_fft'] = _get_updated_fft_fields(
+ current_fft_fields=hep_record['_fft'],
+ record_files=record_files,
+ )
- return crawler_record
+ return hep_record
-def crawler2hep(crawler_record):
+def hepcrawl_to_hep(crawler_record):
def _filter_affiliation(affiliations):
return [
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 05b61361..b8db5bcf 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -24,16 +24,23 @@
from inspire_schemas.utils import validate
-from hepcrawl.crawler2hep import to_hep
+from hepcrawl.crawler2hep import item_to_hep
from hepcrawl.settings import FILES_STORE
-from hepcrawl.utils import get_file_name_from_url
+from hepcrawl.utils import RecordFile
class FftFilesPipeline(FilesPipeline):
- """Download all the FFT files provided by record."""
+    """Download all the FFT files provided by the record.
- def __init__(self, *args, **kwargs):
- super(FftFilesPipeline, self).__init__(FILES_STORE)
+ Note:
+
+ This pipeline only runs if the spider returns a ``ParsedItem`` that has a ``file_urls``
+ property.
+ """
+
+ def __init__(self, store_uri, *args, **kwargs):
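+        # ``store_uri`` is normally injected by Scrapy from the ``FILES_STORE``
+        # setting; fall back to it explicitly when it is not provided.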
+ store_uri = store_uri or FILES_STORE
+ super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs)
def get_media_requests(self, item, info):
"""Download FFT files using FTP."""
@@ -44,24 +51,25 @@ def get_media_requests(self, item, info):
meta=item.ftp_params,
)
+ def get_absolute_file_path(self, path):
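+        # Paths in the download results are relative to the files store base directory.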
+ return os.path.abspath(
+ os.path.join(
+ self.store.basedir,
+ path
+ )
+ )
+
def item_completed(self, results, item, info):
"""Create a map that connects file names with downloaded files."""
- def _get_absolute_local_file_path(path):
- return os.path.abspath(
- os.path.join(
- FILES_STORE,
- path
- )
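+        # Keep only the successfully downloaded files (``ok`` is True), wrapping
+        # each one in a ``RecordFile`` named after the remote file.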
+ record_files = [
+ RecordFile(
+ path=self.get_absolute_file_path(result_data['path']),
+ name=os.path.basename(result_data['url']),
)
-
- map_file_names_paths = {}
- for ok, result_data in results:
- if ok:
- map_file_names_paths[
- get_file_name_from_url(result_data['url'])
- ] = _get_absolute_local_file_path(result_data['path'])
-
- item.file_paths = map_file_names_paths
+ for ok, result_data in results
+ if ok
+ ]
+ item.record_files = record_files
return item
@@ -76,16 +84,11 @@ def open_spider(self, spider):
self.results_data = []
def _post_enhance_item(self, item, spider):
- fft_file_paths = item.file_paths
- item_format = item.item_format
- item = item.item if item.item else item
source = spider.name
- return to_hep(
+ return item_to_hep(
item=item,
source=source,
- item_format=item_format,
- fft_file_paths=fft_file_paths,
)
def process_item(self, item, spider):
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 5ec79da8..2167d0a2 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -15,6 +15,7 @@
from lxml import etree
from dojson.contrib.marc21.utils import create_record
+from six.moves import urllib
from scrapy import Request
from scrapy.spiders import Spider
@@ -24,8 +25,6 @@
from hepcrawl.utils import (
ftp_list_files,
ftp_connection_info,
- get_absolute_file_path,
- get_file_name_from_url,
ParsedItem,
)
@@ -38,13 +37,14 @@ class DesySpider(Spider):
Examples:
To run a crawl, you need to pass FTP connection information via
- ``ftp_host`` and ``ftp_netrc``::
+    ``ftp_host`` and ``ftp_netrc``; if ``ftp_folder`` is not passed, it will fall back to
+    ``/DESY``::
$ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'
- To run a crawl on local folder, you need to pass the absolute ``package_path``::
+    To run a crawl on a local folder, you need to pass the absolute ``source_folder``::
- $ scrapy crawl desy -a 'package_path=/path/to/package_dir'
+ $ scrapy crawl desy -a 'source_folder=/path/to/package_dir'
"""
name = 'desy'
custom_settings = {}
@@ -52,118 +52,147 @@ class DesySpider(Spider):
def __init__(
self,
- package_path=None,
- ftp_folder='DESY',
+ source_folder=None,
+ ftp_folder='/DESY',
ftp_host=None,
ftp_netrc=None,
+ destination_folder='/tmp/DESY',
*args,
**kwargs
):
- """Constructor of ``Desy`` spider."""
super(DesySpider, self).__init__(*args, **kwargs)
self.ftp_folder = ftp_folder
self.ftp_host = ftp_host
self.ftp_netrc = ftp_netrc
- self.package_path = package_path
- self.target_folder = '/tmp/DESY'
+ self.source_folder = source_folder
+ self.destination_folder = destination_folder
self.ftp_enabled = True if self.ftp_host else False
- if not os.path.exists(self.target_folder):
- os.makedirs(self.target_folder)
+ if not os.path.exists(self.destination_folder):
+ os.makedirs(self.destination_folder)
+
+ @staticmethod
+ def _list_xml_files_paths(list_files_paths):
+ return [
+ xml_file
+ for xml_file in list_files_paths
+ if xml_file.endswith('.xml')
+ ]
+
+ def crawl_local_directory(self):
+ file_names = os.listdir(self.source_folder)
+ xml_file_names = self._list_xml_files_paths(file_names)
+
+ for file_name in xml_file_names:
+ file_path = os.path.join(self.source_folder, file_name)
+ self.log('Local: Try to crawl local file: {0}'.format(file_path))
+ yield Request(
+ 'file://{0}'.format(file_path),
+ callback=self.parse,
+ )
+
+ def crawl_ftp_directory(self):
+ ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+
+ remote_files_paths = ftp_list_files(
+ self.ftp_folder,
+ destination_folder=self.destination_folder,
+ ftp_host=ftp_host,
+ user=ftp_params['ftp_user'],
+ password=ftp_params['ftp_password'],
+ only_missing_files=False,
+ )
+
+ xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+
+ for remote_file in xml_remote_files_paths:
+ self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
+ remote_file = str(remote_file)
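+            # Tell the FTP download handler where to store the fetched file locally.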
+ ftp_params['ftp_local_filename'] = os.path.join(
+ self.destination_folder,
+ os.path.basename(remote_file),
+ )
+ remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
+ yield Request(
+ str(remote_url),
+ meta=ftp_params,
+ callback=self.handle_package_ftp,
+ )
+
+ def handle_package_ftp(self, response):
+ """Yield every XML file found.
+
+    This is an intermediate step before calling ``DesySpider.parse`` to handle the
+    FTP-downloaded "record collections".
+ """
+ self.log('Visited url {}'.format(response.url))
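+        # When ``ftp_local_filename`` is set, the FTP handler saves the file to
+        # that path and returns the local path as the response body.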
+ file_path = response.body
+ yield Request(
+ 'file://{0}'.format(file_path),
+ meta={'source_folder': file_path},
+ callback=self.parse,
+ )
def start_requests(self):
"""List selected folder on remote FTP and yield files."""
- def _list_xml_files_paths(list_files_paths):
- return [
- xml_file
- for xml_file in list_files_paths
- if xml_file.endswith('.xml')
- ]
-
- if self.package_path:
- file_names = os.listdir(self.package_path)
- xml_file_names = _list_xml_files_paths(file_names)
- for file_name in xml_file_names:
- file_path = os.path.join(self.package_path, file_name)
- self.log('Local: Try to crawl local file: {0}'.format(file_path))
- yield Request(
- 'file://{0}'.format(file_path),
- callback=self.parse,
- )
+ if self.source_folder:
+ requests = self.crawl_local_directory()
else:
- ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
-
- remote_files_paths = ftp_list_files(
- self.ftp_folder,
- target_folder=self.target_folder,
- server=ftp_host,
- user=ftp_params['ftp_user'],
- password=ftp_params['ftp_password'],
- lst_missing_files=False,
- )
+ requests = self.crawl_ftp_directory()
- xml_remote_files_paths = _list_xml_files_paths(remote_files_paths)
+ for request in requests:
+ yield request
- for remote_file in xml_remote_files_paths:
- self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
- remote_file = str(remote_file)
- ftp_params['ftp_local_filename'] = os.path.join(
- self.target_folder,
- os.path.basename(remote_file),
- )
- remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
- yield Request(
- str(remote_url),
- meta=ftp_params,
- callback=self.handle_package_ftp,
- )
+ @staticmethod
+ def _get_full_uri(current_path, base_url, schema, hostname=''):
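+        # FFT paths may already be absolute; otherwise resolve them against
+        # ``base_url`` (the FTP folder or the local directory of the crawled file).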
+ if os.path.isabs(current_path):
+ full_path = current_path
+ else:
+ full_path = os.path.join(base_url, current_path)
+
+ return '{schema}://{hostname}{full_path}'.format(**vars())
def parse(self, response):
- """Parse a ``Desy`` XML file into a HEP record."""
+ """Parse a ``Desy`` XML file into a ``ParsedItem``."""
+
self.log('Got record from url/path: {0}'.format(response.url))
self.log('FTP enabled: {0}'.format(self.ftp_enabled))
ftp_params = None
if self.ftp_enabled:
- ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
- prefix_url = '{0}://{1}/'.format('ftp', ftp_host)
+ hostname, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+ base_url = self.ftp_folder
+ url_schema = 'ftp'
else:
- prefix_url = '{0}://{1}'.format(
- 'file',
- '/code/tests/functional/desy/fixtures/ftp_server/', # Temporary - Must be absolute path
- )
+ base_url = os.path.dirname(urllib.parse.urlparse(response.url).path)
+ url_schema = 'file'
+            hostname = ''
marcxml_records = self._get_marcxml_records(response.body)
hep_records = self._hep_records_from_marcxml(marcxml_records)
- list_fft_old_links = []
for hep_record in hep_records:
- list_fft_old_links.extend(hep_record['_fft'])
-
list_file_urls = [
- '{0}{1}'.format(prefix_url, fft_link['path'])
- for fft_link in hep_record['_fft']
+ self._get_full_uri(
+ current_path=fft_path['path'],
+ base_url=base_url,
+ schema=url_schema,
+ hostname=hostname,
+ )
+ for fft_path in hep_record['_fft']
]
parsed_item = ParsedItem(
- item=hep_record,
+ record=hep_record,
file_urls=list_file_urls,
ftp_params=ftp_params,
- item_format='hep',
+ record_format='hep',
)
yield parsed_item
- def handle_package_ftp(self, response):
- """Yield every XML file found."""
- self.log('Visited url {}'.format(response.url))
- file_path = response.body
- yield Request(
- 'file://{0}'.format(file_path),
- meta={'package_path': file_path}
- )
-
- def _get_marcxml_records(self, response_body):
+ @staticmethod
+ def _get_marcxml_records(response_body):
root = etree.fromstring(response_body)
list_items = root.findall('.//{http://www.loc.gov/MARC21/slim}record')
if not list_items:
@@ -171,15 +200,16 @@ def _get_marcxml_records(self, response_body):
return [etree.tostring(item) for item in list_items]
- def _hep_records_from_marcxml(self, list_marcxml_records):
- def _create_json_record(str_xml_record):
- object_record = create_record(etree.XML(str_xml_record))
+ @staticmethod
+ def _hep_records_from_marcxml(marcxml_records):
+ def _create_json_record(xml_record):
+ object_record = create_record(etree.XML(xml_record))
dojson_record = hep.do(object_record)
return dojson_record
- list_hep_records = []
- for str_xml_record in list_marcxml_records:
- json_record = _create_json_record(str_xml_record)
- list_hep_records.append(json_record)
+ hep_records = []
+ for xml_record in marcxml_records:
+ json_record = _create_json_record(xml_record)
+ hep_records.append(json_record)
- return list_hep_records
+ return hep_records
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index 499e3edc..d7ed1715 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -66,11 +66,11 @@ class EDPSpider(Jats, XMLFeedSpider):
To run an ``EDPSpider`` using ``rich`` format::
- $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
+ $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
To run an ``EDPSpider`` using ``gz`` format::
- $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_gz.tar.gz
+ $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_gz.tar.gz
Todo:
@@ -145,9 +145,9 @@ def start_requests(self):
ftp_host, ftp_params = ftp_connection_info(
self.ftp_host, self.ftp_netrc)
_, new_files = ftp_list_files(
- self.ftp_folder,
- self.target_folder,
- server=ftp_host,
+ server_folder=self.ftp_folder,
+ destination_folder=self.target_folder,
+ ftp_host=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)
@@ -176,7 +176,7 @@ def handle_package_ftp(self, response):
for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
- meta={"package_path": zip_filepath}
+ meta={"source_folder": zip_filepath}
)
def handle_package_file(self, response):
@@ -189,7 +189,7 @@ def handle_package_file(self, response):
for xml_file in xml_files:
request = Request(
"file://{0}".format(xml_file),
- meta={"package_path": zip_filepath}
+ meta={"source_folder": zip_filepath}
)
if "xml_rich" in xml_file:
request.meta["rich"] = True
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index 78fdd5fd..7dfbb9bb 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -181,7 +181,7 @@ def handle_package(self, response):
xml_url = u"file://{0}".format(os.path.abspath(xml_file))
yield Request(
xml_url,
- meta={"package_path": zip_filepath,
+ meta={"source_folder": zip_filepath,
"xml_url": xml_url},
)
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index 22d418a9..3e6ec655 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -72,7 +72,7 @@ class WorldScientificSpider(Jats, XMLFeedSpider):
'rapid-communications'
]
- def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
+ def __init__(self, package_path=None, ftp_folder="/WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
"""Construct WSP spider."""
super(WorldScientificSpider, self).__init__(*args, **kwargs)
self.ftp_folder = ftp_folder
@@ -98,8 +98,8 @@ def start_requests(self):
new_files_paths = ftp_list_files(
self.ftp_folder,
- target_folder=self.target_folder,
- server=ftp_host,
+ destination_folder=self.target_folder,
+ ftp_host=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)
@@ -127,7 +127,7 @@ def handle_package_ftp(self, response):
for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
- meta={"package_path": zip_filepath}
+ meta={"source_folder": zip_filepath}
)
def handle_package_file(self, response):
@@ -139,7 +139,7 @@ def handle_package_file(self, response):
for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
- meta={"package_path": zip_filepath}
+ meta={"source_folder": zip_filepath}
)
def parse_node(self, response, node):
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 71ff3aa6..96dc130e 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -31,6 +31,10 @@
INST_PHRASES = ['for the development', ]
+class PathDoesNotExist(IOError):
+ pass
+
+
def unzip_xml_files(filename, target_folder):
"""Unzip files (XML only) into target folder."""
z = ZipFile(filename)
@@ -58,15 +62,19 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False):
def ftp_list_files(
- server_folder,
- server,
- user,
- password,
- target_folder=None,
- passive_mode=False,
- lst_missing_files=True,
+ server_folder,
+ ftp_host,
+ user,
+ password,
+ destination_folder=None,
+ passive_mode=False,
+ only_missing_files=True,
):
- """List files from given FTP's server folder to target folder."""
+    """List the files in the given FTP host folder.
+
+    Params:
+        server_folder(str): remote folder to list.
+        ftp_host(str): hostname of the FTP server.
+        user(str): FTP user name.
+        password(str): FTP password.
+        destination_folder(str): local folder used to check which files have already been
+            downloaded.
+        passive_mode(bool): whether to use FTP passive mode.
+        only_missing_files(bool): if ``True``, return only the files not yet present in
+            ``destination_folder``, otherwise return the paths of all the files in
+            ``server_folder``.
+    """
session_factory = ftputil.session.session_factory(
base_class=ftplib.FTP,
port=21,
@@ -74,10 +82,10 @@ def ftp_list_files(
encrypt_data_channel=True,
)
- with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
- file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
- if lst_missing_files:
- return list_missing_files(server_folder, target_folder, file_names)
+ with ftputil.FTPHost(ftp_host, user, password, session_factory=session_factory) as host:
+ file_names = host.listdir(os.path.join(host.curdir, server_folder))
+ if only_missing_files:
+ return list_missing_files(server_folder, destination_folder, file_names)
else:
return [
os.path.join(
@@ -340,37 +348,69 @@ def get_license_by_text(license_text):
return license
-def get_file_name_from_url(url):
- return url.rsplit('/', 1)[-1]
-
-
def get_absolute_file_path(file_path):
"""Returns the absolute path of a relative path."""
return os.path.abspath(file_path)
+class RecordFile(object):
+ """Metadata of a file needed for a record.
+
+ Params:
+ path(str): local path to the file.
+        name(str): optional name of the file; if not passed, the basename of ``path``
+            is used.
+
+    Raises:
+        PathDoesNotExist: if the given ``path`` does not exist on the local filesystem.
+ """
+ def __init__(self, path, name=None):
+ self.path = path
+ if not os.path.exists(self.path):
+ raise PathDoesNotExist("The given record file path '%s' does not exist." % self.path)
+
+ if name is None:
+ name = os.path.basename(path)
+
+ self.name = name
+
+
class ParsedItem(dict):
- """Generate interface to communicate Spider-Pipelines"""
+ """Each of the individual items returned by the spider to the pipeline.
+
+ Params:
+        record(dict): information about the crawled record; it might be in different formats.
+ record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
+ file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
+        ftp_params(dict): parameters for the ``FftFilesPipeline`` to connect to the FTP
+            server, if any.
+ record_files(list(RecordFile)): files attached to the record, usually populated by
+ ``FftFilesPipeline`` from the ``file_urls`` parameter.
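+
+    Example:
+        A ``ParsedItem`` for a ``hep`` record with one attached file (illustrative
+        values)::
+
+            parsed_item = ParsedItem(
+                record=hep_record,
+                record_format='hep',
+                file_urls=['ftp://ftp.example.com/DESY/FFT/test_fft_1.txt'],
+            )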
+ """
def __init__(
self,
- item,
+ record,
+ record_format,
file_urls=None,
- item_format=None,
ftp_params=None,
- file_paths=None,
+ record_files=None,
**kwargs
):
super(ParsedItem, self).__init__(
- item=item,
+ record=record,
+ record_format=record_format,
file_urls=file_urls,
- item_format=item_format,
ftp_params=ftp_params,
- file_paths=file_paths,
+ record_files=record_files,
**kwargs
)
- self.item = item
- self.file_urls = file_urls
- self.format = item_format
- self.ftp_params = ftp_params
- self.file_paths = file_paths
- self.__dict__ = self
+
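+    # Expose the dict entries as attributes so callers can use ``item.record``,
+    # ``item.record_files``, etc.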
+ def __getattr__(self, key):
+ if key not in self:
+ raise AttributeError(
+ "'%s' object has no attribute '%s'" % (self.__class__.__name__, key)
+ )
+
+ return self[key]
+
+ def __setattr__(self, key, value):
+ self[key] = value
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
index 672b8248..4095d62f 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
@@ -124,7 +124,7 @@
INSPIRE:HEP
- DESY/FFT/test_fft_1.txt
+ FFT/test_fft_1.txt
00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
.txt
cNFW_rogue_curves
@@ -135,7 +135,7 @@
- DESY/FFT/test_fft_2.txt
+ FFT/test_fft_2.txt
00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
.txt
scalingRelations_DutBeh_DC14_all_Oh
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
index b09a992d..fa395bfc 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
@@ -124,7 +124,7 @@
INSPIRE:HEP
- DESY/FFT/test_fft_1.txt
+ FFT/test_fft_1.txt
00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
.txt
cNFW_rogue_curves
@@ -135,7 +135,7 @@
- DESY/FFT/test_fft_2.txt
+ FFT/test_fft_2.txt
00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
.txt
scalingRelations_DutBeh_DC14_all_Oh
diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py
index 64a5c209..5c3f4929 100644
--- a/tests/functional/desy/test_desy.py
+++ b/tests/functional/desy/test_desy.py
@@ -111,7 +111,7 @@ def set_up_local_environment():
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
- 'package_path': package_location,
+ 'source_folder': package_location,
}
}
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index a0411b8e..8c7b060a 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -72,7 +72,7 @@ def set_up_local_environment():
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
- 'package_path': package_location,
+ 'source_folder': package_location,
}
}
diff --git a/tests/unit/test_crawler2hep.py b/tests/unit/test_crawler2hep.py
index 95375ebf..088178f1 100644
--- a/tests/unit/test_crawler2hep.py
+++ b/tests/unit/test_crawler2hep.py
@@ -12,14 +12,14 @@
import pytest
import yaml
-from hepcrawl.crawler2hep import crawler2hep
+from hepcrawl.crawler2hep import hepcrawl_to_hep
from hepcrawl.testlib.fixtures import get_test_suite_path
def load_file(file_name):
path = get_test_suite_path(
'responses',
- 'crawler2hep',
+ 'hepcrawl_to_hep',
file_name,
)
with open(path) as input_data:
@@ -52,7 +52,7 @@ def test_generic_crawler_record(
input_generic_crawler_record,
expected_generic_crawler_record
):
- produced_record = crawler2hep(input_generic_crawler_record)
+ produced_record = hepcrawl_to_hep(input_generic_crawler_record)
assert produced_record == expected_generic_crawler_record
@@ -60,5 +60,5 @@ def test_no_document_type(
input_no_document_type_record,
expected_no_document_type_record
):
- produced_record = crawler2hep(input_no_document_type_record)
+ produced_record = hepcrawl_to_hep(input_no_document_type_record)
assert produced_record == expected_no_document_type_record
diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
index 9d88d5ad..5dba9990 100644
--- a/tests/unit/test_edp.py
+++ b/tests/unit/test_edp.py
@@ -359,7 +359,7 @@ def test_handle_package_ftp(tarbzfile):
request = spider.handle_package_ftp(response).next()
assert isinstance(request, Request)
- assert request.meta["package_path"] == tarbzfile
+ assert request.meta["source_folder"] == tarbzfile
def test_no_dois_jats():
diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
index 109f3d3f..d26e52fb 100644
--- a/tests/unit/test_elsevier.py
+++ b/tests/unit/test_elsevier.py
@@ -1594,11 +1594,11 @@ def test_handle_package(handled_package):
for astro, nima in zip(astropart, nima):
assert nima
assert astro
- assert astro.meta["package_path"] == "tests/unit/responses/elsevier/fake_astropart.zip"
+ assert astro.meta["source_folder"] == "tests/unit/responses/elsevier/fake_astropart.zip"
url_to_match = u'file:///tmp/elsevier_fake_astropart_*/0927-6505/aip/S0927650515001656/S0927650515001656.xml'
assert astro.meta["xml_url"] == fnmatch.filter([astro.meta["xml_url"]], url_to_match)[0]
- assert nima.meta["package_path"] == "tests/unit/responses/elsevier/fake_nima.zip"
+ assert nima.meta["source_folder"] == "tests/unit/responses/elsevier/fake_nima.zip"
url_to_match = u'file:///tmp/elsevier_fake_nima_*/0168-9002/S0168900215X00398/S0168900215015636/S0168900215015636.xml'
assert nima.meta["xml_url"] == fnmatch.filter([nima.meta["xml_url"]], url_to_match)[0]