diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index add53f70..3a8d3ba8 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -20,22 +20,34 @@
 from inspire_schemas.api import LiteratureBuilder
 
-from hepcrawl.utils import get_file_name_from_url
-
-def _update_record_fft(record, index_fft_file_paths):
-    def _update_fft_fields(fft_fields, index_fft_file_paths):
-        new_fft_fields = []
-        for fft_field in fft_fields:
-            file_name = get_file_name_from_url(fft_field['path'])
-            if file_name in index_fft_file_paths:
-                fft_field['path'] = index_fft_file_paths[file_name]
-                new_fft_fields.append(fft_field)
-
-        return new_fft_fields
-
-    record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
-    return record
+def _get_updated_fft_fields(current_fft_fields, record_files):
+    """Update the FFT fields with the local paths of the record's downloaded files.
+
+    Params:
+        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
+            expect each of them to have, at least, a key named ``path``.
+        record_files(list(RecordFile)): files attached to the record as populated by
+            ``FftFilesPipeline``.
+    """
+    record_files_index = {
+        record_file.name: record_file.path
+        for record_file in record_files
+    }
+    new_fft_fields = []
+    for fft_field in current_fft_fields:
+        file_name = os.path.basename(fft_field['path'])
+        if file_name in record_files_index:
+            fft_field['path'] = record_files_index[file_name]
+        new_fft_fields.append(fft_field)
+
+    return new_fft_fields
 
 
 def _has_publication_info(item):
@@ -116,50 +128,47 @@ def _normalize_hepcrawl_record(item, source):
     return item
 
 
-def _generate_acquisition_source(crawler_record, source):
-    crawler_record['acquisition_source'] = {
+def _generate_acquisition_source(source):
+    acquisition_source = {
         'source': source,
         'method': 'hepcrawl',
         'datetime': datetime.datetime.now().isoformat(),
         'submission_number': os.environ.get('SCRAPY_JOB', ''),
     }
-    return crawler_record
+    return acquisition_source
 
 
-def to_hep(
-    item,
-    source,
-    item_format='hepcrawl',
-    fft_file_paths=None,
+def item_to_hep(
+    item,
+    source,
 ):
-    item = _generate_acquisition_source(
-        crawler_record=item,
-        source=source,
-    )
+    item.record['acquisition_source'] = _generate_acquisition_source(source=source)
 
-    if item_format == 'hep':
-        return hep2hep(
-            crawler_record=item,
-            fft_file_paths=fft_file_paths,
+    if item.record_format == 'hep':
+        return hep_to_hep(
+            hep_record=item.record,
+            record_files=item.record_files,
         )
-    elif item_format == 'hepcrawl':
+    elif item.record_format == 'hepcrawl':
         item = _normalize_hepcrawl_record(
             item=item,
             source=source,
         )
-        return crawler2hep(dict(item))
+        return hepcrawl_to_hep(dict(item))
     else:
-        raise Exception('Unknown item_format::{}'.format(item_format))
+        raise Exception('Unknown item_format::{}'.format(item.record_format))
 
 
-def hep2hep(crawler_record, fft_file_paths):
-    if fft_file_paths:
-        crawler_record = _update_record_fft(crawler_record, fft_file_paths)
+def hep_to_hep(hep_record, record_files):
+    hep_record['_fft'] = _get_updated_fft_fields(
+        current_fft_fields=hep_record['_fft'],
+        record_files=record_files,
+    )
 
-    return crawler_record
+    return hep_record
 
 
-def crawler2hep(crawler_record):
+def hepcrawl_to_hep(crawler_record):
     def _filter_affiliation(affiliations):
         return [
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 05b61361..b8db5bcf 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -24,16 +24,23 @@
 from inspire_schemas.utils import validate
 
-from hepcrawl.crawler2hep import to_hep
+from hepcrawl.crawler2hep import item_to_hep
 from hepcrawl.settings import FILES_STORE
-from hepcrawl.utils import get_file_name_from_url
+from hepcrawl.utils import RecordFile
 
 
 class FftFilesPipeline(FilesPipeline):
-    """Download all the FFT files provided by record."""
+    """Download all the FFT files provided by record.
 
-    def __init__(self, *args, **kwargs):
-        super(FftFilesPipeline, self).__init__(FILES_STORE)
+    Note:
+
+        This pipeline only runs if the spider returns a ``ParsedItem`` that has a
+        ``file_urls`` property.
+    """
+
+    def __init__(self, store_uri, *args, **kwargs):
+        store_uri = store_uri or FILES_STORE
+        super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs)
 
     def get_media_requests(self, item, info):
         """Download FFT files using FTP."""
@@ -44,24 +51,25 @@ def get_media_requests(self, item, info):
             meta=item.ftp_params,
         )
 
+    def get_absolute_file_path(self, path):
+        return os.path.abspath(
+            os.path.join(
+                self.store.basedir,
+                path
+            )
+        )
+
     def item_completed(self, results, item, info):
         """Create a map that connects file names with downloaded files."""
-        def _get_absolute_local_file_path(path):
-            return os.path.abspath(
-                os.path.join(
-                    FILES_STORE,
-                    path
-                )
+        record_files = [
+            RecordFile(
+                path=self.get_absolute_file_path(result_data['path']),
+                name=os.path.basename(result_data['url']),
             )
-
-        map_file_names_paths = {}
-        for ok, result_data in results:
-            if ok:
-                map_file_names_paths[
-                    get_file_name_from_url(result_data['url'])
-                ] = _get_absolute_local_file_path(result_data['path'])
-
-        item.file_paths = map_file_names_paths
+            for ok, result_data in results
+            if ok
+        ]
+        item.record_files = record_files
 
         return item
 
@@ -76,16 +84,11 @@ def open_spider(self, spider):
         self.results_data = []
 
     def _post_enhance_item(self, item, spider):
-        fft_file_paths = item.file_paths
-        item_format = item.item_format
-        item = item.item if item.item else item
         source = spider.name
 
-        return to_hep(
+        return item_to_hep(
             item=item,
             source=source,
-            item_format=item_format,
-            fft_file_paths=fft_file_paths,
         )
 
     def process_item(self, item, spider):
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 5ec79da8..2167d0a2 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -15,6 +15,7 @@
 from lxml import etree
 
 from dojson.contrib.marc21.utils import create_record
+from six.moves import urllib
 
 from scrapy import Request
 from scrapy.spiders import Spider
@@ -24,8 +25,6 @@
 from hepcrawl.utils import (
     ftp_list_files,
     ftp_connection_info,
-    get_absolute_file_path,
-    get_file_name_from_url,
     ParsedItem,
 )
 
@@ -38,13 +37,14 @@ class DesySpider(Spider):
 
     Examples:
         To run a crawl, you need to pass FTP connection information via
-        ``ftp_host`` and ``ftp_netrc``::
+        ``ftp_host`` and ``ftp_netrc``; if ``ftp_folder`` is not passed, it will fall back
+        to ``/DESY``::
 
            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'
 
-        To run a crawl on local folder, you need to pass the absolute ``package_path``::
+        To run a crawl on a local folder, you need to pass the absolute ``source_folder``::
 
-           $ scrapy crawl desy -a 'package_path=/path/to/package_dir'
+           $ scrapy crawl desy -a 'source_folder=/path/to/package_dir'
     """
     name = 'desy'
     custom_settings = {}
@@ -52,118 +52,147 @@ class DesySpider(Spider):
     def __init__(
         self,
-        package_path=None,
-        ftp_folder='DESY',
+        source_folder=None,
+        ftp_folder='/DESY',
         ftp_host=None,
         ftp_netrc=None,
+        destination_folder='/tmp/DESY',
         *args,
         **kwargs
     ):
-        """Constructor of ``Desy`` spider."""
         super(DesySpider, self).__init__(*args, **kwargs)
         self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
         self.ftp_netrc = ftp_netrc
-        self.package_path = package_path
-        self.target_folder = '/tmp/DESY'
+        self.source_folder = source_folder
+        self.destination_folder = destination_folder
         self.ftp_enabled = True if self.ftp_host else False
-        if not os.path.exists(self.target_folder):
-            os.makedirs(self.target_folder)
+        if not os.path.exists(self.destination_folder):
+            os.makedirs(self.destination_folder)
+
+    @staticmethod
+    def _list_xml_files_paths(list_files_paths):
+        return [
+            xml_file
+            for xml_file in list_files_paths
+            if xml_file.endswith('.xml')
+        ]
+
+    def crawl_local_directory(self):
+        file_names = os.listdir(self.source_folder)
+        xml_file_names = self._list_xml_files_paths(file_names)
+
+        for file_name in xml_file_names:
+            file_path = os.path.join(self.source_folder, file_name)
+            self.log('Local: Try to crawl local file: {0}'.format(file_path))
+            yield Request(
+                'file://{0}'.format(file_path),
+                callback=self.parse,
+            )
+
+    def crawl_ftp_directory(self):
+        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+
+        remote_files_paths = ftp_list_files(
+            self.ftp_folder,
+            destination_folder=self.destination_folder,
+            ftp_host=ftp_host,
+            user=ftp_params['ftp_user'],
+            password=ftp_params['ftp_password'],
+            only_missing_files=False,
+        )
+
+        xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+
+        for remote_file in xml_remote_files_paths:
+            self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
+            remote_file = str(remote_file)
+            ftp_params['ftp_local_filename'] = os.path.join(
+                self.destination_folder,
+                os.path.basename(remote_file),
+            )
+            remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
+            yield Request(
+                str(remote_url),
+                meta=ftp_params,
+                callback=self.handle_package_ftp,
+            )
+
+    def handle_package_ftp(self, response):
+        """Yield every XML file found.
+
+        This is an intermediate step before calling ``DesySpider.parse`` to handle the
+        record collections downloaded from FTP.
+ """ + self.log('Visited url {}'.format(response.url)) + file_path = response.body + yield Request( + 'file://{0}'.format(file_path), + meta={'source_folder': file_path}, + callback=self.parse, + ) def start_requests(self): """List selected folder on remote FTP and yield files.""" - def _list_xml_files_paths(list_files_paths): - return [ - xml_file - for xml_file in list_files_paths - if xml_file.endswith('.xml') - ] - - if self.package_path: - file_names = os.listdir(self.package_path) - xml_file_names = _list_xml_files_paths(file_names) - for file_name in xml_file_names: - file_path = os.path.join(self.package_path, file_name) - self.log('Local: Try to crawl local file: {0}'.format(file_path)) - yield Request( - 'file://{0}'.format(file_path), - callback=self.parse, - ) + if self.source_folder: + requests = self.crawl_local_directory() else: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) - - remote_files_paths = ftp_list_files( - self.ftp_folder, - target_folder=self.target_folder, - server=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'], - lst_missing_files=False, - ) + requests = self.crawl_ftp_directory() - xml_remote_files_paths = _list_xml_files_paths(remote_files_paths) + for request in requests: + yield request - for remote_file in xml_remote_files_paths: - self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file)) - remote_file = str(remote_file) - ftp_params['ftp_local_filename'] = os.path.join( - self.target_folder, - os.path.basename(remote_file), - ) - remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp, - ) + @staticmethod + def _get_full_uri(current_path, base_url, schema, hostname=''): + if os.path.isabs(current_path): + full_path = current_path + else: + full_path = os.path.join(base_url, current_path) + + return '{schema}://{hostname}{full_path}'.format(**vars()) def parse(self, response): - """Parse a ``Desy`` XML file into a HEP record.""" + """Parse a ``Desy`` XML file into a ``ParsedItem``.""" + self.log('Got record from url/path: {0}'.format(response.url)) self.log('FTP enabled: {0}'.format(self.ftp_enabled)) ftp_params = None if self.ftp_enabled: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) - prefix_url = '{0}://{1}/'.format('ftp', ftp_host) + hostname, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + base_url = self.ftp_folder + url_schema = 'ftp' else: - prefix_url = '{0}://{1}'.format( - 'file', - '/code/tests/functional/desy/fixtures/ftp_server/', # Temporary - Must be absolute path - ) + base_url = os.path.dirname(urllib.parse.urlparse(response.url).path) + url_schema = 'file' + hostname = None marcxml_records = self._get_marcxml_records(response.body) hep_records = self._hep_records_from_marcxml(marcxml_records) - list_fft_old_links = [] for hep_record in hep_records: - list_fft_old_links.extend(hep_record['_fft']) - list_file_urls = [ - '{0}{1}'.format(prefix_url, fft_link['path']) - for fft_link in hep_record['_fft'] + self._get_full_uri( + current_path=fft_path['path'], + base_url=base_url, + schema=url_schema, + hostname=hostname, + ) + for fft_path in hep_record['_fft'] ] parsed_item = ParsedItem( - item=hep_record, + record=hep_record, file_urls=list_file_urls, ftp_params=ftp_params, - item_format='hep', + record_format='hep', ) yield parsed_item - def handle_package_ftp(self, response): - """Yield every XML file found.""" - 
-        self.log('Visited url {}'.format(response.url))
-        file_path = response.body
-        yield Request(
-            'file://{0}'.format(file_path),
-            meta={'package_path': file_path}
-        )
-
-    def _get_marcxml_records(self, response_body):
+    @staticmethod
+    def _get_marcxml_records(response_body):
         root = etree.fromstring(response_body)
         list_items = root.findall('.//{http://www.loc.gov/MARC21/slim}record')
         if not list_items:
@@ -171,15 +200,16 @@ def _get_marcxml_records(self, response_body):
 
         return [etree.tostring(item) for item in list_items]
 
-    def _hep_records_from_marcxml(self, list_marcxml_records):
-        def _create_json_record(str_xml_record):
-            object_record = create_record(etree.XML(str_xml_record))
+    @staticmethod
+    def _hep_records_from_marcxml(marcxml_records):
+        def _create_json_record(xml_record):
+            object_record = create_record(etree.XML(xml_record))
             dojson_record = hep.do(object_record)
             return dojson_record
 
-        list_hep_records = []
-        for str_xml_record in list_marcxml_records:
-            json_record = _create_json_record(str_xml_record)
-            list_hep_records.append(json_record)
+        hep_records = []
+        for xml_record in marcxml_records:
+            json_record = _create_json_record(xml_record)
+            hep_records.append(json_record)
 
-        return list_hep_records
+        return hep_records
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index 499e3edc..d7ed1715 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -66,11 +66,11 @@ class EDPSpider(Jats, XMLFeedSpider):
 
     To run an ``EDPSpider`` using ``rich`` format::
 
-        $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
+        $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
 
     To run an ``EDPSpider`` using ``gz`` format::
 
-        $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_gz.tar.gz
+        $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_gz.tar.gz
 
     Todo:
 
@@ -145,9 +145,9 @@ def start_requests(self):
         ftp_host, ftp_params = ftp_connection_info(
             self.ftp_host, self.ftp_netrc)
         _, new_files = ftp_list_files(
-            self.ftp_folder,
-            self.target_folder,
-            server=ftp_host,
+            server_folder=self.ftp_folder,
+            destination_folder=self.target_folder,
+            ftp_host=ftp_host,
             user=ftp_params['ftp_user'],
             password=ftp_params['ftp_password']
         )
@@ -176,7 +176,7 @@ def handle_package_ftp(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )
 
     def handle_package_file(self, response):
@@ -189,7 +189,7 @@ def handle_package_file(self, response):
         for xml_file in xml_files:
             request = Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )
             if "xml_rich" in xml_file:
                 request.meta["rich"] = True
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index 78fdd5fd..7dfbb9bb 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -181,7 +181,7 @@ def handle_package(self, response):
             xml_url = u"file://{0}".format(os.path.abspath(xml_file))
             yield Request(
                 xml_url,
-                meta={"package_path": zip_filepath,
+                meta={"source_folder": zip_filepath,
                       "xml_url": xml_url},
             )
 
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index 22d418a9..3e6ec655 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -72,7 +72,7 @@ class WorldScientificSpider(Jats, XMLFeedSpider):
         'rapid-communications'
     ]
 
-    def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
+    def __init__(self, package_path=None, ftp_folder="/WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
         """Construct WSP spider."""
         super(WorldScientificSpider, self).__init__(*args, **kwargs)
         self.ftp_folder = ftp_folder
@@ -98,8 +98,8 @@ def start_requests(self):
 
         new_files_paths = ftp_list_files(
             self.ftp_folder,
-            target_folder=self.target_folder,
-            server=ftp_host,
+            destination_folder=self.target_folder,
+            ftp_host=ftp_host,
             user=ftp_params['ftp_user'],
             password=ftp_params['ftp_password']
         )
@@ -127,7 +127,7 @@ def handle_package_ftp(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )
 
     def handle_package_file(self, response):
@@ -139,7 +139,7 @@ def handle_package_file(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )
 
     def parse_node(self, response, node):
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 71ff3aa6..96dc130e 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -31,6 +31,10 @@
 INST_PHRASES = ['for the development', ]
 
 
+class PathDoesNotExist(IOError):
+    pass
+
+
 def unzip_xml_files(filename, target_folder):
     """Unzip files (XML only) into target folder."""
     z = ZipFile(filename)
@@ -58,15 +62,19 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False):
 
 
 def ftp_list_files(
-    server_folder,
-    server,
-    user,
-    password,
-    target_folder=None,
-    passive_mode=False,
-    lst_missing_files=True,
+    server_folder,
+    ftp_host,
+    user,
+    password,
+    destination_folder=None,
+    passive_mode=False,
+    only_missing_files=True,
 ):
-    """List files from given FTP's server folder to target folder."""
+    """List the files in the given folder of the FTP server.
+
+    Params:
+        server_folder(str): remote folder to list.
+        ftp_host(str): host name of the FTP server.
+        user(str): FTP user name.
+        password(str): FTP password.
+        destination_folder(str): local folder the files would be downloaded to.
+        passive_mode(bool): whether to use FTP passive mode.
+        only_missing_files(bool): if ``True``, return only the files not yet present in
+            ``destination_folder``.
+    """
     session_factory = ftputil.session.session_factory(
         base_class=ftplib.FTP,
         port=21,
@@ -74,10 +82,10 @@
         encrypt_data_channel=True,
     )
 
-    with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
-        file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
-        if lst_missing_files:
-            return list_missing_files(server_folder, target_folder, file_names)
+    with ftputil.FTPHost(ftp_host, user, password, session_factory=session_factory) as host:
+        file_names = host.listdir(os.path.join(host.curdir, server_folder))
+        if only_missing_files:
+            return list_missing_files(server_folder, destination_folder, file_names)
         else:
             return [
                 os.path.join(
@@ -340,37 +348,69 @@ def get_license_by_text(license_text):
     return license
 
 
-def get_file_name_from_url(url):
-    return url.rsplit('/', 1)[-1]
-
-
 def get_absolute_file_path(file_path):
     """Returns the absolute path of a relative path."""
     return os.path.abspath(file_path)
 
 
+class RecordFile(object):
+    """Metadata of a file needed for a record.
+
+    Params:
+        path(str): local path to the file.
+        name(str): optional name of the file; if not passed, the basename of ``path`` is
+            used.
+
+    Raises:
+        PathDoesNotExist: if the given ``path`` does not exist.
+    """
+    def __init__(self, path, name=None):
+        self.path = path
+        if not os.path.exists(self.path):
+            raise PathDoesNotExist("The given record file path '%s' does not exist."
+                                   % self.path)
+
+        if name is None:
+            name = os.path.basename(path)
+
+        self.name = name
+
+
 class ParsedItem(dict):
-    """Generate interface to communicate Spider-Pipelines"""
+    """Each of the individual items returned by the spider to the pipeline.
+
+    Params:
+        record(dict): information about the crawled record, might be in different formats.
+        record_format(str): format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
+        file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
+        ftp_params(dict): parameters for the ``FftFilesPipeline`` to be able to connect to the
+            FTP server, if any.
+        record_files(list(RecordFile)): files attached to the record, usually populated by
+            ``FftFilesPipeline`` from the ``file_urls`` parameter.
+    """
     def __init__(
         self,
-        item,
+        record,
+        record_format,
         file_urls=None,
-        item_format=None,
         ftp_params=None,
-        file_paths=None,
+        record_files=None,
         **kwargs
     ):
         super(ParsedItem, self).__init__(
-            item=item,
+            record=record,
+            record_format=record_format,
             file_urls=file_urls,
-            item_format=item_format,
             ftp_params=ftp_params,
-            file_paths=file_paths,
+            record_files=record_files,
             **kwargs
         )
-        self.item = item
-        self.file_urls = file_urls
-        self.format = item_format
-        self.ftp_params = ftp_params
-        self.file_paths = file_paths
-        self.__dict__ = self
+
+    def __getattr__(self, key):
+        if key not in self:
+            raise AttributeError(
+                "'%s' object has no attribute '%s'" % (self.__class__.__name__, key)
+            )
+
+        return self[key]
+
+    def __setattr__(self, key, value):
+        self[key] = value
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
index 672b8248..4095d62f 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
@@ -124,7 +124,7 @@
   INSPIRE:HEP
-  DESY/FFT/test_fft_1.txt
+  FFT/test_fft_1.txt
   00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
   .txt
   cNFW_rogue_curves
@@ -135,7 +135,7 @@
-  DESY/FFT/test_fft_2.txt
+  FFT/test_fft_2.txt
   00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
   .txt
   scalingRelations_DutBeh_DC14_all_Oh
diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
index b09a992d..fa395bfc 100644
--- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
+++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
@@ -124,7 +124,7 @@
   INSPIRE:HEP
-  DESY/FFT/test_fft_1.txt
+  FFT/test_fft_1.txt
   00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}.
   .txt
   cNFW_rogue_curves
@@ -135,7 +135,7 @@
-  DESY/FFT/test_fft_2.txt
+  FFT/test_fft_2.txt
   00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines.
   .txt
   scalingRelations_DutBeh_DC14_all_Oh
diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py
index 64a5c209..5c3f4929 100644
--- a/tests/functional/desy/test_desy.py
+++ b/tests/functional/desy/test_desy.py
@@ -111,7 +111,7 @@ def set_up_local_environment():
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
-            'package_path': package_location,
+            'source_folder': package_location,
         }
     }
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index a0411b8e..8c7b060a 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -72,7 +72,7 @@ def set_up_local_environment():
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
-            'package_path': package_location,
+            'source_folder': package_location,
         }
     }
diff --git a/tests/unit/test_crawler2hep.py b/tests/unit/test_crawler2hep.py
index 95375ebf..088178f1 100644
--- a/tests/unit/test_crawler2hep.py
+++ b/tests/unit/test_crawler2hep.py
@@ -12,14 +12,14 @@
 import pytest
 import yaml
 
-from hepcrawl.crawler2hep import crawler2hep
+from hepcrawl.crawler2hep import hepcrawl_to_hep
 from hepcrawl.testlib.fixtures import get_test_suite_path
 
 
 def load_file(file_name):
     path = get_test_suite_path(
         'responses',
-        'crawler2hep',
+        'hepcrawl_to_hep',
         file_name,
     )
     with open(path) as input_data:
@@ -52,7 +52,7 @@ def test_generic_crawler_record(
     input_generic_crawler_record, expected_generic_crawler_record
 ):
-    produced_record = crawler2hep(input_generic_crawler_record)
+    produced_record = hepcrawl_to_hep(input_generic_crawler_record)
     assert produced_record == expected_generic_crawler_record
 
 
@@ -60,5 +60,5 @@ def test_no_document_type(
     input_no_document_type_record, expected_no_document_type_record
 ):
-    produced_record = crawler2hep(input_no_document_type_record)
+    produced_record = hepcrawl_to_hep(input_no_document_type_record)
     assert produced_record == expected_no_document_type_record
diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
index 9d88d5ad..5dba9990 100644
--- a/tests/unit/test_edp.py
+++ b/tests/unit/test_edp.py
@@ -359,7 +359,7 @@ def test_handle_package_ftp(tarbzfile):
     request = spider.handle_package_ftp(response).next()
 
     assert isinstance(request, Request)
-    assert request.meta["package_path"] == tarbzfile
+    assert request.meta["source_folder"] == tarbzfile
 
 
 def test_no_dois_jats():
diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
index 109f3d3f..d26e52fb 100644
--- a/tests/unit/test_elsevier.py
+++ b/tests/unit/test_elsevier.py
@@ -1594,11 +1594,11 @@ def test_handle_package(handled_package):
     for astro, nima in zip(astropart, nima):
         assert nima
         assert astro
-        assert astro.meta["package_path"] == "tests/unit/responses/elsevier/fake_astropart.zip"
+        assert astro.meta["source_folder"] == "tests/unit/responses/elsevier/fake_astropart.zip"
         url_to_match = u'file:///tmp/elsevier_fake_astropart_*/0927-6505/aip/S0927650515001656/S0927650515001656.xml'
         assert astro.meta["xml_url"] == fnmatch.filter([astro.meta["xml_url"]], url_to_match)[0]
 
-        assert nima.meta["package_path"] == "tests/unit/responses/elsevier/fake_nima.zip"
+        assert nima.meta["source_folder"] == "tests/unit/responses/elsevier/fake_nima.zip"
         url_to_match = u'file:///tmp/elsevier_fake_nima_*/0168-9002/S0168900215X00398/S0168900215015636/S0168900215015636.xml'
         assert nima.meta["xml_url"] == fnmatch.filter([nima.meta["xml_url"]], url_to_match)[0]