From b3d9bb2caaf2b23a35cc6f3ebe13fd9f1ad8a06e Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 23 Oct 2017 12:49:25 +0200 Subject: [PATCH] desy: properly use/populate documents url Signed-off-by: David Caro --- hepcrawl/pipelines.py | 8 ++----- hepcrawl/spiders/desy_spider.py | 21 ++++++++++++------- hepcrawl/tohep.py | 17 +++++++++------ hepcrawl/utils.py | 10 +++++++++ .../fixtures/desy_records_ftp_expected.json | 2 +- .../fixtures/desy_records_local_expected.json | 2 +- .../DESY/desy_collection_records.xml | 2 +- 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 9b386c58..b30ff6c7 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -36,7 +36,7 @@ class DocumentsPipeline(FilesPipeline): - """Download all the documents provided by record. + """Download all the documents the record passed to download. Note: @@ -55,7 +55,6 @@ def __init__(self, store_uri, *args, **kwargs): ) def get_media_requests(self, item, info): - """Download documents using FTP.""" if item.get('file_urls'): logging.info( 'Got the following files to download:\n%s' % pprint.pformat( @@ -70,10 +69,7 @@ def get_media_requests(self, item, info): def get_absolute_file_path(self, path): return os.path.abspath( - os.path.join( - self.store.basedir, - path - ) + os.path.join(self.store.basedir, path) ) def item_completed(self, results, item, info): diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index f0d06c87..3353066d 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -178,14 +178,19 @@ def start_requests(self): yield request @staticmethod - def _get_full_uri(current_url, base_url, schema, hostname=None): + def _has_to_be_downloaded(current_url): + def _is_local_path(url): + parsed_url = urllib.parse.urlparse(url) + return not parsed_url.scheme + + return _is_local_path(current_url) + + @staticmethod + def _get_full_uri(current_url, base_url, 
schema='ftp', hostname=None): hostname = hostname or '' parsed_url = urllib.parse.urlparse(current_url) - if parsed_url.scheme and parsed_url.scheme not in ['ftp', 'file']: - return current_url - current_path = parsed_url.path if os.path.isabs(current_path): full_path = current_path @@ -228,7 +233,7 @@ def parse(self, response): self.log('Got %d hep records' % len(hep_records)) for hep_record in hep_records: - list_file_urls = [ + files_to_download = [ self._get_full_uri( current_url=document['url'], base_url=base_url, @@ -236,14 +241,16 @@ def parse(self, response): hostname=hostname, ) for document in hep_record.get('documents', []) + if self._has_to_be_downloaded(document['url']) ] self.log( - 'Got the following attached documents: %s' % list_file_urls + 'Got the following attached documents to download: %s' + % files_to_download ) parsed_item = ParsedItem( record=hep_record, - file_urls=list_file_urls, + file_urls=files_to_download, ftp_params=ftp_params, record_format='hep', ) diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 5bc133c2..980a030d 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -43,21 +43,25 @@ def _get_updated_documents(current_documents, record_files): Args: current_documents(list(dict)): current documents as generated by ``dojson``. We expect each of them to have, at least, a key - named ``url``. + named ``old_url``. record_files(list(RecordFile)): files attached to the record as populated by :class:`hepcrawl.pipelines.DocumentsPipeline`. 
""" record_files_index = { - record_file.name: record_file.path + os.path.basename(record_file.name): record_file.path for record_file in record_files } new_documents = [] for document in current_documents: - file_name = os.path.basename(document['url']) - if file_name in record_files_index: - document['url'] = record_files_index[file_name] - new_documents.append(document) + url = document.get('old_url', document.get('url', '')) + full_file_name = os.path.basename(url) + if url and full_file_name in record_files_index: + document['url'] = record_files_index[full_file_name] + elif url: + document['url'] = url + + new_documents.append(document) return new_documents @@ -200,6 +204,7 @@ def hep_to_hep(hep_record, record_files): """ if record_files: LOGGER.debug('Updating documents from: %s', hep_record['documents']) + LOGGER.debug('With record_files: %s', record_files) hep_record['documents'] = _get_updated_documents( current_documents=hep_record['documents'], record_files=record_files, diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index a15455eb..81e1d9d4 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -376,6 +376,16 @@ def __init__(self, path, name=None): self.name = name + def __repr__(self): + return self.__str__() + + def __str__(self): + return '%s(path="%s", name="%s")' % ( + self.__class__.__name__, + self.path, + self.name, + ) + class ParsedItem(dict): """Each of the individual items returned by the spider to the pipeline. 
diff --git a/tests/functional/desy/fixtures/desy_records_ftp_expected.json b/tests/functional/desy/fixtures/desy_records_ftp_expected.json index 57f91e06..1580ad9e 100644 --- a/tests/functional/desy/fixtures/desy_records_ftp_expected.json +++ b/tests/functional/desy/fixtures/desy_records_ftp_expected.json @@ -225,7 +225,7 @@ "core": true, "documents": [ { - "url": "/tmp/file_urls/full/85f78f549bd45b34999bc72353d982127043c341.pdf", + "url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf", "fulltext": true, "key": "document" } diff --git a/tests/functional/desy/fixtures/desy_records_local_expected.json b/tests/functional/desy/fixtures/desy_records_local_expected.json index 3fa238bb..d3a75d90 100644 --- a/tests/functional/desy/fixtures/desy_records_local_expected.json +++ b/tests/functional/desy/fixtures/desy_records_local_expected.json @@ -225,7 +225,7 @@ "core": true, "documents": [ { - "url": "/tmp/file_urls/full/395353fad4fc8ce1c37afe9f019e1473917747e9.pdf", + "url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf", "fulltext": true, "key": "document" } diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml index 14c8de21..7a01d94b 100644 --- a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml @@ -237,7 +237,7 @@ DESY - FFT/desy-thesis-17-036.title.pdf + http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf Fulltext INSPIRE-PUBLIC