Skip to content

Commit

Permalink
Merge pull request #187 from david-caro/fix_desy_documents
Browse files Browse the repository at this point in the history
desy: properly use/populate documents url
  • Loading branch information
david-caro authored Oct 23, 2017
2 parents 42b85d1 + b3d9bb2 commit 903c5ff
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 22 deletions.
8 changes: 2 additions & 6 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@


class DocumentsPipeline(FilesPipeline):
"""Download all the documents provided by record.
"""Download all the documents the record passed to download.
Note:
Expand All @@ -55,7 +55,6 @@ def __init__(self, store_uri, *args, **kwargs):
)

def get_media_requests(self, item, info):
"""Download documents using FTP."""
if item.get('file_urls'):
logging.info(
'Got the following files to download:\n%s' % pprint.pformat(
Expand All @@ -70,10 +69,7 @@ def get_media_requests(self, item, info):

def get_absolute_file_path(self, path):
    """Return the absolute filesystem path of a file stored by the pipeline.

    Args:
        path(str): path relative to the pipeline store base directory
            (``self.store.basedir``).

    Returns:
        str: absolute, normalized path of the stored file.
    """
    # The diff residue duplicated the old and new join calls; keep the
    # single-call form joining the store base dir with the relative path.
    return os.path.abspath(
        os.path.join(self.store.basedir, path)
    )

def item_completed(self, results, item, info):
Expand Down
21 changes: 14 additions & 7 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,19 @@ def start_requests(self):
yield request

@staticmethod
def _has_to_be_downloaded(current_url):
    """Tell if the given document url should be fetched by the crawler.

    Only schemeless urls (that is, plain local/relative paths) are
    considered downloadable; any url that already carries a scheme
    (``http``, ``ftp``, ...) is left for later resolution.

    Args:
        current_url(str): url (or plain path) of the document.

    Returns:
        bool: ``True`` when ``current_url`` is a schemeless local path.
    """
    def _is_local_path(url):
        # urlparse yields an empty ``scheme`` for bare paths.
        parsed_url = urllib.parse.urlparse(url)
        return not parsed_url.scheme

    return _is_local_path(current_url)

@staticmethod
def _get_full_uri(current_url, base_url, schema='ftp', hostname=None):
hostname = hostname or ''

parsed_url = urllib.parse.urlparse(current_url)

if parsed_url.scheme and parsed_url.scheme not in ['ftp', 'file']:
return current_url

current_path = parsed_url.path
if os.path.isabs(current_path):
full_path = current_path
Expand Down Expand Up @@ -228,22 +233,24 @@ def parse(self, response):
self.log('Got %d hep records' % len(hep_records))

for hep_record in hep_records:
list_file_urls = [
files_to_download = [
self._get_full_uri(
current_url=document['url'],
base_url=base_url,
schema=url_schema,
hostname=hostname,
)
for document in hep_record.get('documents', [])
if self._has_to_be_downloaded(document['url'])
]

self.log(
'Got the following attached documents: %s' % list_file_urls
'Got the following attached documents to download: %s'
% files_to_download
)
parsed_item = ParsedItem(
record=hep_record,
file_urls=list_file_urls,
file_urls=files_to_download,
ftp_params=ftp_params,
record_format='hep',
)
Expand Down
17 changes: 11 additions & 6 deletions hepcrawl/tohep.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,25 @@ def _get_updated_documents(current_documents, record_files):
Args:
current_documents(list(dict)): current documents as generated
by ``dojson``. We expect each of them to have, at least, a key
named ``url``.
named ``old_url``.
record_files(list(RecordFile)): files attached to the record as
populated by :class:`hepcrawl.pipelines.DocumentsPipeline`.
"""
record_files_index = {
record_file.name: record_file.path
os.path.basename(record_file.name): record_file.path
for record_file in record_files
}
new_documents = []
for document in current_documents:
file_name = os.path.basename(document['url'])
if file_name in record_files_index:
document['url'] = record_files_index[file_name]
new_documents.append(document)
url = document.get('old_url', document.get('url', ''))
full_file_name = os.path.basename(url)
if url and full_file_name in record_files_index:
document['url'] = record_files_index[full_file_name]
elif url:
document['url'] = document['old_url']

new_documents.append(document)

return new_documents

Expand Down Expand Up @@ -200,6 +204,7 @@ def hep_to_hep(hep_record, record_files):
"""
if record_files:
LOGGER.debug('Updating documents from: %s', hep_record['documents'])
LOGGER.debug('With record_files: %s', record_files)
hep_record['documents'] = _get_updated_documents(
current_documents=hep_record['documents'],
record_files=record_files,
Expand Down
10 changes: 10 additions & 0 deletions hepcrawl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,16 @@ def __init__(self, path, name=None):

self.name = name

def __repr__(self):
    """Debug representation; delegates to :meth:`__str__`."""
    return str(self)

def __str__(self):
    """Human-readable form, e.g. ``RecordFile(path="...", name="...")``.

    Bug fix: the original passed ``(self.name, self.path)`` into a format
    string whose labels are ``path`` first and ``name`` second, so each
    value was printed under the wrong label. Keep the argument order in
    sync with the labels.
    """
    return '%s(path="%s", name="%s")' % (
        self.__class__.__name__,
        self.path,
        self.name,
    )


class ParsedItem(dict):
"""Each of the individual items returned by the spider to the pipeline.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
"core": true,
"documents": [
{
"url": "/tmp/file_urls/full/85f78f549bd45b34999bc72353d982127043c341.pdf",
"url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf",
"fulltext": true,
"key": "document"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
"core": true,
"documents": [
{
"url": "/tmp/file_urls/full/395353fad4fc8ce1c37afe9f019e1473917747e9.pdf",
"url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf",
"fulltext": true,
"key": "document"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@
<subfield code="u">DESY</subfield>
</datafield>
<datafield tag="FFT" ind1=" " ind2=" ">
<subfield code="a">FFT/desy-thesis-17-036.title.pdf</subfield>
<subfield code="a">http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="d">Fulltext</subfield>
<subfield code="t">INSPIRE-PUBLIC</subfield>
</datafield>
Expand Down

0 comments on commit 903c5ff

Please sign in to comment.