WIP_per_coding
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 4, 2017
1 parent 066864a commit da9acb4
Showing 14 changed files with 287 additions and 205 deletions.
87 changes: 48 additions & 39 deletions hepcrawl/crawler2hep.py
@@ -20,22 +20,34 @@

 from inspire_schemas.api import LiteratureBuilder
 
-from hepcrawl.utils import get_file_name_from_url
-
-
-def _update_record_fft(record, index_fft_file_paths):
-    def _update_fft_fields(fft_fields, index_fft_file_paths):
-        new_fft_fields = []
-        for fft_field in fft_fields:
-            file_name = get_file_name_from_url(fft_field['path'])
-            if file_name in index_fft_file_paths:
-                fft_field['path'] = index_fft_file_paths[file_name]
-            new_fft_fields.append(fft_field)
-
-        return new_fft_fields
-
-    record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
-    return record
+def _get_updated_fft_fields(current_fft_fields, record_files):
+    """
+    Params:
+        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``.
+            We expect each of them to have, at least, a key named ``path``.
+        record_files(list(RecordFile)): files attached to the record as populated by
+            ``FftFilesPipeline``.
+    """
+    record_files_index = {
+        record_file.name: record_file.path
+        for record_file in record_files
+    }
+    new_fft_fields = []
+    import logging
+    logger = logging.getLogger(__name__)
+    logger.log(logging.INFO,
+               "-------------------- _get_updated_fft_fields -------------------")
+    logger.log(logging.INFO,
+               'current_fft_fields: {}'.format(current_fft_fields))
+    for fft_field in current_fft_fields:
+        file_name = os.path.basename(fft_field['path'])
+        if file_name in record_files_index:
+            fft_field['path'] = record_files_index[file_name]
+        new_fft_fields.append(fft_field)
 
     return new_fft_fields
 
 
 def _has_publication_info(item):
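
A minimal sketch of the remapping that ``_get_updated_fft_fields`` performs, runnable on its own and not part of the commit. ``RecordFile`` below is a hypothetical stand-in for the class imported from ``hepcrawl.utils`` (only its ``name`` and ``path`` attributes are assumed), and all paths are invented:

    import os
    from collections import namedtuple

    # Hypothetical stand-in for hepcrawl.utils.RecordFile (not shown in this diff).
    RecordFile = namedtuple('RecordFile', ['path', 'name'])

    record_files = [
        RecordFile(path='/data/full/1234.pdf', name='1234.pdf'),
    ]
    current_fft_fields = [
        {'path': 'ftp://ftp.example.org/pub/1234.pdf', 'type': 'INSPIRE-PUBLIC'},
    ]

    # Same logic as _get_updated_fft_fields: index the downloaded files by
    # name, then rewrite each fft field whose basename has a local copy.
    index = {rf.name: rf.path for rf in record_files}
    for fft_field in current_fft_fields:
        file_name = os.path.basename(fft_field['path'])
        if file_name in index:
            fft_field['path'] = index[file_name]

    print(current_fft_fields)
    # [{'path': '/data/full/1234.pdf', 'type': 'INSPIRE-PUBLIC'}]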
@@ -116,50 +128,47 @@ def _normalize_hepcrawl_record(item, source):
     return item
 
 
-def _generate_acquisition_source(crawler_record, source):
-    crawler_record['acquisition_source'] = {
+def _generate_acquisition_source(source):
+    acquisition_source = {
         'source': source,
         'method': 'hepcrawl',
         'datetime': datetime.datetime.now().isoformat(),
         'submission_number': os.environ.get('SCRAPY_JOB', ''),
     }
-    return crawler_record
+    return acquisition_source
 
 
-def to_hep(
-    item,
-    source,
-    item_format='hepcrawl',
-    fft_file_paths=None,
-):
-    item = _generate_acquisition_source(
-        crawler_record=item,
-        source=source,
-    )
+def item_to_hep(
+    item,
+    source,
+):
+    item.record['acquisition_source'] = _generate_acquisition_source(source=source)
 
-    if item_format == 'hep':
-        return hep2hep(
-            crawler_record=item,
-            fft_file_paths=fft_file_paths,
+    if item.record_format == 'hep':
+        return hep_to_hep(
+            hep_record=item.record,
+            record_files=item.record_files,
         )
-    elif item_format == 'hepcrawl':
+    elif item.record_format == 'hepcrawl':
         item = _normalize_hepcrawl_record(
             item=item,
             source=source,
         )
-        return crawler2hep(dict(item))
+        return hepcrawl_to_hep(dict(item))
     else:
-        raise Exception('Unknown item_format::{}'.format(item_format))
+        raise Exception('Unknown item_format::{}'.format(item.record_format))
 
 
-def hep2hep(crawler_record, fft_file_paths):
-    if fft_file_paths:
-        crawler_record = _update_record_fft(crawler_record, fft_file_paths)
+def hep_to_hep(hep_record, record_files):
+    hep_record['_fft'] = _get_updated_fft_fields(
+        current_fft_fields=hep_record['_fft'],
+        record_files=record_files,
+    )
 
-    return crawler_record
+    return hep_record
 
 
-def crawler2hep(crawler_record):
+def hepcrawl_to_hep(crawler_record):
 
     def _filter_affiliation(affiliations):
         return [
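A usage sketch for the renamed entry point, under stated assumptions: the spider's item class is not shown in this diff, so ``ParsedItem`` below is a namedtuple stand-in exposing only the three attributes ``item_to_hep`` reads (``record``, ``record_format``, ``record_files``); the record contents and ``source`` value are invented.

    from collections import namedtuple

    from hepcrawl.crawler2hep import item_to_hep

    # Stand-in for the spider's item; only these attributes are known from the diff.
    ParsedItem = namedtuple('ParsedItem', ['record', 'record_format', 'record_files'])

    item = ParsedItem(
        record={'_fft': [], 'titles': [{'title': 'A WIP record'}]},
        record_format='hep',
        record_files=[],
    )

    hep_record = item_to_hep(item=item, source='arxiv')
    assert hep_record['acquisition_source']['source'] == 'arxiv'
    assert hep_record['acquisition_source']['method'] == 'hepcrawl'
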
55 changes: 29 additions & 26 deletions hepcrawl/pipelines.py
@@ -24,16 +24,23 @@

 from inspire_schemas.utils import validate
 
-from hepcrawl.crawler2hep import to_hep
+from hepcrawl.crawler2hep import item_to_hep
 from hepcrawl.settings import FILES_STORE
-from hepcrawl.utils import get_file_name_from_url
+from hepcrawl.utils import RecordFile
 
 
 class FftFilesPipeline(FilesPipeline):
-    """Download all the FFT files provided by record."""
+    """Download all the FFT files provided by record.
+
+    Note:
+        This pipeline only runs if the spider returns a ``ParsedItem`` that has
+        a ``file_urls`` property.
+    """
 
-    def __init__(self, *args, **kwargs):
-        super(FftFilesPipeline, self).__init__(FILES_STORE)
+    def __init__(self, store_uri, *args, **kwargs):
+        store_uri = store_uri or FILES_STORE
+        super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs)
 
     def get_media_requests(self, item, info):
         """Download FFT files using FTP."""
@@ -44,24 +51,25 @@ def get_media_requests(self, item, info):
             meta=item.ftp_params,
         )
 
+    def get_absolute_file_path(self, path):
+        return os.path.abspath(
+            os.path.join(
+                self.store.basedir,
+                path
+            )
+        )
+
     def item_completed(self, results, item, info):
         """Create a map that connects file names with downloaded files."""
-        def _get_absolute_local_file_path(path):
-            return os.path.abspath(
-                os.path.join(
-                    FILES_STORE,
-                    path
-                )
-            )
-
-        map_file_names_paths = {}
-        for ok, result_data in results:
-            if ok:
-                map_file_names_paths[
-                    get_file_name_from_url(result_data['url'])
-                ] = _get_absolute_local_file_path(result_data['path'])
-
-        item.file_paths = map_file_names_paths
+        record_files = [
+            RecordFile(
+                path=self.get_absolute_file_path(result_data['path']),
+                name=os.path.basename(result_data['url']),
+            )
+            for ok, result_data in results
+            if ok
+        ]
+        item.record_files = record_files
 
         return item

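The list comprehension above reads more easily with the shape of ``results`` in mind: Scrapy's ``FilesPipeline.item_completed`` receives a list of ``(success, info)`` pairs, where ``info`` is a dict carrying ``url``, ``path`` (relative to the files store), and ``checksum`` for each successful download. A sketch with invented values:

    import os

    # Invented sample of what Scrapy passes to item_completed; the second
    # element of a failed pair is an error object, dropped by the `if ok` filter.
    results = [
        (True, {'url': 'ftp://ftp.example.org/pub/1234.pdf',
                'path': 'full/abc123.pdf',
                'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}),
        (False, Exception('download failed')),
    ]

    record_files = [
        (os.path.basename(info['url']), info['path'])  # (name, store-relative path)
        for ok, info in results
        if ok
    ]
    print(record_files)  # [('1234.pdf', 'full/abc123.pdf')]
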
@@ -76,16 +84,11 @@ def open_spider(self, spider):
         self.results_data = []
 
     def _post_enhance_item(self, item, spider):
-        fft_file_paths = item.file_paths
-        item_format = item.item_format
-        item = item.item if item.item else item
         source = spider.name
 
-        return to_hep(
+        return item_to_hep(
             item=item,
             source=source,
-            item_format=item_format,
-            fft_file_paths=fft_file_paths,
         )
 
     def process_item(self, item, spider):
(diff for the remaining 12 changed files is not shown)
