WIP for desy spider
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 2, 2017
1 parent fa74f59 commit 066864a
Showing 64 changed files with 2,630 additions and 250 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -28,6 +28,7 @@ env:
- SUITE=unit
- SUITE=functional_wsp
- SUITE=functional_arxiv
- SUITE=functional_desy

matrix:
fast_finish: true
11 changes: 11 additions & 0 deletions docker-compose.test.yml
@@ -17,6 +17,7 @@ services:
- APP_CELERY_RESULT_BACKEND=amqp://guest:guest@rabbitmq:5672//
- APP_CRAWLER_HOST_URL=http://scrapyd:6800
- APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results
- APP_FILES_STORE=/tmp/file_urls
- COVERAGE_PROCESS_START=/code/.coveragerc
- BASE_USER_UID=${BASE_USER_UID:-1000}
- BASE_USER_GIT=${BASE_USER_GIT:-1000}
@@ -26,6 +27,7 @@ services:
- ${PWD}:/code/
- ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
- /tmp/WSP:/tmp/WSP
- /tmp/file_urls:/tmp/file_urls

functional_wsp:
<<: *service_base
@@ -34,6 +36,13 @@ services:
- scrapyd
- ftp_server

functional_desy:
<<: *service_base
command: py.test -vv tests/functional/desy
links:
- scrapyd
- ftp_server

functional_arxiv:
<<: *service_base
command: py.test -vv tests/functional/arxiv
@@ -68,6 +77,8 @@ services:
environment:
- PUBLICHOST=localhost
volumes:
- ${PWD}/tests/functional/desy/fixtures/ftp_server/FFT:/home/ftpusers/bob/FFT
- ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd

143 changes: 142 additions & 1 deletion hepcrawl/crawler2hep.py
@@ -15,8 +15,149 @@

from __future__ import absolute_import, division, print_function

import os
import datetime

from inspire_schemas.api import LiteratureBuilder

from hepcrawl.utils import get_file_name_from_url


def _update_record_fft(record, index_fft_file_paths):
def _update_fft_fields(fft_fields, index_fft_file_paths):
new_fft_fields = []
for fft_field in fft_fields:
file_name = get_file_name_from_url(fft_field['path'])
if file_name in index_fft_file_paths:
fft_field['path'] = index_fft_file_paths[file_name]
new_fft_fields.append(fft_field)

return new_fft_fields

record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
return record


def _has_publication_info(item):
"""If any publication info."""
return item.get('pubinfo_freetext') or item.get('journal_volume') or \
item.get('journal_title') or \
item.get('journal_year') or \
item.get('journal_issue') or \
item.get('journal_fpage') or \
item.get('journal_lpage') or \
item.get('journal_artid') or \
item.get('journal_doctype')


def _filter_fields(item, keys):
"""Filter away keys."""
for key in keys:
item.pop(key, None)


def _normalize_hepcrawl_record(item, source):
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

item['titles'] = [{
'title': item.pop('title', ''),
'subtitle': item.pop('subtitle', ''),
'source': source,
}]

item['abstracts'] = [{
'value': item.pop('abstract', ''),
'source': source,
}]

item['imprints'] = [{
'date': item.pop('date_published', ''),
}]

item['copyright'] = [{
'holder': item.pop('copyright_holder', ''),
'year': item.pop('copyright_year', ''),
'statement': item.pop('copyright_statement', ''),
'material': item.pop('copyright_material', ''),
}]

if _has_publication_info(item):
item['publication_info'] = [{
'journal_title': item.pop('journal_title', ''),
'journal_volume': item.pop('journal_volume', ''),
'journal_issue': item.pop('journal_issue', ''),
'artid': item.pop('journal_artid', ''),
'page_start': item.pop('journal_fpage', ''),
'page_end': item.pop('journal_lpage', ''),
'note': item.pop('journal_doctype', ''),
'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
'pubinfo_material': item.pop('pubinfo_material', ''),
}]
if item.get('journal_year'):
item['publication_info'][0]['year'] = int(
item.pop('journal_year')
)

# Remove any fields
_filter_fields(item, [
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])

return item


def _generate_acquisition_source(crawler_record, source):
crawler_record['acquisition_source'] = {
'source': source,
'method': 'hepcrawl',
'datetime': datetime.datetime.now().isoformat(),
'submission_number': os.environ.get('SCRAPY_JOB', ''),
}
return crawler_record


def to_hep(
item,
source,
item_format='hepcrawl',
fft_file_paths=None,
):
item = _generate_acquisition_source(
crawler_record=item,
source=source,
)

if item_format == 'hep':
return hep2hep(
crawler_record=item,
fft_file_paths=fft_file_paths,
)
elif item_format == 'hepcrawl':
item = _normalize_hepcrawl_record(
item=item,
source=source,
)
return crawler2hep(dict(item))
else:
raise Exception('Unknown item_format::{}'.format(item_format))


def hep2hep(crawler_record, fft_file_paths):
if fft_file_paths:
crawler_record = _update_record_fft(crawler_record, fft_file_paths)

return crawler_record


def crawler2hep(crawler_record):

@@ -98,7 +239,7 @@ def _filter_affiliation(affiliations):
acquisition_source = crawler_record.get('acquisition_source', {})
builder.add_acquisition_source(
method=acquisition_source['method'],
-        date=acquisition_source['date'],
+        date=acquisition_source['datetime'],
source=acquisition_source['source'],
submission_number=acquisition_source['submission_number'],
)
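
For orientation, a minimal sketch of how the new to_hep entry point and the FFT path rewriting above fit together. The record, DOI, file name and local paths below are invented for illustration; only the function names, the item_format values and the '_fft'/'path' structure come from the diff itself.

from hepcrawl.crawler2hep import to_hep

# A record already in the final HEP format, as a spider could yield it
# (made-up DOI and file name).
hep_record = {
    'dois': [{'value': '10.1234/example.doi'}],
    '_fft': [{'path': 'ftp://ftp.example.org/DESY/FFT/desy_record_1.xml'}],
}

# Map produced by the files pipeline: file name -> downloaded local path
# (assuming a FILES_STORE of /tmp/file_urls, as in docker-compose.test.yml).
downloaded = {'desy_record_1.xml': '/tmp/file_urls/full/desy_record_1.xml'}

record = to_hep(
    item=hep_record,
    source='desy',
    item_format='hep',
    fft_file_paths=downloaded,
)
# to_hep adds acquisition_source and, through hep2hep/_update_record_fft,
# rewrites record['_fft'][0]['path'] to point at the downloaded local copy.
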
142 changes: 62 additions & 80 deletions hepcrawl/pipelines.py
@@ -15,30 +15,55 @@

from __future__ import absolute_import, division, print_function

import datetime
import os

import requests

from .crawler2hep import crawler2hep
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

from inspire_schemas.utils import validate

def has_publication_info(item):
"""If any publication info."""
return item.get('pubinfo_freetext') or item.get('journal_volume') or \
item.get('journal_title') or \
item.get('journal_year') or \
item.get('journal_issue') or \
item.get('journal_fpage') or \
item.get('journal_lpage') or \
item.get('journal_artid') or \
item.get('journal_doctype')
from hepcrawl.crawler2hep import to_hep
from hepcrawl.settings import FILES_STORE
from hepcrawl.utils import get_file_name_from_url


def filter_fields(item, keys):
"""Filter away keys."""
for key in keys:
item.pop(key, None)
class FftFilesPipeline(FilesPipeline):
"""Download all the FFT files provided by record."""

def __init__(self, *args, **kwargs):
super(FftFilesPipeline, self).__init__(FILES_STORE)

def get_media_requests(self, item, info):
"""Download FFT files using FTP."""
if item.get('file_urls'):
for fft_url in item.file_urls:
yield Request(
url=fft_url,
meta=item.ftp_params,
)

def item_completed(self, results, item, info):
"""Create a map that connects file names with downloaded files."""
def _get_absolute_local_file_path(path):
return os.path.abspath(
os.path.join(
FILES_STORE,
path
)
)

map_file_names_paths = {}
for ok, result_data in results:
if ok:
map_file_names_paths[
get_file_name_from_url(result_data['url'])
] = _get_absolute_local_file_path(result_data['path'])

item.file_paths = map_file_names_paths

return item


class InspireAPIPushPipeline(object):
@@ -50,74 +75,31 @@ def __init__(self):
def open_spider(self, spider):
self.results_data = []

def _post_enhance_item(self, item, spider):
fft_file_paths = item.file_paths
item_format = item.item_format
item = item.item if item.item else item
source = spider.name

return to_hep(
item=item,
source=source,
item_format=item_format,
fft_file_paths=fft_file_paths,
)

def process_item(self, item, spider):
"""Convert internal format to INSPIRE data model."""
self.count += 1
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

source = spider.name
item['acquisition_source'] = {
'source': source,
'method': 'hepcrawl',
'date': datetime.datetime.now().isoformat(),
'submission_number': os.environ.get('SCRAPY_JOB', ''),
}

item['titles'] = [{
'title': item.pop('title', ''),
'subtitle': item.pop('subtitle', ''),
'source': source,
}]
item['abstracts'] = [{
'value': item.pop('abstract', ''),
'source': source,
}]
item['imprints'] = [{
'date': item.pop('date_published', ''),
}]
item['copyright'] = [{
'holder': item.pop('copyright_holder', ''),
'year': item.pop('copyright_year', ''),
'statement': item.pop('copyright_statement', ''),
'material': item.pop('copyright_material', ''),
}]
if not item.get('publication_info'):
if has_publication_info(item):
item['publication_info'] = [{
'journal_title': item.pop('journal_title', ''),
'journal_volume': item.pop('journal_volume', ''),
'journal_issue': item.pop('journal_issue', ''),
'artid': item.pop('journal_artid', ''),
'page_start': item.pop('journal_fpage', ''),
'page_end': item.pop('journal_lpage', ''),
'note': item.pop('journal_doctype', ''),
'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
'pubinfo_material': item.pop('pubinfo_material', ''),
}]
if item.get('journal_year'):
item['publication_info'][0]['year'] = int(
item.pop('journal_year')
)

# Remove any fields
filter_fields(item, [
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])

item = crawler2hep(dict(item))
spider.logger.debug('Validated item.')
self.results_data.append(item)
return item
hep_item = self._post_enhance_item(item, spider)

validate(hep_item, 'hep')
spider.logger.debug('Validated item by Inspire Schemas.')

self.results_data.append(hep_item)

return hep_item

def _prepare_payload(self, spider):
"""Return payload for push."""
2 changes: 1 addition & 1 deletion hepcrawl/settings.py
@@ -85,7 +85,7 @@
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'scrapy.pipelines.files.FilesPipeline': 1,
+    'hepcrawl.pipelines.FftFilesPipeline': 1,
'hepcrawl.pipelines.InspireCeleryPushPipeline': 300,
}
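
A note on the ordering these settings rely on: Scrapy runs item pipelines in ascending priority order, so the files pipeline runs before the push pipeline. The snippet below restates the dict from the diff with that reasoning spelled out in comments.

ITEM_PIPELINES = {
    # priority 1: download the FFT files first and attach their local paths
    'hepcrawl.pipelines.FftFilesPipeline': 1,
    # priority 300: then convert the item to the HEP format, validate and push it
    'hepcrawl.pipelines.InspireCeleryPushPipeline': 300,
}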
