pos: fix spider
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 22, 2017
1 parent 4978cf2 commit 8320ade
Showing 6 changed files with 108 additions and 55 deletions.
6 changes: 6 additions & 0 deletions hepcrawl/crawler2hep.py
@@ -339,6 +339,12 @@ def _filter_affiliation(affiliations):
             source=report_number.get('source')
         )
 
+    for url in crawler_record.get('urls', []):
+        builder.add_url(url=url.get('value'))
+
+    if crawler_record.get('_fft'):
+        builder.record['_fft'] = crawler_record.get('_fft')
+
     builder.validate_record()
 
     return builder.record
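
Note: for readers unfamiliar with hepcrawl's record builder, here is a standalone sketch of what the two new blocks do, with plain dicts standing in for the builder (the function name and sample values below are invented for illustration):

    # Hypothetical sketch of the pass-through above, using plain dicts
    # instead of hepcrawl's real record builder.
    def to_hep_sketch(crawler_record):
        record = {}

        # Each crawled URL value becomes a record URL.
        record['urls'] = [
            {'value': url.get('value')}
            for url in crawler_record.get('urls', [])
        ]

        # '_fft' entries are copied through untouched, so legacy learns
        # which files should be attached to the record.
        if crawler_record.get('_fft'):
            record['_fft'] = crawler_record.get('_fft')

        return record

    print(to_hep_sketch({
        'urls': [{'value': 'https://server.local/PoS(LATTICE%202013)001.html'}],
        '_fft': [{'path': 'https://server.local/187/001/pdf'}],
    }))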
4 changes: 4 additions & 0 deletions hepcrawl/items.py
@@ -318,3 +318,7 @@ class HEPRecord(scrapy.Item):
 
     thesis_supervisor = scrapy.Field()
     language = scrapy.Field()
+
+    _fft = scrapy.Field()
+    """Used to communicate with legacy about files (to be) attached to the
+    record."""
63 changes: 39 additions & 24 deletions hepcrawl/spiders/pos_spider.py
@@ -62,14 +62,13 @@ class POSSpider(Spider):
     To do that and because each needs the information of the previous, the
     spider must use the callbacks system provided by scrapy through the
-    :ref:`scrapy.html.response.Response` callback parameter, and chain the
+    :class:`scrapy.html.response.Response` callback parameter, and chain the
     parser functions.
     The deduplication of the conference proceedings papers is left for the
     `HepcrawlCrawlOnceMiddleware` middleware.
     Example:
         ::
            $ scrapy crawl PoS \\
            -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml"
     """
@@ -94,24 +93,26 @@ def parse(self, response):
         self.log('Got record from: {response.url}'.format(**vars()))
 
         response.selector.remove_namespaces()
-        records = response.selector.xpath('.//record')
-        for record in records:
-            yield self.get_conference_paper_page_request(raw_xml=record)
+        record_xml_selectors = response.selector.xpath('.//record')
+        for record_xml_selector in record_xml_selectors:
+            yield self.get_conference_paper_page_request(
+                xml_selector=record_xml_selector,
+            )
 
-    def get_conference_paper_page_request(self, raw_xml, meta=None):
+    def get_conference_paper_page_request(self, xml_selector, meta=None):
         """Gets the conference paper html page, for the pdf link for the
         conference paper, and later the internal conference id.
         """
         meta = meta or {}
 
-        identifier = raw_xml.xpath(
+        identifier = xml_selector.xpath(
             './/metadata/pex-dc/identifier/text()'
         ).extract_first()
         conference_paper_url = "{0}{1}".format(
             self.base_conference_paper_url,
             identifier,
         )
-        meta['xml_record'] = raw_xml
+        meta['xml_record'] = xml_selector.extract()
 
         # the meta parameter will be passed over to the callback as a property
         # in the response parameter
@@ -137,11 +138,11 @@ def parse_conference_paper(self, response):
 
         # prepare next callback step
         response.meta['html_record'] = response.body
-        yield self.get_conference_proceendings_page_request(
+        yield self.get_conference_proceedings_page_request(
             meta=response.meta,
         )
 
-    def get_conference_proceendings_page_request(self, meta):
+    def get_conference_proceedings_page_request(self, meta):
         """Gets the conference proceedings page, using the internal conference
         id from the record html page retrieved before.
         """
@@ -155,9 +156,10 @@ def get_conference_proceendings_page_request(self, meta):
         )
 
         page_selector = Selector(
-            text=meta.get('html_record'),
-            type='html',
+            text=meta.get('xml_record'),
+            type='xml',
         )
+        page_selector.remove_namespaces()
         pos_id = page_selector.xpath(
             ".//metadata/pex-dc/identifier/text()"
         ).extract_first()
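
Note: the switch from type='html' to type='xml' matters because `meta['xml_record']` now holds the extracted OAI record rather than the paper's HTML page, and namespaces must be stripped again on the fresh selector. A quick standalone check (the pex-dc fragment below is invented, shaped after the sample fixture):

    from scrapy.selector import Selector

    # Invented pex-dc fragment, shaped after the sample PoS record.
    xml_record = (
        '<record xmlns:pex-dc="http://pos.sissa.it/pex-dc/">'
        '<metadata><pex-dc:pex-dc>'
        '<pex-dc:identifier>PoS(LATTICE 2013)001</pex-dc:identifier>'
        '</pex-dc:pex-dc></metadata></record>'
    )

    page_selector = Selector(text=xml_record, type='xml')
    page_selector.remove_namespaces()
    print(page_selector.xpath(
        './/metadata/pex-dc/identifier/text()'
    ).extract_first())  # PoS(LATTICE 2013)001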
@@ -220,15 +222,15 @@ def build_conference_paper_item(
         ).extract_first()
         record.add_value(
             'journal_title',
-            self._get_journal_title(identifier=identifier),
+            self._get_journal_title(pos_ext_identifier=identifier),
         )
         record.add_value(
             'journal_volume',
-            self._get_journal_volume(identifier=identifier),
+            self._get_journal_volume(pos_ext_identifier=identifier),
         )
         record.add_value(
             'journal_artid',
-            self._get_journal_artid(identifier=identifier),
+            self._get_journal_artid(pos_ext_identifier=identifier),
         )
 
         record.add_xpath('title', '//metadata/pex-dc/title/text()')
@@ -240,8 +242,13 @@ def build_conference_paper_item(
         record.add_value('language', self._get_language(selector=selector))
         record.add_value('authors', self._get_authors(selector=selector))
         record.add_value('collections', ['conferencepaper'])
-        record.add_value('urls', conference_paper_pdf_url)
-        record.add_value('_fulltext_url', self._get_conference_paper_pdf_url())
+        record.add_value('urls', [conference_paper_url])
+        record.add_value(
+            '_fft',
+            self._set_fft(
+                path=conference_paper_pdf_url,
+            ),
+        )
 
         parsed_item = ParsedItem(
             record=record.load_item(),
@@ -277,7 +284,7 @@ def build_conference_proceedings_item(
         record.add_value('journal_title', 'PoS')
         record.add_value(
             'journal_volume',
-            self._get_journal_volume(pos_id=pos_id),
+            self._get_journal_volume(pos_ext_identifier=pos_id),
         )
 
         parsed_proceeding = ParsedItem(
@@ -309,6 +316,14 @@ def _get_conference_paper_pdf_url(self, conference_paper_page_html):
             conference_paper_pdf_relative_url,
         )
 
+    @staticmethod
+    def _set_fft(path):
+        return [
+            {
+                'path': path,
+            },
+        ]
+
     @staticmethod
     def _get_language(selector):
         language = selector.xpath(
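
Note: the new `_set_fft` helper just wraps the PDF link in the single-key list structure that legacy expects — the same shape as the `_fft` entry added to the JSON fixture below. Checked in isolation:

    def _set_fft(path):
        # Copy of the static helper above, runnable standalone.
        return [{'path': path}]

    print(_set_fft('https://server.local/187/001/pdf'))
    # -> [{'path': 'https://server.local/187/001/pdf'}]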
@@ -317,16 +332,16 @@ def _get_language(selector):
         return language if language != 'en' else None
 
     @staticmethod
-    def _get_journal_title(pos_id):
-        return re.split('[()]', pos_id)[0]
+    def _get_journal_title(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[0]
 
     @staticmethod
-    def _get_journal_volume(pos_id):
-        return re.split('[()]', pos_id)[1]
+    def _get_journal_volume(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[1]
 
     @staticmethod
-    def _get_journal_artid(pos_id):
-        return re.split('[()]', pos_id)[2]
+    def _get_journal_artid(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[2]
 
     @staticmethod
     def _get_ext_systems_number(selector):
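
Note: the rename to `pos_ext_identifier` makes the contract of these three getters clearer — they split a PoS identifier such as PoS(LATTICE 2013)001 (the form seen in the fixture below) on parentheses. A worked example:

    import re

    # 'PoS(LATTICE 2013)001' splits on '(' and ')' into three parts.
    parts = re.split('[()]', 'PoS(LATTICE 2013)001')
    print(parts)     # ['PoS', 'LATTICE 2013', '001']
    print(parts[0])  # journal_title  -> 'PoS'
    print(parts[1])  # journal_volume -> 'LATTICE 2013'
    print(parts[2])  # journal_artid  -> '001'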
(expected-record JSON fixture; file name not captured in this view)
@@ -41,6 +41,16 @@
             "title": "Heavy Flavour Physics Review"
         }
     ],
+    "_fft": [
+        {
+            "path": "https://server.local/187/001/pdf"
+        }
+    ],
+    "urls": [
+        {
+            "value": "https://server.local/PoS(LATTICE%202013)001.html"
+        }
+    ],
     "authors": [
         {
             "affiliations": [
24 changes: 15 additions & 9 deletions tests/functional/pos/test_pos.py
@@ -32,7 +32,13 @@ def override_generated_fields(record):
 
 
 @pytest.fixture(scope="function")
-def set_up_environment():
+def wait_until_services_are_up(seconds=10):
+    # The test must wait until the docker environment is up (takes about 10 seconds).
+    sleep(seconds)
+
+
+@pytest.fixture(scope="function")
+def configuration():
     package_location = get_test_suite_path(
         'pos',
         'fixtures',
@@ -41,9 +47,6 @@ def set_up_environment():
         test_suite='functional',
     )
 
-    # The test must wait until the docker environment is up (takes about 10 seconds).
-    sleep(10)
-
     yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
@@ -69,21 +72,22 @@
     ]
 )
 def test_pos_conference_paper_record_and_proceedings_record(
-    set_up_environment,
-    expected_results,
+    configuration,
+    wait_until_services_are_up,
+    expected_results,
 ):
-    crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
+    crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL'))
 
     results = CeleryMonitor.do_crawl(
         app=celery_app,
         monitor_timeout=5,
         monitor_iter_limit=100,
         events_limit=1,
         crawler_instance=crawler,
-        project=set_up_environment.get('CRAWLER_PROJECT'),
+        project=configuration.get('CRAWLER_PROJECT'),
         spider='pos',
         settings={},
-        **set_up_environment.get('CRAWLER_ARGUMENTS')
+        **configuration.get('CRAWLER_ARGUMENTS')
     )
 
     gotten_results = [override_generated_fields(result) for result in results]
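
Note: splitting the old `set_up_environment` into `configuration` and `wait_until_services_are_up` works because pytest resolves each test argument by fixture name. A minimal self-contained sketch of that wiring (the values are placeholders, not the real fixtures):

    import pytest

    @pytest.fixture(scope="function")
    def configuration():
        # Stand-in for the real fixture, which also resolves fixture paths.
        yield {'CRAWLER_HOST_URL': 'http://scrapyd:6800'}

    @pytest.fixture(scope="function")
    def wait_until_services_are_up():
        pass  # the real fixture sleeps while the docker services boot

    def test_example(configuration, wait_until_services_are_up):
        # pytest injects both fixtures before the test body runs.
        assert configuration['CRAWLER_HOST_URL'] == 'http://scrapyd:6800'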
@@ -93,6 +97,8 @@ def test_pos_conference_paper_record_and_proceedings_record(
 
 
 # TODO create test that receives conference paper record AND proceedings record.
+# 'Crawl-once' plug-in needed.
 
 
 # TODO create test that receives proceedings record ONLY.
+# 'Crawl-once' plug-in needed.