global: minimal adaptation to documents

Signed-off-by: David Caro <[email protected]>
inspirehep · Oct 28, 2017 · 8cd9ca1 · 8cd9ca1
1 parent 0d7ee20
commit 8cd9ca1
Show file tree

Hide file tree

Showing 18 changed files with 78 additions and 58 deletions.
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
@@ -312,7 +312,7 @@ def build_item_rich(self, response):
             # NOTE: maybe this should be removed as the 'rich' format records
             # are not open access.
             record.add_value(
-                "additional_files",
+                "documents",
                 self._create_file(
                     get_first(response.meta["pdf_links"]),
                     "INSPIRE-PUBLIC",
@@ -384,7 +384,7 @@ def build_item_jats(self, response):
 
         if "pdf_links" in response.meta:
             record.add_value(
-                "additional_files",
+                "documents",
                 self._create_file(
                     get_first(response.meta["pdf_links"]),
                     "INSPIRE-PUBLIC",

diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
@@ -995,7 +995,10 @@ def build_item(self, response):
 
         xml_file = response.meta.get("xml_url")
         if xml_file:
-            record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(xml_file, "HIDDEN", "Fulltext"),
+            )
             sd_url = self._get_sd_url(xml_file)
             if requests.head(sd_url).status_code == 200:  # Test if valid url
                 record.add_value("urls", sd_url)

diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py
@@ -154,13 +154,13 @@ def get_journal_pages(node):
         else:
             return journal_pages, ''
 
-    def create_file(self, file_path, file_access, file_type):
-        """Create a structured dictionary to add to 'files' item."""
+    def create_document(self, file_path):
+        """Create a structured dictionary to add to 'documents' item."""
         file_dict = {
-            "access": file_access,
+            "hidden": True,
             "description": self.name.upper(),
             "url": file_path,
-            "type": file_type,
+            "fulltext": True,
         }
         return file_dict
 
@@ -219,9 +219,9 @@ def parse_node(self, response, node):
         record.add_value('file_urls', pdf_links)
         if xml_links:
             record.add_value(
-                'additional_files',
+                'documents',
                 [
-                    self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext")
+                    self.create_document(xml)
                     for xml in xml_links
                 ]
             )

diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py
@@ -232,7 +232,10 @@ def build_item(self, response):
 
         pdf_files = response.meta.get("pdf_links")
         if pdf_files:
-            record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+            )
         record.add_value('authors', response.meta.get("authors"))
         record.add_value('date_published', response.meta.get("date_published"))
         record.add_value('thesis', response.meta.get("thesis_info"))

diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py
@@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage):
             if pattern in pdf_path:
                 return os.path.join(self.pdf_files, pdf_path)
 
-    def add_file(self, file_path, file_access, file_type):
+    def add_document(self, file_path, hidden, fulltext):
         """Create a structured dictionary and add to 'files' item."""
         file_dict = {
-            "access": file_access,
+            "hidden": hidden,
+            "fulltext": fulltext,
             "description": self.name.upper(),
             "url": file_path,
-            "type": file_type,
         }
         return file_dict
 
@@ -206,21 +206,25 @@ def parse_node(self, response, node):
         record.add_value('collections', self.get_collections(doctype))
 
         xml_file_path = response.url
-        record.add_value("additional_files",
-                         self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext"))
+        record.add_value(
+            "documents",
+            self.add_document(xml_file_path, hidden=True, fulltext=True),
+        )
         if self.pdf_files:
             pdf_file_path = self.get_pdf_path(volume, issue, fpage)
             if pdf_file_path:
                 if doctype and "erratum" in doctype.lower():
-                    file_type = "Erratum"
+                    fulltext = False
                 else:
-                    file_type = "Fulltext"
+                    fulltext = True
                 if journal_title in self.OPEN_ACCESS_JOURNALS:
-                    file_access = "INSPIRE-PUBLIC"  # FIXME: right?
+                    hidden = False
                 else:
-                    file_access = "INSPIRE-HIDDEN"
-                record.add_value("additional_files",
-                                 self.add_file(pdf_file_path, file_access, file_type))
+                    hidden = True
+                record.add_value(
+                    "documents",
+                    self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),
+                )
 
         parsed_item = ParsedItem(
             record=record.load_item(),

diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
@@ -177,7 +177,7 @@ def build_item(self, response):
         record.add_value('title', response.meta.get("title"))
         record.add_value('urls', response.meta.get("urls"))
         record.add_value("abstract", response.meta.get("abstract"))
-        record.add_value("additional_files", response.meta.get("files"))
+        record.add_value("documents", response.meta.get("files"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
         parsed_item = ParsedItem(

diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
@@ -207,8 +207,10 @@ def build_item(self, response):
 
         pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
         if pdf_files:
-            record.add_value('additional_files', self.add_file(
-                pdf_files, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+            )
         record.add_value('authors', self.get_authors(node))
         record.add_xpath('date_published',
                          "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")

diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py
@@ -121,7 +121,10 @@ def parse_node(self, response, node):
             return None
 
         pdf_files = node.xpath(".//a/@href").extract()
-        record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
+        record.add_value(
+            'documents',
+            self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+        )
         record.add_value('authors', self.get_authors(node))
         record.add_value('date_published', year)
         record.add_value('thesis', {'degree_type': thesis_type})

diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py
@@ -101,16 +101,16 @@ def get_splash_links(self, node):
 
         return out_links
 
-    def add_file(self, pdf_files, file_access, file_type):
+    def add_document(self, pdf_files):
         """Create a structured dictionary and add to ``files`` item."""
         # NOTE: should this be moved to utils?
         file_dicts = []
         for link in pdf_files:
             file_dict = {
-                "access": file_access,
+                "hidden": True,
+                "fulltext": True,
                 "description": self.name.title(),
                 "url": urljoin(self.domain, link),
-                "type": file_type,
             }
             file_dicts.append(file_dict)
         return file_dicts
@@ -149,7 +149,7 @@ def scrape_for_pdf(self, response):
             "//a[@class='contenttype-file state-internal url']/@href").extract()
 
         response.meta["abstract"] = abstract
-        response.meta["additional_files"] = self.add_file(file_paths, "HIDDEN", "Fulltext")
+        response.meta["documents"] = self.add_document(file_paths)
 
         return self.build_item(response)
 
@@ -165,7 +165,7 @@ def build_item(self, response):
         record.add_value('title', response.meta.get("title"))
         record.add_value('urls', response.meta.get("urls"))
         record.add_value("abstract", response.meta.get("abstract"))
-        record.add_value("additional_files", response.meta.get("additional_files"))
+        record.add_value("documents", response.meta.get("documents"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
         parsed_item = ParsedItem(

diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
@@ -389,7 +389,6 @@ def test_no_dois_jats():
     record = parsed_item.record
 
     assert "dois" not in record
-    assert "additional_files" not in record
     assert isinstance(record, HEPRecord)
 
 
@@ -413,7 +412,6 @@ def test_no_dois_rich():
     record = parsed_item.record
 
     assert "dois" not in record
-    assert "additional_files" not in record
     assert isinstance(record, HEPRecord)
 
 

diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
@@ -326,8 +326,8 @@ def test_authors(record):
 
 def test_files(record):
     """Test file urls."""
-    assert record["additional_files"]
-    assert record["additional_files"][0]['url'] == "elsevier/sample_consyn_record.xml"
+    assert record["documents"]
+    assert record["documents"][0]['url'] == "elsevier/sample_consyn_record.xml"
 
 
 def test_dois(record):

diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py
@@ -96,9 +96,9 @@ def test_urls(record):
 def test_additional_files(record):
     """Test additional files."""
     url = "http://downloads.hindawi.com/journals/aa/2010/194946.xml"
-    assert "additional_files" in record
-    assert record["additional_files"][0]["url"] == url
-    assert record["additional_files"][0]["access"] == "INSPIRE-HIDDEN"
+    assert "documents" in record
+    assert record["documents"][0]["url"] == url
+    assert record["documents"][0]["hidden"]
 
 
 def test_collections(record):

diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py
@@ -83,8 +83,10 @@ def test_date_published(record):
 
 def test_files(record):
     """Test pdf files."""
-    assert record["additional_files"][0][
-        "url"] == "http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf"
+    assert record["documents"][0]["url"] == (
+        "http://www.infn.it/thesis/PDF/getfile.php"
+        "?filename=10136-Fedon-dottorato.pdf"
+    )
 
 
 def test_thesis(record):

diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py
@@ -154,10 +154,10 @@ def test_files(record):
     """Test files dictionary."""
     pdf_filename = "test_143_3_336.pdf"
 
-    assert "additional_files" in record
-    assert record["additional_files"][1]["access"] == 'INSPIRE-HIDDEN'
-    assert record["additional_files"][1]["type"] == 'Fulltext'
-    assert record["additional_files"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)
+    assert "documents" in record
+    assert record["documents"][1]["hidden"]
+    assert record["documents"][1]["fulltext"]
+    assert record["documents"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)
 
 
 @pytest.fixture
@@ -196,13 +196,12 @@ def erratum_open_access_record():
 def test_files_erratum_open_access_record(erratum_open_access_record):
     """Test files dict with open access journal with erratum article."""
     pdf_filename = "test_143_3_336.pdf"
-    assert "additional_files" in erratum_open_access_record
-    assert erratum_open_access_record["additional_files"][
-        1]["access"] == 'INSPIRE-PUBLIC'
-    assert erratum_open_access_record[
-        "additional_files"][1]["type"] == 'Erratum'
-    assert erratum_open_access_record["additional_files"][
-        1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)
+    assert "documents" in erratum_open_access_record
+    assert not erratum_open_access_record["documents"][1]["hidden"]
+    assert not erratum_open_access_record["documents"][1]["fulltext"]
+    assert erratum_open_access_record["documents"][1]["url"] == (
+        os.path.join(TEST_PDF_DIR, pdf_filename)
+    )
 
 
 def test_not_published_record():

diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py
@@ -145,8 +145,8 @@ def test_url(record):
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf"
-    assert 'additional_files' in record
-    assert record['additional_files'][1]['url'] == files
+    assert 'documents' in record
+    assert record['documents'][1]['url'] == files
 
 
 def test_no_author_no_date_no_url():

diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py
@@ -106,7 +106,10 @@ def test_date_published(record):
 
 def test_files(record):
     """Test pdf files."""
-    assert record["additional_files"][0]["url"] == "http://dspace.mit.edu/bitstream/handle/1721.1/99287/922886248-MIT.pdf?sequence=1"
+    assert record["documents"][0]["url"] == (
+        "http://dspace.mit.edu/bitstream/handle/1721.1/99287/"
+        "922886248-MIT.pdf?sequence=1"
+    )
 
 
 def test_thesis(record):

diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py
@@ -91,6 +91,9 @@ def test_authors(record):
 
 def test_pdf_link(record):
     """Test pdf link(s)"""
-    files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf"
-    assert 'additional_files' in record
-    assert record['additional_files'][0]['url'] == files
+    files = (
+        "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/"
+        "Guragain_Hari-DISSERTATION.pdf"
+    )
+    assert 'documents' in record
+    assert record['documents'][0]['url'] == files
diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py
@@ -113,8 +113,8 @@ def test_url(record):
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://www.t2k.org/docs/thesis/001/IJT-THESIS"
-    assert 'additional_files' in record
-    assert record['additional_files'][0]['url'] == files
+    assert 'documents' in record
+    assert record['documents'][0]['url'] == files
 
 
 @pytest.fixture