Merge pull request #160 from spirosdelviniotis/hepcrawl_pos_spider

pos: update pos spider
inspirehep · Nov 1, 2017 · 4f33b74 · 4f33b74
2 parents 746ec16 + 1dd708d
commit 4f33b74
Show file tree

Hide file tree

Showing 35 changed files with 1,282 additions and 247 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -30,6 +30,7 @@ env:
     - SUITE=functional_arxiv
     - SUITE=functional_desy
     - SUITE=functional_cds
+    - SUITE=functional_pos
 
 matrix:
   fast_finish: true

diff --git a/docker-compose.test.yml b/docker-compose.test.yml
@@ -33,29 +33,43 @@ services:
   functional_wsp:
     <<: *service_base
     command: py.test -vv tests/functional/wsp
-    links:
-      - scrapyd
-      - ftp_server
+    depends_on:
+      scrapyd:
+        condition: service_healthy
+      ftp_server:
+        condition: service_healthy
 
   functional_desy:
     <<: *service_base
     command: py.test -vv tests/functional/desy
-    links:
-      - scrapyd
-      - ftp_server
+    depends_on:
+      scrapyd:
+        condition: service_healthy
+      ftp_server:
+        condition: service_healthy
 
   functional_arxiv:
     <<: *service_base
     command: py.test -vv tests/functional/arxiv
-    links:
-      - scrapyd
+    depends_on:
+      scrapyd:
+        condition: service_healthy
 
   functional_cds:
     <<: *service_base
     command: py.test -vv tests/functional/cds
     links:
       - scrapyd
 
+  functional_pos:
+    <<: *service_base
+    command: py.test -vv tests/functional/pos
+    depends_on:
+      scrapyd:
+        condition: service_healthy
+      http-server.local:
+        condition: service_healthy
+
   unit:
     <<: *service_base
     command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*"
@@ -64,14 +78,16 @@ services:
   celery:
     <<: *service_base
     command: celery worker --events --app hepcrawl.testlib.tasks --loglevel=debug
-    links:
-      - rabbitmq
+    depends_on:
+      rabbitmq:
+        condition: service_healthy
 
   scrapyd:
     <<: *service_base
     command: bash -c "rm -f twistd.pid && exec scrapyd"
-    links:
-      - celery
+    depends_on:
+      celery:
+        condition: service_started
     healthcheck:
       timeout: 5s
       interval: 5s
@@ -83,8 +99,9 @@ services:
   scrapyd-deploy:
     <<: *service_base
     command: bash -c "scrapyd-deploy"
-    links:
-      - scrapyd
+    depends_on:
+      scrapyd:
+        condition: service_healthy
 
   ftp_server:
     image: stilliard/pure-ftpd:hardened
@@ -96,5 +113,29 @@ services:
       - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
       - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd
 
+  http-server.local:
+    image: nginx:stable-alpine
+    volumes:
+      - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
+      - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl
+      - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/
+    ports:
+      - 443:443
+    healthcheck:
+      timeout: 5s
+      interval: 5s
+      retries: 5
+      test:
+        - "CMD-SHELL"
+        - "curl https://localhost:443/"
+
   rabbitmq:
     image: rabbitmq
+    healthcheck:
+      timeout: 5s
+      interval: 5s
+      retries: 5
+      test:
+        - "CMD"
+        - "rabbitmqctl"
+        - "status"
diff --git a/hepcrawl/items.py b/hepcrawl/items.py
@@ -44,15 +44,17 @@ class HEPRecord(scrapy.Item):
     file_urls = scrapy.Field()
     """List of files to be downloaded with FilesPipeline and added to files."""
 
-    additional_files = scrapy.Field()
+    documents = scrapy.Field()
     """Files (fulltexts, package) belonging to this item.
 
     Example:
         ::
 
             [{
-                "type": "Fulltext",  # Fulltext, Supplemental, Data, Figure
-                "uri": "file:///path/to/file",  # can also be HTTP
+                "fulltext": True,
+                "url": "file:///path/to/file",
+                "description": "some fancy stuff",
+                "key": "usually_a_file_name.pdf",
             }]
     """
 

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
@@ -116,10 +116,23 @@ def _has_to_be_crawled(self, request, spider):
         request_db_key = self._get_key(request)
 
         if request_db_key not in self.db:
+            LOGGER.debug(
+                'Crawl-Once: key %s for request %s not found in the db, '
+                'should be crawled.' % (request_db_key, request)
+            )
             return True
 
         new_file_timestamp = self._get_timestamp(request, spider)
         old_file_timestamp = self.db.get(key=request_db_key)
+        LOGGER.debug(
+            'Crawl-Once: key %s for request %s found in the db, '
+            'considering timestamps new(%s) and old(%s).' % (
+                request_db_key,
+                request,
+                new_file_timestamp,
+                old_file_timestamp,
+            )
+        )
         return new_file_timestamp > old_file_timestamp
 
     def _get_key(self, request):

diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
@@ -312,7 +312,7 @@ def build_item_rich(self, response):
             # NOTE: maybe this should be removed as the 'rich' format records
             # are not open access.
             record.add_value(
-                "additional_files",
+                "documents",
                 self._create_file(
                     get_first(response.meta["pdf_links"]),
                     "INSPIRE-PUBLIC",
@@ -384,7 +384,7 @@ def build_item_jats(self, response):
 
         if "pdf_links" in response.meta:
             record.add_value(
-                "additional_files",
+                "documents",
                 self._create_file(
                     get_first(response.meta["pdf_links"]),
                     "INSPIRE-PUBLIC",

diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
@@ -995,7 +995,10 @@ def build_item(self, response):
 
         xml_file = response.meta.get("xml_url")
         if xml_file:
-            record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(xml_file, "HIDDEN", "Fulltext"),
+            )
             sd_url = self._get_sd_url(xml_file)
             if requests.head(sd_url).status_code == 200:  # Test if valid url
                 record.add_value("urls", sd_url)

diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py
@@ -154,13 +154,13 @@ def get_journal_pages(node):
         else:
             return journal_pages, ''
 
-    def create_file(self, file_path, file_access, file_type):
-        """Create a structured dictionary to add to 'files' item."""
+    def create_document(self, file_path):
+        """Create a structured dictionary to add to 'documents' item."""
         file_dict = {
-            "access": file_access,
+            "hidden": True,
             "description": self.name.upper(),
             "url": file_path,
-            "type": file_type,
+            "fulltext": True,
         }
         return file_dict
 
@@ -219,9 +219,9 @@ def parse_node(self, response, node):
         record.add_value('file_urls', pdf_links)
         if xml_links:
             record.add_value(
-                'additional_files',
+                'documents',
                 [
-                    self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext")
+                    self.create_document(xml)
                     for xml in xml_links
                 ]
             )

diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py
@@ -232,7 +232,10 @@ def build_item(self, response):
 
         pdf_files = response.meta.get("pdf_links")
         if pdf_files:
-            record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+            )
         record.add_value('authors', response.meta.get("authors"))
         record.add_value('date_published', response.meta.get("date_published"))
         record.add_value('thesis', response.meta.get("thesis_info"))

diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py
@@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage):
             if pattern in pdf_path:
                 return os.path.join(self.pdf_files, pdf_path)
 
-    def add_file(self, file_path, file_access, file_type):
+    def add_document(self, file_path, hidden, fulltext):
-        """Create a structured dictionary and add to 'files' item."""
+        """Create a structured dictionary to add to 'documents' item."""
         file_dict = {
-            "access": file_access,
+            "hidden": hidden,
+            "fulltext": fulltext,
             "description": self.name.upper(),
             "url": file_path,
-            "type": file_type,
         }
         return file_dict
 
@@ -206,21 +206,25 @@ def parse_node(self, response, node):
         record.add_value('collections', self.get_collections(doctype))
 
         xml_file_path = response.url
-        record.add_value("additional_files",
-                         self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext"))
+        record.add_value(
+            "documents",
+            self.add_document(xml_file_path, hidden=True, fulltext=True),
+        )
         if self.pdf_files:
             pdf_file_path = self.get_pdf_path(volume, issue, fpage)
             if pdf_file_path:
                 if doctype and "erratum" in doctype.lower():
-                    file_type = "Erratum"
+                    fulltext = False
                 else:
-                    file_type = "Fulltext"
+                    fulltext = True
                 if journal_title in self.OPEN_ACCESS_JOURNALS:
-                    file_access = "INSPIRE-PUBLIC"  # FIXME: right?
+                    hidden = False
                 else:
-                    file_access = "INSPIRE-HIDDEN"
-                record.add_value("additional_files",
-                                 self.add_file(pdf_file_path, file_access, file_type))
+                    hidden = True
+                record.add_value(
+                    "documents",
+                    self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),
+                )
 
         parsed_item = ParsedItem(
             record=record.load_item(),

diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
@@ -177,7 +177,7 @@ def build_item(self, response):
         record.add_value('title', response.meta.get("title"))
         record.add_value('urls', response.meta.get("urls"))
         record.add_value("abstract", response.meta.get("abstract"))
-        record.add_value("additional_files", response.meta.get("files"))
+        record.add_value("documents", response.meta.get("files"))
         record.add_value('collections', ['HEP', 'THESIS'])
 
         parsed_item = ParsedItem(

diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
@@ -207,8 +207,10 @@ def build_item(self, response):
 
         pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
         if pdf_files:
-            record.add_value('additional_files', self.add_file(
-                pdf_files, "HIDDEN", "Fulltext"))
+            record.add_value(
+                'documents',
+                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+            )
         record.add_value('authors', self.get_authors(node))
         record.add_xpath('date_published',
                          "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")

diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py
@@ -121,7 +121,10 @@ def parse_node(self, response, node):
             return None
 
         pdf_files = node.xpath(".//a/@href").extract()
-        record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
+        record.add_value(
+            'documents',
+            self.add_file(pdf_files, "HIDDEN", "Fulltext"),
+        )
         record.add_value('authors', self.get_authors(node))
         record.add_value('date_published', year)
         record.add_value('thesis', {'degree_type': thesis_type})