From b97cd4b23b8d88fc24110f2101355e92b382b40e Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 08:30:06 +0200
Subject: [PATCH 01/26] middlewares: indentation fix

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index dab5c7e4..5554c502 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,8 +11,8 @@
 
 from __future__ import absolute_import, division, print_function
 
 
-class ErrorHandlingMiddleware(object):
+class ErrorHandlingMiddleware(object):
     """Log errors."""
 
     @classmethod

From 15ab37df2ba3284964a9b6c54fa84aa8da630fd6 Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:05:21 +0200
Subject: [PATCH 02/26] setup: add `scrapy-crawl-once` plug-in

Addresses #161

Signed-off-by: Spiros Delviniotis
---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 4d1518a8..ca1e0337 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@
     'inspire-schemas~=46.0',
     'inspire-dojson~=46.0',
     'Scrapy>=1.1.0',
+    'scrapy-crawl-once>=0.1.1',
     # TODO: unpin once they support wheel building again
     'scrapyd==1.1.0',
     'scrapyd-client>=1.0.1',

From 7694fe37fa7a9cd3dae2b5905aaeaf179c7c72be Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Thu, 17 Aug 2017 15:20:11 +0200
Subject: [PATCH 03/26] middlewares: add support for crawling only once

* Adds: extends `scrapy-crawl-once` plug-in for supporting custom
  data-fields in DB per spider.
* Adds: enables `scrapy-crawl-once` plug-in.

Closes #161

Signed-off-by: Spiros Delviniotis
---
 hepcrawl/middlewares.py | 124 ++++++++++++++++++++++++++++++++++++++++
 hepcrawl/settings.py    |   5 ++
 2 files changed, 129 insertions(+)

diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py
index 5554c502..86a9998d 100644
--- a/hepcrawl/middlewares.py
+++ b/hepcrawl/middlewares.py
@@ -11,6 +11,17 @@
 
 from __future__ import absolute_import, division, print_function
 
+import os
+import time
+
+from ftplib import FTP
+from six.moves.urllib.parse import urlparse
+
+from scrapy.exceptions import IgnoreRequest
+from scrapy_crawl_once.middlewares import CrawlOnceMiddleware
+
+from hepcrawl.utils import ftp_connection_info
+
 
 class ErrorHandlingMiddleware(object):
     """Log errors."""
 
     @classmethod
@@ -34,3 +45,116 @@ def process_exception(self, request, exception, spider):
             'exception': exception,
             'sender': request,
         })
+
+
+class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware):
+    """
+    This spider and downloader middleware allows to avoid re-crawling pages
+    which were already downloaded in previous crawls.
+
+    To enable it, modify ``settings.py``::
+
+        SPIDER_MIDDLEWARES = {
+            # ...
+            'scrapy_crawl_once.CrawlOnceMiddleware': 100,
+            # ...
+        }
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'scrapy_crawl_once.CrawlOnceMiddleware': 50,
+            # ...
+        }
+
+    By default it does nothing. To avoid crawling a particular page
+    multiple times set ``request.meta['crawl_once'] = True``. Other
+    ``request.meta`` keys:
+
+    * ``crawl_once_value`` - a value to store in DB. By default, timestamp
+      is stored for Http/Https requests and last-modified is stored for FTP/File requests.
+    * ``crawl_once_key`` - unique file name is used.
+
+    Settings:
+
+    * ``CRAWL_ONCE_ENABLED`` - set it to False to disable middleware.
+      Default is True.
+    * ``CRAWL_ONCE_PATH`` - a path to a folder with crawled requests database.
+ By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT`` - default value for ``crawl_once`` meta key + (False by default). When True, all requests are handled by + this middleware unless disabled explicitly using + ``request.meta['crawl_once'] = False``. + """ + + @staticmethod + def _is_newer(this_timestamp, than_this_timestamp, scheme): + if scheme in ['ftp', 'file']: + return this_timestamp > than_this_timestamp + + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) + + if request_db_key not in self.db: + return True + + new_request_timestamp = self._get_timestamp(request, spider) + parsed_url = urlparse(request.url) + if self._is_newer( + new_request_timestamp, + self.db.get(key=request_db_key), + scheme=parsed_url.scheme, + ): + return True + + return False + + def process_request(self, request, spider): + if not request.meta.get('crawl_once', self.default): + return + + request.meta['crawl_once_key'] = os.path.basename(request.url) + request.meta['crawl_once_value'] = self._get_timestamp(request, spider) + + if not self._has_to_be_crawled(request, spider): + self.stats.inc_value('crawl_once/ignored') + raise IgnoreRequest() + + @staticmethod + def _get_timestamp(request, spider): + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + def _get_ftp_timestamp(spider, url): + ftp_host, params = ftp_connection_info(spider.ftp_host, spider.ftp_netrc) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + _get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime + + parsed_url = urlparse(request.url) + full_url = request.url + if parsed_url.scheme == 'ftp': + last_modified = _get_ftp_timestamp(spider, full_url) + elif parsed_url.scheme == 'file': + last_modified = _get_file_timestamp(full_url) + else: + last_modified = time.time() + + return last_modified diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 31d608bf..91f82829 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -62,14 +62,19 @@ # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +CRAWL_ONCE_ENABLED = True +CRAWL_ONCE_DEFAULT = True + # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html EXTENSIONS = { From a283b21ac3616911c86a04d30eafb98b10ffe49b Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 17 Aug 2017 17:10:46 +0200 Subject: [PATCH 04/26] tests: support for `scrapy-crawl-once` Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/functional/arxiv/test_arxiv.py | 4 ++++ tests/functional/wsp/test_wsp.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 0f58b17d..2ef86b87 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py 
@@ -11,6 +11,7 @@ from __future__ import absolute_import, division, print_function +import os from time import sleep import pytest @@ -20,6 +21,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) @@ -51,6 +53,8 @@ def set_up_local_environment(): } } + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + @pytest.mark.parametrize( 'expected_results', diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 42f691c9..ffa4c4c9 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -56,6 +56,7 @@ def set_up_ftp_environment(): } clean_dir(path='/tmp/WSP/') + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @pytest.fixture(scope="function") @@ -81,6 +82,7 @@ def set_up_local_environment(): def remove_generated_files(package_location): clean_dir(path='/tmp/WSP/') + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) for dir_name in dirs: From 1301d255530a1d1ec3beb6c3f81274a5e4ebd265 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 17 Aug 2017 17:33:04 +0200 Subject: [PATCH 05/26] tests: add tests for `scrapy-clawl-once` * Adds: tests about `scrapy-clawl-once` for FTP and FILE. Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/functional/arxiv/test_arxiv.py | 49 ++++++++++++++ tests/functional/wsp/test_wsp.py | 98 +++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 2ef86b87..ce01eecb 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -88,3 +88,52 @@ def test_arxiv(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'arxiv', + 'fixtures', + 'arxiv_smoke_record.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_arxiv_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=1, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index ffa4c4c9..d034e7aa 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -44,7 +44,7 @@ def set_up_ftp_environment(): ) # The test must wait until the docker environment is up (takes about 10 seconds). 
- sleep(10) + sleep(7) yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', @@ -126,6 +126,54 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): assert gotten_results == expected_results +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] + + @pytest.mark.parametrize( 'expected_results', [ @@ -158,3 +206,51 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] From 98a7796a8d0985cab16c45712e7b68c986e24f77 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:15:14 +0200 Subject: [PATCH 06/26] tests: delete unused function for `arxiv` tests Signed-off-by: Spiros Delviniotis --- tests/unit/test_arxiv_all.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 1f4155c9..45689def 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -44,6 +44,7 @@ def _get_processed_record(item, spider): ) ) + assert records pipeline = InspireCeleryPushPipeline() 
pipeline.open_spider(spider) From a7588190af45090a2e9dd5c3bee06e4bf0c19b21 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:32:51 +0200 Subject: [PATCH 07/26] wsp: add temporary folder to the crawlings * Adds: create temporary folder to unzip the crawled files and in `InspireCeleryPushPipeline.close_spider` methods deletes this temporary folder. Addresses #161 Signed-off-by: Spiros Delviniotis --- hepcrawl/pipelines.py | 5 +++++ hepcrawl/spiders/wsp_spider.py | 13 +++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 8cd31c0e..4ff2b3c9 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function import os +import shutil import requests @@ -175,6 +176,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 058e6cc0..bc569bb3 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -78,6 +78,7 @@ def __init__( ftp_folder="/WSP", ftp_host=None, ftp_netrc=None, + tmp_dir=None, *args, **kwargs ): @@ -86,10 +87,8 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" + self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) def start_requests(self): """List selected folder on remote FTP and yield new zip files.""" @@ -116,9 +115,10 @@ def start_requests(self): # Cast to byte-string for scrapy compatibility remote_file = str(remote_file) ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, + self.tmp_dir, os.path.basename(remote_file) ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) yield Request( str(remote_url), @@ -132,6 +132,7 @@ def handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -142,8 +143,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), From 28a2603cc0a32c4e9aa5a077640fe01f18a8fd68 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:38:29 +0200 Subject: [PATCH 08/26] testlib: refactored `clean_dir` default arguments * Adds: new default argument to `clean_dir` method is the generated DB folder from `scrapy-crawl-once` plugin. 
Addresses #161 Signed-off-by: Spiros Delviniotis --- hepcrawl/testlib/fixtures.py | 6 +++--- tests/functional/arxiv/test_arxiv.py | 3 +-- tests/functional/wsp/test_wsp.py | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 73f28f96..c227d109 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -133,13 +133,13 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data - -def clean_dir(path): +def clean_dir(path=os.path.join(os.getcwd(), '.scrapy')): """ Deletes all contained files of given target directory path. Args: - path: Absolute path of target directory to be cleaned. + path(str): path of directory to be deleted. Default path is the produced DB per spider that + stores the requested urls. Example: diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index ce01eecb..22025020 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -11,7 +11,6 @@ from __future__ import absolute_import, division, print_function -import os from time import sleep import pytest @@ -53,7 +52,7 @@ def set_up_local_environment(): } } - clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + clean_dir() @pytest.mark.parametrize( diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index d034e7aa..27f36955 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -44,7 +44,7 @@ def set_up_ftp_environment(): ) # The test must wait until the docker environment is up (takes about 10 seconds). - sleep(7) + sleep(10) yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', @@ -55,7 +55,7 @@ def set_up_ftp_environment(): } } - clean_dir(path='/tmp/WSP/') + clean_dir() clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @@ -81,7 +81,7 @@ def set_up_local_environment(): def remove_generated_files(package_location): - clean_dir(path='/tmp/WSP/') + clean_dir() clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) From 026bdfb985d1d5e29cf54612eefabc5a05d8ed00 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:41:16 +0200 Subject: [PATCH 09/26] tests: add `clean_dir` to tests that using the DB Addresses #161 Signed-off-by: Spiros Delviniotis --- tests/unit/test_alpha.py | 6 ++++-- tests/unit/test_aps.py | 5 ++++- tests/unit/test_arxiv_all.py | 8 ++++++-- tests/unit/test_arxiv_single.py | 8 ++++++-- tests/unit/test_pipelines.py | 9 +++++++-- tests/unit/test_pos.py | 9 +++++++-- tests/unit/test_world_scientific.py | 9 +++++++-- 7 files changed, 41 insertions(+), 13 deletions(-) diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index 96bf9af1..d5ff6331 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -13,8 +13,10 @@ from hepcrawl.spiders import alpha_spider -from hepcrawl.testlib.fixtures import fake_response_from_file - +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture def results(): diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index 3bb3698c..e8e64962 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -12,7 +12,10 @@ import pytest from hepcrawl.spiders import aps_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture diff --git a/tests/unit/test_arxiv_all.py 
b/tests/unit/test_arxiv_all.py index 45689def..63ce0822 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -16,7 +16,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -48,8 +51,9 @@ def _get_processed_record(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_page_nr(many_results): """Test extracting page_nr""" diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index 329a2a49..709c6d9a 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -17,7 +17,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,8 +47,9 @@ def _get_processed_item(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_abstracts(results): diff --git a/tests/unit/test_pipelines.py b/tests/unit/test_pipelines.py index 050df092..08b81319 100644 --- a/tests/unit/test_pipelines.py +++ b/tests/unit/test_pipelines.py @@ -21,7 +21,10 @@ from hepcrawl.spiders import arxiv_spider from hepcrawl.pipelines import InspireAPIPushPipeline -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,7 +47,9 @@ def json_spider_record(tmpdir): ) parsed_record = items.next() assert parsed_record - return spider, parsed_record + yield spider, parsed_record + + clean_dir() @pytest.fixture diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..0eccd4fc 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -19,7 +19,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import pos_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -55,7 +58,9 @@ def record(scrape_pos_page_body): parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record - return parsed_record + yield parsed_record + + clean_dir() def test_titles(record): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 291d00d0..38adc79b 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -18,7 +18,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import wsp_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) def create_spider(): @@ -44,7 +47,9 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return (pipeline.process_item(record, spider) for 
record in records) + yield (pipeline.process_item(record, spider) for record in records) + + clean_dir() def get_one_record(response_file_name): From e7ff520eb94e249f122e3c118393d6abd7f71c0d Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:44:14 +0200 Subject: [PATCH 10/26] wsp: minor fix * Adds: makes staticmethod `_get_collections` class method. * Adds: minor indentation fix. * Removes: unused import. Signed-off-by: Spiros Delviniotis --- hepcrawl/spiders/wsp_spider.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index bc569bb3..f638bd03 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -13,6 +13,7 @@ import os import urlparse +import tempfile from scrapy import Request from scrapy.spiders import XMLFeedSpider @@ -75,7 +76,7 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, package_path=None, - ftp_folder="/WSP", + ftp_folder="WSP", ftp_host=None, ftp_netrc=None, tmp_dir=None, @@ -163,7 +164,10 @@ def parse_node(self, response, node): record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', 'addendum']: - record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href") + record.add_xpath( + 'related_article_doi', + "//related-article[@ext-link-type='doi']/@href", + ) record.add_value('journal_doctype', article_type) dois = node.xpath("//article-id[@pub-id-type='doi']/text()").extract() @@ -221,10 +225,16 @@ def parse_node(self, response, node): return parsed_item - def _get_collections(self, node, article_type, current_journal_title): + @staticmethod + def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() - if conference or current_journal_title == "International Journal of Modern Physics: Conference Series": + if ( + conference + or current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) + ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": return ['HEP', 'Review'] From b509c82b7e8c87003a294170caa229eefad5b919 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 15:09:04 +0200 Subject: [PATCH 11/26] pipelines: minor fix * Adds: make `InspireAPIPushPipeline._cleanup` staticmethod. Signed-off-by: Spiros Delviniotis --- hepcrawl/pipelines.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 4ff2b3c9..7a51e4cb 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -125,7 +125,8 @@ def _prepare_payload(self, spider): ] return payload - def _cleanup(self, spider): + @staticmethod + def _cleanup(spider): """Run cleanup.""" # Cleanup errors if 'errors' in spider.state: From a82a158ffa8dbfd4be7e43d9e981bd5890e78596 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 16:01:24 +0200 Subject: [PATCH 12/26] global: add `CRAWL_ONCE_PATH` to settings * Adds: variable in `settings.py` to specify where to store `scrapy-crawl-once` DB. 
Addresses #161 Signed-off-by: Spiros Delviniotis --- docker-compose.test.yml | 2 ++ hepcrawl/settings.py | 4 ++++ hepcrawl/testlib/fixtures.py | 4 +++- tests/Dockerfile.hepcrawl_base | 4 ++-- tests/docker_entrypoint.sh | 3 +++ tests/fix_rights | Bin 8992 -> 9032 bytes tests/fix_rights.c | 8 ++++++-- 7 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d14d7c87..d9b69c75 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_CRAWL_ONCE_PATH=/var/lib/scrapy/crawl_once/ - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -28,6 +29,7 @@ services: - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP - /tmp/file_urls:/tmp/file_urls + - ${PWD}/.scrapy/crawl_once:/var/lib/scrapy/crawl_once functional_wsp: <<: *service_base diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 91f82829..216f7694 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -74,6 +74,10 @@ CRAWL_ONCE_ENABLED = True CRAWL_ONCE_DEFAULT = True +CRAWL_ONCE_PATH = os.environ.get( + 'APP_CRAWL_ONCE_PATH', + '/var/lib/scrapy/crawl_once/', +) # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index c227d109..b78c1a31 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -15,6 +15,7 @@ from scrapy.http import Request, TextResponse from scrapy.selector import Selector +from hepcrawl.settings import CRAWL_ONCE_PATH def fake_response_from_file(file_name, test_suite='unit', url='http://www.example.com', response_type=TextResponse): @@ -133,7 +134,8 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data -def clean_dir(path=os.path.join(os.getcwd(), '.scrapy')): + +def clean_dir(path=CRAWL_ONCE_PATH): """ Deletes all contained files of given target directory path. diff --git a/tests/Dockerfile.hepcrawl_base b/tests/Dockerfile.hepcrawl_base index eb91b69f..8db9f43e 100644 --- a/tests/Dockerfile.hepcrawl_base +++ b/tests/Dockerfile.hepcrawl_base @@ -26,10 +26,10 @@ RUN yum install -y epel-release && \ python-virtualenv && \ yum clean all -RUN mkdir /code /hepcrawl_venv +RUN mkdir /code /hepcrawl_venv /var/lib/scrapy RUN useradd test -RUN chown -R test:test /code /hepcrawl_venv +RUN chown -R test:test /code /hepcrawl_venv /var/lib/scrapy ADD ./docker_entrypoint.sh /docker_entrypoint.sh ADD ./fix_rights /fix_rights diff --git a/tests/docker_entrypoint.sh b/tests/docker_entrypoint.sh index 1762a421..4785699d 100755 --- a/tests/docker_entrypoint.sh +++ b/tests/docker_entrypoint.sh @@ -23,6 +23,8 @@ restore_venv_tmp_code_rights() { /fix_rights --codedir "$BASE_USER_UID:$BASE_USER_GID" echo "Restoring permissions of tmpdir to $BASE_USER_UID:$BASE_USER_GID" /fix_rights --tmpdir "$BASE_USER_UID:$BASE_USER_GID" + echo "Restoring permissions of vardir to $BASE_USER_UID:$BASE_USER_GID" + /fix_rights --vardir "$BASE_USER_UID:$BASE_USER_GID" else echo "No BASE_USER_UID env var defined, skipping venv, codedir, tmpdir permission" \ "restore." 
@@ -57,6 +59,7 @@ main() { /fix_rights --virtualenv 'test:test' /fix_rights --codedir 'test:test' /fix_rights --tmpdir 'test:test' + /fix_rights --vardir 'test:test' trap restore_venv_tmp_code_rights EXIT if ! [[ -f "$VENV_PATH/bin/activate" ]]; then diff --git a/tests/fix_rights b/tests/fix_rights index 98677b2c948eebb928e2439ed495a81515b06b8a..ecf219b08f9c9e8e4e7d5887f56060738ae56f33 100755 GIT binary patch delta 1261 zcmZ9Me`r%z6vyv<&zHQH)c6vczNBU3$LOy>l2(LJ=WL#~Q!@94DlXH&%IWz*`;lu@(3SyyU;niu*LQfL=vZiTKS%X;W$f=X379Hz znzWhH7AvHtNe<-@I$m7j55=+I=4+BagEN3qPWt+n@*@%y~8u+A1n$=iI|HOjv>H$vneKERCxgF^}85_U;+p?3ACXn&&1+t)wX)7R^*90*Y= zrc+e(?oADJtNZrsJ+Ahp`uAB7!WHZ*bx;cjPwT3tjexxcwiWCTP{HS5M>aI=2H5)> z8x7`O5(MmUct7i61Zrwwu^*aEX~mP2lQ+XLg+gEG^s;w2H}& zS$48Sv_;<0raoQZKve^ z9fQ4GdF&<&kg(&q=HzD#OZ1Jcg8QQHfozGw&8-ea&0!IiCA2gC{2~Vpr@W6F|H#Ez6iXHC%#6Q8~lxUgq-v@3bUe)K?Mu` Oh>(|xyRQg-+4BrIo@@93 delta 1112 zcmZ8gU1$?o6ux(2l9{O{lbCTfhTZ%?t&KINFB@DHlQk{3(n7`7zC>(AO6!9b|0=R9 zE3IOvttrO`L0D1^>vkWy!4bC0vNfU@eX;(6=!=R%{6YLf6szO8lejKDaL>8teBV7k zcjjJd%r<5g)6UV_Q}UopOc|6%??h-PNc4y+C6!|-DQ<33Fi!7!#%qE#$%7~Uec$uf z@w%-&`|74QC-q}O*wT{~MPa&4mrG(+x1Q9w@v9=&9#r% zz%!AZ`)Dcs2_xcOm@6%WtW*mfQhof#t}TJ>eY<+MZwplKjPRSIKFf+IS}U%b9}MFs zv~4qn(TsL27R~YaC&RdaHfexcUL=qi_(6^ctsh{kyeRZ5k`M6#5)S2{ux6gkGC=3r zMUuqO7F^;dI4w6?qnJa`X_eCgO4VUd9C4b{*E#HhdlIjM%cY8)7WEy<1f@9$x;2_d zty)|p1?$kupjP#t-4Ceniwv7RBQymsJKZtz zdNA`#Rr5|@Tp?bU+H{KrK}U8Aiyl6H- z#tHm^q_400zCR{s@K5KGk@QgDBBhJraWICDEf`vjx(oGs7z?eIepgHy5#eztCg-)X LcT#{SzVAK*PI@vR diff --git a/tests/fix_rights.c b/tests/fix_rights.c index 8eaa9911..2dead30c 100644 --- a/tests/fix_rights.c +++ b/tests/fix_rights.c @@ -11,6 +11,7 @@ char *VENV_PATH = "/hepcrawl_venv/"; char *CODE_PATH = "/code/"; char *TMP_PATH = "/tmp/"; +char *VAR_PATH = "/var/"; int main (int argc, char *argv[]) { @@ -30,7 +31,7 @@ int main (int argc, char *argv[]) { if (argc != 3) { fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir|--var :\n", argv[0] ); exit(EXIT_FAILURE); @@ -48,11 +49,14 @@ int main (int argc, char *argv[]) { } else if (strcmp(argv[1], "--tmpdir") == 0) { // tmp dir permissions chown_argv[3] = TMP_PATH; + } else if (strcmp(argv[1], "--vardir") == 0) { + // var dir permissions + chown_argv[3] = VAR_PATH; } else { fprintf(stderr, "Bad option %s.", argv[1]); fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir|--vardir :\n", argv[0] ); exit(EXIT_FAILURE); From 5a358e179f387a94cd34d8eb26fea311fd487d66 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 12:55:07 +0200 Subject: [PATCH 13/26] middlewares: refactor crawl once middleware Signed-off-by: David Caro --- hepcrawl/middlewares.py | 125 +++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 66 deletions(-) diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 86a9998d..1044ba0d 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -24,8 +24,6 @@ class ErrorHandlingMiddleware(object): - """Log errors.""" - @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) @@ -35,13 +33,11 @@ def __init__(self, settings): def process_spider_exception(self, response, exception, spider): """Register the error in the spider and continue.""" - self.process_exception(response, exception, spider) + return self.process_exception(response, exception, spider) def process_exception(self, request, exception, spider): """Register the error in the spider and continue.""" - if 
'errors' not in spider.state: - spider.state['errors'] = [] - spider.state['errors'].append({ + spider.state.setdefault('errors', []).append({ 'exception': exception, 'sender': request, }) @@ -68,47 +64,28 @@ class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): By default it does nothing. To avoid crawling a particular page multiple times set ``request.meta['crawl_once'] = True``. Other - ``request.meta`` keys: + ``request.meta`` keys that modify it's behavior: * ``crawl_once_value`` - a value to store in DB. By default, timestamp - is stored for Http/Https requests and last-modified is stored for FTP/File requests. + is stored for Http/Https requests and last-modified is stored for + FTP/File requests. * ``crawl_once_key`` - unique file name is used. Settings: - * ``CRAWL_ONCE_ENABLED`` - set it to False to disable middleware. - Default is True. - * ``CRAWL_ONCE_PATH`` - a path to a folder with crawled requests database. - By default ``.scrapy/crawl_once/`` path is used; this folder contains - ``.sqlite`` files with databases of seen requests. - * ``CRAWL_ONCE_DEFAULT`` - default value for ``crawl_once`` meta key - (False by default). When True, all requests are handled by - this middleware unless disabled explicitly using - ``request.meta['crawl_once'] = False``. - """ - - @staticmethod - def _is_newer(this_timestamp, than_this_timestamp, scheme): - if scheme in ['ftp', 'file']: - return this_timestamp > than_this_timestamp - - def _has_to_be_crawled(self, request, spider): - request_db_key = self._get_key(request) - - if request_db_key not in self.db: - return True - - new_request_timestamp = self._get_timestamp(request, spider) - parsed_url = urlparse(request.url) - if self._is_newer( - new_request_timestamp, - self.db.get(key=request_db_key), - scheme=parsed_url.scheme, - ): - return True + * ``CRAWL_ONCE_ENABLED``:set it to False to disable middleware. Default + is True. + * ``CRAWL_ONCE_PATH``: a path to a folder with crawled requests database. + By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT``: default value for ``crawl_once`` meta key (False + by default). When True, all requests are handled by this middleware + unless disabled explicitly using + ``request.meta['crawl_once'] = False``. 
- return False + For more info see: https://github.com/TeamHG-Memex/scrapy-crawl-once + """ def process_request(self, request, spider): if not request.meta.get('crawl_once', self.default): return @@ -120,41 +97,57 @@ def process_request(self, request, spider): self.stats.inc_value('crawl_once/ignored') raise IgnoreRequest() - @staticmethod - def _get_timestamp(request, spider): - def _get_ftp_relative_path(url, host): - return url.replace( - 'ftp://{0}/'.format(host), - '', - ) + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) - def _get_ftp_timestamp(spider, url): - ftp_host, params = ftp_connection_info(spider.ftp_host, spider.ftp_netrc) - ftp = FTP( - host=ftp_host, - user=params['ftp_user'], - passwd=params['ftp_password'], - ) - return ftp.sendcmd( - 'MDTM {}'.format( - _get_ftp_relative_path( - url=url, - host=ftp_host - ) - ) - ) + if request_db_key not in self.db: + return True - def _get_file_timestamp(url): - file_path = url.replace('file://', '') - return os.stat(file_path).st_mtime + new_file_timestamp = self._get_timestamp(request, spider) + old_file_timestamp = self.db.get(key=request_db_key) + return new_file_timestamp > old_file_timestamp + @classmethod + def _get_timestamp(cls, request, spider): parsed_url = urlparse(request.url) full_url = request.url if parsed_url.scheme == 'ftp': - last_modified = _get_ftp_timestamp(spider, full_url) + last_modified = cls._get_ftp_timestamp(spider, full_url) elif parsed_url.scheme == 'file': - last_modified = _get_file_timestamp(full_url) + last_modified = cls._get_file_timestamp(full_url) else: last_modified = time.time() return last_modified + + @classmethod + def _get_ftp_timestamp(cls, spider, url): + ftp_host, params = ftp_connection_info( + spider.ftp_host, + spider.ftp_netrc, + ) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + cls._get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + @staticmethod + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + @staticmethod + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime From 16ae688d1c63a654813e6227ac0f2e3650e24984 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 12:59:32 +0200 Subject: [PATCH 14/26] pep8: wsp_spider Signed-off-by: David Caro --- hepcrawl/spiders/wsp_spider.py | 48 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index f638bd03..3498ca31 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -43,22 +43,27 @@ class WorldScientificSpider(Jats, XMLFeedSpider): on the remote server and downloads them to a designated local folder, using ``WorldScientificSpider.start_requests()``. 2. Then the ZIP file is unpacked and it lists all the XML files found - inside, via ``WorldScientificSpider.handle_package()``. Note the callback from - ``WorldScientificSpider.start_requests()``. - 3. Finally, now each XML file is parsed via ``WorldScientificSpider.parse_node()``. + inside, via ``WorldScientificSpider.handle_package()``. Note the + callback from ``WorldScientificSpider.start_requests()``. + 3. Finally, now each XML file is parsed via + ``WorldScientificSpider.parse_node()``. 
Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: - $ scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + $ scrapy crawl \\ + WSP \\ + -a 'ftp_host=ftp.example.com' \\ + -a 'ftp_netrc=/path/to/netrc' """ name = 'WSP' custom_settings = {} start_urls = [] - iterator = 'iternodes' # This is actually unnecessary, since it's the default value + # This is actually unnecessary, since it's the default value + iterator = 'iternodes' itertag = 'article' allowed_article_types = [ @@ -88,7 +93,10 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + self.tmp_dir = ( + tmp_dir or + tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + ) self.package_path = package_path def start_requests(self): @@ -100,9 +108,15 @@ def start_requests(self): ) for file_path in new_files_paths: - yield Request("file://{0}".format(file_path), callback=self.handle_package_file) + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, + ) else: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) new_files_paths = ftp_list_files( self.ftp_folder, @@ -157,7 +171,10 @@ def parse_node(self, response, node): node.remove_namespaces() article_type = node.xpath('@article-type').extract() self.log("Got article_type {0}".format(article_type)) - if article_type is None or article_type[0] not in self.allowed_article_types: + if ( + article_type is None or + article_type[0] not in self.allowed_article_types + ): # Filter out non-interesting article types return @@ -216,7 +233,10 @@ def parse_node(self, response, node): ) record.add_value('license', license) - record.add_value('collections', self._get_collections(node, article_type, journal_title)) + record.add_value( + 'collections', + self._get_collections(node, article_type, journal_title), + ) parsed_item = ParsedItem( record=dict(record.load_item()), @@ -230,10 +250,10 @@ def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() if ( - conference - or current_journal_title == ( - "International Journal of Modern Physics: Conference Series" - ) + conference or + current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": From 1ac8785fad8f7a5ac326709f9a7eda557090188f Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 13:03:36 +0200 Subject: [PATCH 15/26] setup: pin scrapy-crawl-once to the major version Signed-off-by: David Caro --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ca1e0337..16ce46f0 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ 'inspire-schemas~=46.0', 'inspire-dojson~=46.0', 'Scrapy>=1.1.0', - 'scrapy-crawl-once>=0.1.1', + 'scrapy-crawl-once~=0.1,>=0.1.1', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', 'scrapyd-client>=1.0.1', From 6dffdc8c7989f49364f4ad845e03fff7c1b13b62 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 4 Sep 2017 13:08:14 +0200 Subject: [PATCH 16/26] tests: remove unneeded var dir chown Signed-off-by: David Caro --- docker-compose.test.yml | 3 +-- tests/docker_entrypoint.sh | 3 --- 
tests/fix_rights.c | 8 ++------ 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d9b69c75..64eff7df 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,7 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls - - APP_CRAWL_ONCE_PATH=/var/lib/scrapy/crawl_once/ + - APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -29,7 +29,6 @@ services: - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP - /tmp/file_urls:/tmp/file_urls - - ${PWD}/.scrapy/crawl_once:/var/lib/scrapy/crawl_once functional_wsp: <<: *service_base diff --git a/tests/docker_entrypoint.sh b/tests/docker_entrypoint.sh index 4785699d..1762a421 100755 --- a/tests/docker_entrypoint.sh +++ b/tests/docker_entrypoint.sh @@ -23,8 +23,6 @@ restore_venv_tmp_code_rights() { /fix_rights --codedir "$BASE_USER_UID:$BASE_USER_GID" echo "Restoring permissions of tmpdir to $BASE_USER_UID:$BASE_USER_GID" /fix_rights --tmpdir "$BASE_USER_UID:$BASE_USER_GID" - echo "Restoring permissions of vardir to $BASE_USER_UID:$BASE_USER_GID" - /fix_rights --vardir "$BASE_USER_UID:$BASE_USER_GID" else echo "No BASE_USER_UID env var defined, skipping venv, codedir, tmpdir permission" \ "restore." @@ -59,7 +57,6 @@ main() { /fix_rights --virtualenv 'test:test' /fix_rights --codedir 'test:test' /fix_rights --tmpdir 'test:test' - /fix_rights --vardir 'test:test' trap restore_venv_tmp_code_rights EXIT if ! [[ -f "$VENV_PATH/bin/activate" ]]; then diff --git a/tests/fix_rights.c b/tests/fix_rights.c index 2dead30c..8eaa9911 100644 --- a/tests/fix_rights.c +++ b/tests/fix_rights.c @@ -11,7 +11,6 @@ char *VENV_PATH = "/hepcrawl_venv/"; char *CODE_PATH = "/code/"; char *TMP_PATH = "/tmp/"; -char *VAR_PATH = "/var/"; int main (int argc, char *argv[]) { @@ -31,7 +30,7 @@ int main (int argc, char *argv[]) { if (argc != 3) { fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir|--var :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir :\n", argv[0] ); exit(EXIT_FAILURE); @@ -49,14 +48,11 @@ int main (int argc, char *argv[]) { } else if (strcmp(argv[1], "--tmpdir") == 0) { // tmp dir permissions chown_argv[3] = TMP_PATH; - } else if (strcmp(argv[1], "--vardir") == 0) { - // var dir permissions - chown_argv[3] = VAR_PATH; } else { fprintf(stderr, "Bad option %s.", argv[1]); fprintf( stderr, - "Usage: %s --virtualenv|--codedir|--tmpdir|--vardir :\n", + "Usage: %s --virtualenv|--codedir|--tmpdir :\n", argv[0] ); exit(EXIT_FAILURE); From 6ffa2df68033c387b2f2d454357917e896c05de3 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:39:05 +0200 Subject: [PATCH 17/26] unit: small pep8 refactor Signed-off-by: David Caro --- tests/unit/test_arxiv_all.py | 15 ++++-- tests/unit/test_world_scientific.py | 84 +++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 63ce0822..21a5fd99 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest @@ -47,14 +52,18 @@ def _get_processed_record(item, spider): ) ) - assert records + assert parsed_items pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - yield [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [ + _get_processed_record(parsed_item, spider) + for parsed_item in parsed_items + ] clean_dir() + def test_page_nr(many_results): """Test extracting page_nr""" page_nrs = [ diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 38adc79b..f14144fd 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest import os @@ -24,6 +29,12 @@ ) +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + yield + clean_dir() + + def create_spider(): crawler = Crawler(spidercls=wsp_spider.WorldScientificSpider) return wsp_spider.WorldScientificSpider.from_crawler(crawler) @@ -47,9 +58,10 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - yield (pipeline.process_item(record, spider) for record in records) - - clean_dir() + return ( + pipeline.process_item(record, spider) + for record in records + ) def get_one_record(response_file_name): @@ -72,17 +84,25 @@ def override_generated_fields(record): [ get_one_record('world_scientific/sample_ws_record.xml'), ( - "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have recently been used as light absorbers in hybrid" - " organic-inorganic solid-state solar cells, with efficiencies above 15%. To date, it is essential to" - " add Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the hole transport materials (HTM) to get" - " a higher conductivity. However, the detrimental effect of high LiTFSI concentration on the charge transport" - ", DOS in the conduction band of the TiO$_{2}$ substrate and device stability results in an overall " - "compromise for a satisfactory device. Using a higher mobility hole conductor to avoid lithium salt " - "is an interesting alternative. Herein, we successfully made an efficient perovskite solar cell by " - "applying a hole conductor PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in the absence of" - " LiTFSI. Under AM 1.5 illumination of 100 mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " - "comparable to the efficiency of 12.3% with the addition of 1.3 mM LiTFSI. An unsealed device without " - "Li$^{+}$ shows interestingly a promising stability." + "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have " + "recently been used as light absorbers in hybrid" + " organic-inorganic solid-state solar cells, with " + "efficiencies above 15%. To date, it is essential to add " + "Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the " + "hole transport materials (HTM) to get a higher conductivity. " + "However, the detrimental effect of high LiTFSI concentration " + "on the charge transport, DOS in the conduction band of the " + "TiO$_{2}$ substrate and device stability results in an " + "overall compromise for a satisfactory device. 
Using a higher " + "mobility hole conductor to avoid lithium salt is an " + "interesting alternative. Herein, we successfully made an " + "efficient perovskite solar cell by applying a hole conductor " + "PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in " + "the absence of LiTFSI. Under AM 1.5 illumination of 100 " + "mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " + "comparable to the efficiency of 12.3% with the addition of " + "1.3 mM LiTFSI. An unsealed device without Li$^{+}$ shows " + "interestingly a promising stability." ), ], ], @@ -103,7 +123,10 @@ def test_abstract(generated_record, expected_abstract): get_one_record('world_scientific/sample_ws_record.xml'), [{ 'source': 'WSP', - 'title': 'High-efficient Solid-state Perovskite Solar Cell Without Lithium Salt in the Hole Transport Material', + 'title': ( + 'High-efficient Solid-state Perovskite Solar Cell Without ' + 'Lithium Salt in the Hole Transport Material' + ), }], ], ], @@ -296,12 +319,18 @@ def test_publication_info(generated_record, expected_publication_info): [ get_one_record('world_scientific/sample_ws_record.xml'), { - 'authors': ["BI, DONGQIN", "BOSCHLOO, GERRIT", "HAGFELDT, ANDERS"], + 'authors': [ + "BI, DONGQIN", + "BOSCHLOO, GERRIT", + "HAGFELDT, ANDERS", + ], 'affiliation': ( - 'Department of Chemistry-Angstrom Laboratory, Uppsala University, Box 532, SE 751 20 Uppsala, Sweden' + 'Department of Chemistry-Angstrom Laboratory, Uppsala ' + 'University, Box 532, SE 751 20 Uppsala, Sweden' ), 'xref_affiliation': ( - 'Physics Department, Brookhaven National Laboratory, Upton, NY 11973, USA' + 'Physics Department, Brookhaven National Laboratory, ' + 'Upton, NY 11973, USA' ), }, ], @@ -319,11 +348,14 @@ def test_authors(generated_record, expected_authors): for index, name in enumerate(expected_authors['authors']): assert generated_record['authors'][index]['full_name'] == name assert expected_authors['affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff in generated_record['authors'][index]['affiliations'] ] if index == 1: assert expected_authors['xref_affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff + in generated_record['authors'][index]['affiliations'] ] @@ -418,7 +450,10 @@ def test_pipeline_record(generated_record): 'abstracts': [ { 'source': 'WSP', - 'value': u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa.', + 'value': ( + u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, ' + 'bla blaaa blaa. Bla blaaa blaa.' + ), }, ], 'acquisition_source': { @@ -431,7 +466,10 @@ def test_pipeline_record(generated_record): { 'affiliations': [ { - 'value': u'Department, University, City, City_code 123456, C. R. Country_2', + 'value': ( + u'Department, University, City, City_code 123456, ' + 'C. R. 
Country_2' + ), }, ], 'full_name': u'author_surname_2, author_name_1', From ddda124aa67ad1898f4d861523873cab656fc17f Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:39:27 +0200 Subject: [PATCH 18/26] gitignore: add some test product files Signed-off-by: David Caro --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 01895086..3b606de7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ nosetests.xml coverage.xml twistd.pid .coverage.* +tests/unit/responses/edp/test_gz +tests/unit/responses/edp/test_rich # Translations *.mo @@ -57,6 +59,8 @@ jobs dbs items logs +.scrapy +scrapy_feed_uri # Local settings local_settings.py From 610fce43caf6bacc8d05a42a59f897dd474b761d Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:40:01 +0200 Subject: [PATCH 19/26] docker-compose: swap sleep with healthcheck That allows to properly wait for the services to be up, instead of adding a dummy sleep. Signed-off-by: David Caro --- .travis.yml | 2 +- docker-compose.test.yml | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 91407e6e..53f7dee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: install: - travis_retry docker-compose -f docker-compose.deps.yml run --rm pip - - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd_deploy + - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd-deploy script: - travis_retry docker-compose -f docker-compose.test.yml run --rm ${SUITE} diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 64eff7df..074f50ce 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -66,10 +66,17 @@ services: command: bash -c "rm -f twistd.pid && exec scrapyd" links: - celery + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:6800/listprojects.json" - scrapyd_deploy: + scrapyd-deploy: <<: *service_base - command: bash -c "sleep 8 && scrapyd-deploy" # make sure that the scrapyd is up + command: bash -c "scrapyd-deploy" links: - scrapyd From ff1eb860ebfc058eff60eff724c81f9ef30bb623 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:41:05 +0200 Subject: [PATCH 20/26] utils: nicer ParsedItem string reperesentation Signed-off-by: David Caro --- hepcrawl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 256dd508..caff462d 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -10,6 +10,7 @@ from __future__ import absolute_import, division, print_function import os +import pprint import re from operator import itemgetter from itertools import groupby @@ -467,3 +468,6 @@ def __getattr__(self, key): def __setattr__(self, key, value): self[key] = value + + def __str__(self): + return pprint.pformat(self) From f111993c09a08e30f8d0355a44e74058244bcfd2 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:42:16 +0200 Subject: [PATCH 21/26] celery_monitor: small refactor and default event fix Signed-off-by: David Caro --- hepcrawl/testlib/celery_monitor.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 1347ab22..c8d94949 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -24,7 +24,13 @@ class CeleryMonitor(object): - def __init__(self, app, 
monitor_timeout=3, monitor_iter_limit=100, events_limit=2): + def __init__( + self, + app, + monitor_timeout=3, + monitor_iter_limit=100, + events_limit=2, + ): self.results = [] self.recv = None self.app = app @@ -39,7 +45,13 @@ def __enter__(self): def announce_succeeded_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK SUCCEEDED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK SUCCEEDED: %s[%s] %s' % ( + task.name, + task.uuid, + task.info(), + ) + ) tasks = self.app.AsyncResult(task.id) for task in tasks.result: self.results.append(task) @@ -48,7 +60,9 @@ def announce_succeeded_tasks(event): def announce_failed_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),) + ) self.results.append(task.info()) self.recv.should_stop = True @@ -62,7 +76,11 @@ def announce_failed_tasks(event): return self def __exit__(self, exc_type, exc_val, exc_tb): - events_iter = self.recv.itercapture(limit=None, timeout=self.monitor_timeout, wakeup=True) + events_iter = self.recv.itercapture( + limit=None, + timeout=self.monitor_timeout, + wakeup=True, + ) self._wait_for_results(events_iter) self.connection.__exit__() @@ -84,8 +102,8 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, - events_limit, crawler_instance, + events_limit=2, project='hepcrawl', spider='WSP', settings=None, From bcf8f76f0b7d8eafcff8951b4490ca416c9993a9 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:42:55 +0200 Subject: [PATCH 22/26] tohep: add some useful debug logs Signed-off-by: David Caro --- hepcrawl/tohep.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 8fcbc735..88e5e8d2 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -25,10 +25,14 @@ import os import datetime +import logging from inspire_schemas.api import LiteratureBuilder +LOGGER = logging.getLogger(__name__) + + class UnknownItemFormat(Exception): pass @@ -195,10 +199,12 @@ def hep_to_hep(hep_record, record_files): hepcrawl one (normally, marc-ingesting spiders). """ if record_files: + LOGGER.debug('Updating fft fields from: %s', hep_record['_fft']) hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], record_files=record_files, ) + LOGGER.debug('Updated fft fields to: %s', hep_record['_fft']) return hep_record From 5aacb1ce3982ec9073ed21f99b3c8546f438ff1e Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:43:57 +0200 Subject: [PATCH 23/26] middlewares: use better key for crawl-once That way uses the schema (ftp/http/local...) and the file name as key, instead of just the file name, to avoid downloading a file from ftp and not crawling it after. 
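For readers skimming the diff below, a minimal sketch of the key scheme this message describes: the URL's scheme picks a 'local'/'remote' prefix, which is combined with the file's base name. The example URLs are made up; the authoritative logic is the `_get_key` method added in the diff that follows.

    import os
    from six.moves.urllib.parse import urlparse

    def crawl_once_key(url):
        # Mirror of the scheme described above: scheme prefix + file base name.
        parsed = urlparse(url)
        prefix = 'local' if parsed.scheme == 'file' else 'remote'
        return prefix + '::' + os.path.basename(parsed.path)

    crawl_once_key('ftp://ftp.example.org/DESY/desy_collection.xml')
    # -> 'remote::desy_collection.xml'
    crawl_once_key('file:///tmp/DESY/desy_collection.xml')
    # -> 'local::desy_collection.xml'

With only the base name as key, the second URL would have been treated as already crawled after the first one was fetched over FTP; the prefix keeps the two apart.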
Signed-off-by: David Caro --- hepcrawl/middlewares.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 1044ba0d..99551e93 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -13,6 +13,7 @@ import os import time +import logging from ftplib import FTP from six.moves.urllib.parse import urlparse @@ -23,6 +24,9 @@ from hepcrawl.utils import ftp_connection_info +LOGGER = logging.getLogger(__name__) + + class ErrorHandlingMiddleware(object): @classmethod def from_crawler(cls, crawler): @@ -88,15 +92,26 @@ class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): """ def process_request(self, request, spider): if not request.meta.get('crawl_once', self.default): + if 'crawl_once' in request.meta: + LOGGER.info('Crawl-Once: skipping by explicit crawl_once meta') + else: + LOGGER.info('Crawl-Once: skipping by default crawl_once meta') return - request.meta['crawl_once_key'] = os.path.basename(request.url) + request.meta['crawl_once_key'] = self._get_key(request) request.meta['crawl_once_value'] = self._get_timestamp(request, spider) if not self._has_to_be_crawled(request, spider): + LOGGER.info( + 'Crawl-Once: Skipping due to `has_to_be_crawled`, %s' % request + ) self.stats.inc_value('crawl_once/ignored') raise IgnoreRequest() + LOGGER.info( + 'Crawl-Once: Not skipping: %s' % request + ) + def _has_to_be_crawled(self, request, spider): request_db_key = self._get_key(request) @@ -107,6 +122,16 @@ def _has_to_be_crawled(self, request, spider): old_file_timestamp = self.db.get(key=request_db_key) return new_file_timestamp > old_file_timestamp + def _get_key(self, request): + parsed_url = urlparse(request.url) + fname = os.path.basename(parsed_url.path) + if parsed_url.scheme == 'file': + prefix = 'local' + else: + prefix = 'remote' + + return prefix + '::' + fname + @classmethod def _get_timestamp(cls, request, spider): parsed_url = urlparse(request.url) From 1a53584b22f0fb504f270c65bea36dd3306ecd97 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:45:24 +0200 Subject: [PATCH 24/26] pipelines: added extra debug log Signed-off-by: David Caro --- hepcrawl/pipelines.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 7a51e4cb..d9949338 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -17,6 +17,7 @@ import os import shutil +import pprint import requests @@ -93,10 +94,16 @@ def open_spider(self, spider): def _post_enhance_item(self, item, spider): source = spider.name - return item_to_hep( + enhanced_record = item_to_hep( item=item, source=source, ) + spider.logger.debug( + 'Got post-enhanced hep record:\n%s' % pprint.pformat( + enhanced_record + ) + ) + return enhanced_record def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" From 3110b0f9d1a954cf071b4ffbeec5f75c9f3263f5 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:45:50 +0200 Subject: [PATCH 25/26] desy: adapt to the new middleware As It changes the actual url of the files to download, the hash also changes. 
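Context for the fixture changes below: the expected file paths are derived from the download URL, so switching the spider to full 'file://'/'ftp://' URIs changes every expected hash. A minimal sketch of the naming convention, assuming the files are stored by Scrapy's stock FilesPipeline (which names a download after the SHA-1 of its request URL); the example path is a placeholder.

    import hashlib

    from scrapy.utils.python import to_bytes

    def stored_file_name(url, extension='.txt'):
        # FilesPipeline-style name: 'full/<sha1 of the URL><extension>'.
        return 'full/' + hashlib.sha1(to_bytes(url)).hexdigest() + extension

    stored_file_name('file:///tmp/DESY/FFT/test_fft_1.txt')
    # A different scheme or host in the URL yields a different name for the
    # same file, which is why the expected JSON below is regenerated.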
Signed-off-by: David Caro --- hepcrawl/spiders/desy_spider.py | 15 +++++- .../fixtures/desy_local_records_expected.json | 22 ++++----- tests/functional/desy/test_desy.py | 48 ++++++++++--------- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index ec70ec39..efae063e 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -178,13 +178,18 @@ def start_requests(self): yield request @staticmethod - def _get_full_uri(current_path, base_url, schema, hostname=''): + def _get_full_uri(current_path, base_url, schema, hostname=None): + hostname = hostname or '' if os.path.isabs(current_path): full_path = current_path else: full_path = os.path.join(base_url, current_path) - return '{schema}://{hostname}{full_path}'.format(**vars()) + return '{schema}://{hostname}{full_path}'.format( + schema=schema, + hostname=hostname, + full_path=full_path, + ) def parse(self, response): """Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`. @@ -208,8 +213,12 @@ def parse(self, response): url_schema = 'file' hostname = None + self.log('Getting marc xml records...') marcxml_records = self._get_marcxml_records(response.body) + self.log('Got %d marc xml records' % len(marcxml_records)) + self.log('Getting hep records...') hep_records = self._hep_records_from_marcxml(marcxml_records) + self.log('Got %d hep records' % len(hep_records)) for hep_record in hep_records: list_file_urls = [ @@ -222,12 +231,14 @@ def parse(self, response): for fft_path in hep_record['_fft'] ] + self.log('Got the following fft urls: %s' % list_file_urls) parsed_item = ParsedItem( record=hep_record, file_urls=list_file_urls, ftp_params=ftp_params, record_format='hep', ) + self.log('Got item: %s' % parsed_item) yield parsed_item diff --git a/tests/functional/desy/fixtures/desy_local_records_expected.json b/tests/functional/desy/fixtures/desy_local_records_expected.json index 1dc784b9..dc7baf23 100644 --- a/tests/functional/desy/fixtures/desy_local_records_expected.json +++ b/tests/functional/desy/fixtures/desy_local_records_expected.json @@ -10,7 +10,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -19,7 +19,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -78,7 +78,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -87,7 +87,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -146,7 +146,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -155,7 +155,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -214,7 +214,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -223,7 +223,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -1754,7 +1754,7 @@ "format": ".pdf", "filename": "dummy", "version": 1, - "path": "/tmp/file_urls/full/c011422ef40ef111a72bd72092066dd3c1cc7a39.pdf", + "path": "/tmp/file_urls/full/0df3efe7842cf285ae0eeed845cca003dd755674.pdf", "type": "Main" }, { @@ -1763,7 +1763,7 @@ "format": ".txt", "filename": "test_fft_1", "version": 1, - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main" }, { @@ -1772,7 +1772,7 @@ "format": ".txt", "filename": "test_fft_2", "version": 1, - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main" } ], diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index d7286f7e..c3e7ca4f 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -13,6 +13,7 @@ import copy import hashlib +import os from time import sleep import pytest @@ -76,6 +77,29 @@ def _generate_md5_hash(file_path): assert file_1_hash == file_2_hash +def assert_ffts_content_matches_expected(record): + for fft_field in record.get('_fft', []): + assert_fft_content_matches_expected(fft_field) + + +def assert_fft_content_matches_expected(fft_field): + expected_file_name = get_file_name_from_fft(fft_field) + assert_files_equal(expected_file_name, fft_field['path']) + + +def get_file_name_from_fft(fft_field): + file_path = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + fft_field['filename'] + fft_field['format'], + test_suite='functional', + ) + return file_path + + def get_ftp_settings(): netrc_location = get_test_suite_path( 'desy', @@ -120,6 +144,7 @@ def cleanup(): sleep(10) yield + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) clean_dir('/tmp/file_urls') clean_dir('/tmp/DESY') @@ -180,26 +205,3 @@ def test_desy( for record in gotten_results: assert_ffts_content_matches_expected(record) - - -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) - - -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - 
assert_files_equal(expected_file_name, fft_field['path']) - - -def get_file_name_from_fft(fft_field): - file_path = get_test_suite_path( - 'desy', - 'fixtures', - 'ftp_server', - 'DESY', - 'FFT', - fft_field['filename'] + fft_field['format'], - test_suite='functional', - ) - return file_path From bd72a4fe6366c7d1980a49c8ca981185669246bf Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 13:59:39 +0200 Subject: [PATCH 26/26] wsp: adapt to new middleware and refactor Signed-off-by: David Caro --- hepcrawl/spiders/wsp_spider.py | 104 +++++++++++++++++++------------ tests/functional/wsp/test_wsp.py | 74 +++++++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3498ca31..1868c9c5 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -49,6 +49,19 @@ class WorldScientificSpider(Jats, XMLFeedSpider): ``WorldScientificSpider.parse_node()``. + Args: + local_package_dir(str): path to the local directory holding the zip + files to parse and extract the records for, if set, will ignore all + the ftp options. + ftp_folder(str): remote folder in the ftp server to get the zip files + from. + ftp_host(str): host name of the ftp server to connect to. + ftp_netrc(str): path to the netrc file containing the authentication + settings for the ftp. + target_folder(str): path to the temporary local directory to download + the files to. + + Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: @@ -80,11 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, - package_path=None, + local_package_dir=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, - tmp_dir=None, + target_folder=None, *args, **kwargs ): @@ -93,53 +106,62 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.tmp_dir = ( - tmp_dir or + self.target_folder = ( + target_folder or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') ) - self.package_path = package_path + self.local_package_dir = local_package_dir - def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" - if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder + def _get_local_requests(self): + new_files_paths = local_list_files( + self.local_package_dir, + self.target_folder + ) + + for file_path in new_files_paths: + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, ) - for file_path in new_files_paths: - yield Request( - "file://{0}".format(file_path), - callback=self.handle_package_file, - ) - else: - ftp_host, ftp_params = ftp_connection_info( - self.ftp_host, - self.ftp_netrc, + def _get_remote_requests(self): + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) + + new_files_paths = ftp_list_files( + self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'] + ) + + for remote_file in new_files_paths: + # Cast to byte-string for scrapy compatibility + remote_file = str(remote_file) + ftp_params["ftp_local_filename"] = os.path.join( + self.target_folder, + os.path.basename(remote_file) ) - new_files_paths = ftp_list_files( - self.ftp_folder, - destination_folder=self.target_folder, - ftp_host=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'] 
+ remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp ) - for remote_file in new_files_paths: - # Cast to byte-string for scrapy compatibility - remote_file = str(remote_file) - ftp_params["ftp_local_filename"] = os.path.join( - self.tmp_dir, - os.path.basename(remote_file) - ) - - remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp - ) + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.local_package_dir: + requests_iter = self._get_local_requests() + else: + requests_iter = self._get_remote_requests() + + for request in requests_iter: + yield request def handle_package_ftp(self, response): """Handle a zip package and yield every XML found.""" @@ -158,7 +180,7 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + xml_files = unzip_xml_files(zip_filepath, self.target_folder) for xml_file in xml_files: yield Request( diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 27f36955..493837ec 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -28,13 +28,15 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record @pytest.fixture(scope="function") -def set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'wsp', 'fixtures', @@ -43,7 +45,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). 
sleep(10) yield { @@ -73,7 +76,7 @@ def set_up_local_environment(): 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'package_path': package_location, + 'local_package_dir': package_location, } } @@ -105,8 +108,10 @@ def remove_generated_files(package_location): 'smoke', ] ) -def test_wsp_ftp(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -114,14 +119,18 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -139,22 +148,29 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp_crawl_twice(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -162,11 +178,12 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=20, + events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] @@ -188,7 +205,9 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results): ] ) def test_wsp_local_package_path(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -203,7 +222,9 @@ def 
test_wsp_local_package_path(set_up_local_environment, expected_results): ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results @@ -221,8 +242,13 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): 'crawl_twice', ] ) -def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) +def test_wsp_local_package_path_crawl_twice( + set_up_local_environment, + expected_results, +): + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -236,7 +262,9 @@ def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_r ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results
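For reference, the renamed spider arguments can also be exercised outside the docker/scrapyd setup. A rough sketch using Scrapy's CrawlerProcess; the paths are placeholders, and in this repository the spider is normally scheduled through scrapyd and Celery rather than run this way.

    from scrapy.crawler import CrawlerProcess

    from hepcrawl.spiders.wsp_spider import WorldScientificSpider

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(
        WorldScientificSpider,
        local_package_dir='/tmp/WSP/packages',  # previously `package_path`
        target_folder='/tmp/WSP/extracted',     # previously `tmp_dir`
    )
    process.start()

When `local_package_dir` is set the spider ignores the FTP options entirely, mirroring what the functional tests above do through the `CRAWLER_ARGUMENTS` dict.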