diff --git a/.gitignore b/.gitignore index 01895086..3b606de7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ nosetests.xml coverage.xml twistd.pid .coverage.* +tests/unit/responses/edp/test_gz +tests/unit/responses/edp/test_rich # Translations *.mo @@ -57,6 +59,8 @@ jobs dbs items logs +.scrapy +scrapy_feed_uri # Local settings local_settings.py diff --git a/.travis.yml b/.travis.yml index 91407e6e..53f7dee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: install: - travis_retry docker-compose -f docker-compose.deps.yml run --rm pip - - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd_deploy + - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd-deploy script: - travis_retry docker-compose -f docker-compose.test.yml run --rm ${SUITE} diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d14d7c87..074f50ce 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -65,10 +66,17 @@ services: command: bash -c "rm -f twistd.pid && exec scrapyd" links: - celery + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:6800/listprojects.json" - scrapyd_deploy: + scrapyd-deploy: <<: *service_base - command: bash -c "sleep 8 && scrapyd-deploy" # make sure that the scrapyd is up + command: bash -c "scrapyd-deploy" links: - scrapyd diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index dab5c7e4..99551e93 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -11,10 +11,23 @@ from __future__ import absolute_import, division, print_function -class ErrorHandlingMiddleware(object): +import os +import time +import logging + +from ftplib import FTP +from six.moves.urllib.parse import urlparse + +from scrapy.exceptions import IgnoreRequest +from scrapy_crawl_once.middlewares import CrawlOnceMiddleware + +from hepcrawl.utils import ftp_connection_info + - """Log errors.""" +LOGGER = logging.getLogger(__name__) + +class ErrorHandlingMiddleware(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) @@ -24,13 +37,142 @@ def __init__(self, settings): def process_spider_exception(self, response, exception, spider): """Register the error in the spider and continue.""" - self.process_exception(response, exception, spider) + return self.process_exception(response, exception, spider) def process_exception(self, request, exception, spider): """Register the error in the spider and continue.""" - if 'errors' not in spider.state: - spider.state['errors'] = [] - spider.state['errors'].append({ + spider.state.setdefault('errors', []).append({ 'exception': exception, 'sender': request, }) + + +class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): + """ + This spider and downloader middleware allows to avoid re-crawling pages + which were already downloaded in previous crawls. + + To enable it, modify ``settings.py``:: + + SPIDER_MIDDLEWARES = { + # ... + 'scrapy_crawl_once.CrawlOnceMiddleware': 100, + # ... + } + + DOWNLOADER_MIDDLEWARES = { + # ... + 'scrapy_crawl_once.CrawlOnceMiddleware': 50, + # ... + } + + By default it does nothing. 
To avoid crawling a particular page + multiple times set ``request.meta['crawl_once'] = True``. Other + ``request.meta`` keys that modify it's behavior: + + * ``crawl_once_value`` - a value to store in DB. By default, timestamp + is stored for Http/Https requests and last-modified is stored for + FTP/File requests. + * ``crawl_once_key`` - unique file name is used. + + Settings: + + * ``CRAWL_ONCE_ENABLED``:set it to False to disable middleware. Default + is True. + * ``CRAWL_ONCE_PATH``: a path to a folder with crawled requests database. + By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT``: default value for ``crawl_once`` meta key (False + by default). When True, all requests are handled by this middleware + unless disabled explicitly using + ``request.meta['crawl_once'] = False``. + + + For more info see: https://github.com/TeamHG-Memex/scrapy-crawl-once + """ + def process_request(self, request, spider): + if not request.meta.get('crawl_once', self.default): + if 'crawl_once' in request.meta: + LOGGER.info('Crawl-Once: skipping by explicit crawl_once meta') + else: + LOGGER.info('Crawl-Once: skipping by default crawl_once meta') + return + + request.meta['crawl_once_key'] = self._get_key(request) + request.meta['crawl_once_value'] = self._get_timestamp(request, spider) + + if not self._has_to_be_crawled(request, spider): + LOGGER.info( + 'Crawl-Once: Skipping due to `has_to_be_crawled`, %s' % request + ) + self.stats.inc_value('crawl_once/ignored') + raise IgnoreRequest() + + LOGGER.info( + 'Crawl-Once: Not skipping: %s' % request + ) + + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) + + if request_db_key not in self.db: + return True + + new_file_timestamp = self._get_timestamp(request, spider) + old_file_timestamp = self.db.get(key=request_db_key) + return new_file_timestamp > old_file_timestamp + + def _get_key(self, request): + parsed_url = urlparse(request.url) + fname = os.path.basename(parsed_url.path) + if parsed_url.scheme == 'file': + prefix = 'local' + else: + prefix = 'remote' + + return prefix + '::' + fname + + @classmethod + def _get_timestamp(cls, request, spider): + parsed_url = urlparse(request.url) + full_url = request.url + if parsed_url.scheme == 'ftp': + last_modified = cls._get_ftp_timestamp(spider, full_url) + elif parsed_url.scheme == 'file': + last_modified = cls._get_file_timestamp(full_url) + else: + last_modified = time.time() + + return last_modified + + @classmethod + def _get_ftp_timestamp(cls, spider, url): + ftp_host, params = ftp_connection_info( + spider.ftp_host, + spider.ftp_netrc, + ) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + cls._get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + @staticmethod + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + @staticmethod + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 8cd31c0e..d9949338 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -16,6 +16,8 @@ from __future__ import absolute_import, division, print_function import os +import shutil +import pprint import requests @@ -92,10 +94,16 @@ def open_spider(self, spider): def _post_enhance_item(self, item, 
spider): source = spider.name - return item_to_hep( + enhanced_record = item_to_hep( item=item, source=source, ) + spider.logger.debug( + 'Got post-enhanced hep record:\n%s' % pprint.pformat( + enhanced_record + ) + ) + return enhanced_record def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" @@ -124,7 +132,8 @@ def _prepare_payload(self, spider): ] return payload - def _cleanup(self, spider): + @staticmethod + def _cleanup(spider): """Run cleanup.""" # Cleanup errors if 'errors' in spider.state: @@ -175,6 +184,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 31d608bf..216f7694 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -62,14 +62,23 @@ # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +CRAWL_ONCE_ENABLED = True +CRAWL_ONCE_DEFAULT = True +CRAWL_ONCE_PATH = os.environ.get( + 'APP_CRAWL_ONCE_PATH', + '/var/lib/scrapy/crawl_once/', +) + # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html EXTENSIONS = { diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index ec70ec39..efae063e 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -178,13 +178,18 @@ def start_requests(self): yield request @staticmethod - def _get_full_uri(current_path, base_url, schema, hostname=''): + def _get_full_uri(current_path, base_url, schema, hostname=None): + hostname = hostname or '' if os.path.isabs(current_path): full_path = current_path else: full_path = os.path.join(base_url, current_path) - return '{schema}://{hostname}{full_path}'.format(**vars()) + return '{schema}://{hostname}{full_path}'.format( + schema=schema, + hostname=hostname, + full_path=full_path, + ) def parse(self, response): """Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`. 
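
The ``CRAWL_ONCE_*`` settings and the ``HepcrawlCrawlOnceMiddleware`` registration added above make request deduplication opt-out rather than opt-in (``CRAWL_ONCE_DEFAULT = True``). A minimal sketch of how a single request interacts with this, using a hypothetical ``ExampleSpider`` that is not part of this change set::

    from scrapy import Request, Spider


    class ExampleSpider(Spider):
        # Illustrative only; shows the per-request crawl_once switch.
        name = 'example'

        def start_requests(self):
            # Handled by HepcrawlCrawlOnceMiddleware: the request is recorded
            # in the CRAWL_ONCE_PATH sqlite DB and skipped on later runs,
            # unless its timestamp (st_mtime for file://, MDTM for ftp://,
            # time.time() otherwise) is newer than the stored value.
            yield Request('file:///data/records.xml', callback=self.parse)

            # Explicit opt-out: this URL is fetched on every crawl.
            yield Request(
                'file:///data/always_refresh.xml',
                meta={'crawl_once': False},
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info('Visited %s', response.url)
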
@@ -208,8 +213,12 @@ def parse(self, response): url_schema = 'file' hostname = None + self.log('Getting marc xml records...') marcxml_records = self._get_marcxml_records(response.body) + self.log('Got %d marc xml records' % len(marcxml_records)) + self.log('Getting hep records...') hep_records = self._hep_records_from_marcxml(marcxml_records) + self.log('Got %d hep records' % len(hep_records)) for hep_record in hep_records: list_file_urls = [ @@ -222,12 +231,14 @@ def parse(self, response): for fft_path in hep_record['_fft'] ] + self.log('Got the following fft urls: %s' % list_file_urls) parsed_item = ParsedItem( record=hep_record, file_urls=list_file_urls, ftp_params=ftp_params, record_format='hep', ) + self.log('Got item: %s' % parsed_item) yield parsed_item diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 058e6cc0..1868c9c5 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -13,6 +13,7 @@ import os import urlparse +import tempfile from scrapy import Request from scrapy.spiders import XMLFeedSpider @@ -42,22 +43,40 @@ class WorldScientificSpider(Jats, XMLFeedSpider): on the remote server and downloads them to a designated local folder, using ``WorldScientificSpider.start_requests()``. 2. Then the ZIP file is unpacked and it lists all the XML files found - inside, via ``WorldScientificSpider.handle_package()``. Note the callback from - ``WorldScientificSpider.start_requests()``. - 3. Finally, now each XML file is parsed via ``WorldScientificSpider.parse_node()``. + inside, via ``WorldScientificSpider.handle_package()``. Note the + callback from ``WorldScientificSpider.start_requests()``. + 3. Finally, now each XML file is parsed via + ``WorldScientificSpider.parse_node()``. + + + Args: + local_package_dir(str): path to the local directory holding the zip + files to parse and extract the records for, if set, will ignore all + the ftp options. + ftp_folder(str): remote folder in the ftp server to get the zip files + from. + ftp_host(str): host name of the ftp server to connect to. + ftp_netrc(str): path to the netrc file containing the authentication + settings for the ftp. + target_folder(str): path to the temporary local directory to download + the files to. 
Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: - $ scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + $ scrapy crawl \\ + WSP \\ + -a 'ftp_host=ftp.example.com' \\ + -a 'ftp_netrc=/path/to/netrc' """ name = 'WSP' custom_settings = {} start_urls = [] - iterator = 'iternodes' # This is actually unnecessary, since it's the default value + # This is actually unnecessary, since it's the default value + iterator = 'iternodes' itertag = 'article' allowed_article_types = [ @@ -74,10 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, - package_path=None, - ftp_folder="/WSP", + local_package_dir=None, + ftp_folder="WSP", ftp_host=None, ftp_netrc=None, + target_folder=None, *args, **kwargs ): @@ -86,45 +106,62 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" - self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) + self.target_folder = ( + target_folder or + tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + ) + self.local_package_dir = local_package_dir - def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" - if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder + def _get_local_requests(self): + new_files_paths = local_list_files( + self.local_package_dir, + self.target_folder + ) + + for file_path in new_files_paths: + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, ) - for file_path in new_files_paths: - yield Request("file://{0}".format(file_path), callback=self.handle_package_file) - else: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) - - new_files_paths = ftp_list_files( - self.ftp_folder, - destination_folder=self.target_folder, - ftp_host=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'] + def _get_remote_requests(self): + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) + + new_files_paths = ftp_list_files( + self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'] + ) + + for remote_file in new_files_paths: + # Cast to byte-string for scrapy compatibility + remote_file = str(remote_file) + ftp_params["ftp_local_filename"] = os.path.join( + self.target_folder, + os.path.basename(remote_file) ) - for remote_file in new_files_paths: - # Cast to byte-string for scrapy compatibility - remote_file = str(remote_file) - ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, - os.path.basename(remote_file) - ) - remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp - ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp + ) + + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.local_package_dir: + requests_iter = self._get_local_requests() + else: + requests_iter = self._get_remote_requests() + + for request in requests_iter: + yield request def handle_package_ftp(self, response): """Handle a zip package and yield every XML found.""" @@ -132,6 +169,7 @@ def 
handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -142,8 +180,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -155,14 +193,20 @@ def parse_node(self, response, node): node.remove_namespaces() article_type = node.xpath('@article-type').extract() self.log("Got article_type {0}".format(article_type)) - if article_type is None or article_type[0] not in self.allowed_article_types: + if ( + article_type is None or + article_type[0] not in self.allowed_article_types + ): # Filter out non-interesting article types return record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', 'addendum']: - record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href") + record.add_xpath( + 'related_article_doi', + "//related-article[@ext-link-type='doi']/@href", + ) record.add_value('journal_doctype', article_type) dois = node.xpath("//article-id[@pub-id-type='doi']/text()").extract() @@ -211,7 +255,10 @@ def parse_node(self, response, node): ) record.add_value('license', license) - record.add_value('collections', self._get_collections(node, article_type, journal_title)) + record.add_value( + 'collections', + self._get_collections(node, article_type, journal_title), + ) parsed_item = ParsedItem( record=dict(record.load_item()), @@ -220,10 +267,16 @@ def parse_node(self, response, node): return parsed_item - def _get_collections(self, node, article_type, current_journal_title): + @staticmethod + def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() - if conference or current_journal_title == "International Journal of Modern Physics: Conference Series": + if ( + conference or + current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) + ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": return ['HEP', 'Review'] diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 1347ab22..c8d94949 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -24,7 +24,13 @@ class CeleryMonitor(object): - def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2): + def __init__( + self, + app, + monitor_timeout=3, + monitor_iter_limit=100, + events_limit=2, + ): self.results = [] self.recv = None self.app = app @@ -39,7 +45,13 @@ def __enter__(self): def announce_succeeded_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK SUCCEEDED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK SUCCEEDED: %s[%s] %s' % ( + task.name, + task.uuid, + task.info(), + ) + ) tasks = self.app.AsyncResult(task.id) for task in tasks.result: self.results.append(task) @@ -48,7 +60,9 @@ def announce_succeeded_tasks(event): def announce_failed_tasks(event): 
state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),) + ) self.results.append(task.info()) self.recv.should_stop = True @@ -62,7 +76,11 @@ def announce_failed_tasks(event): return self def __exit__(self, exc_type, exc_val, exc_tb): - events_iter = self.recv.itercapture(limit=None, timeout=self.monitor_timeout, wakeup=True) + events_iter = self.recv.itercapture( + limit=None, + timeout=self.monitor_timeout, + wakeup=True, + ) self._wait_for_results(events_iter) self.connection.__exit__() @@ -84,8 +102,8 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, - events_limit, crawler_instance, + events_limit=2, project='hepcrawl', spider='WSP', settings=None, diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 73f28f96..b78c1a31 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -15,6 +15,7 @@ from scrapy.http import Request, TextResponse from scrapy.selector import Selector +from hepcrawl.settings import CRAWL_ONCE_PATH def fake_response_from_file(file_name, test_suite='unit', url='http://www.example.com', response_type=TextResponse): @@ -134,12 +135,13 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data -def clean_dir(path): +def clean_dir(path=CRAWL_ONCE_PATH): """ Deletes all contained files of given target directory path. Args: - path: Absolute path of target directory to be cleaned. + path(str): path of directory to be deleted. Default path is the produced DB per spider that + stores the requested urls. Example: diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 8fcbc735..88e5e8d2 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -25,10 +25,14 @@ import os import datetime +import logging from inspire_schemas.api import LiteratureBuilder +LOGGER = logging.getLogger(__name__) + + class UnknownItemFormat(Exception): pass @@ -195,10 +199,12 @@ def hep_to_hep(hep_record, record_files): hepcrawl one (normally, marc-ingesting spiders). 
""" if record_files: + LOGGER.debug('Updating fft fields from: %s', hep_record['_fft']) hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], record_files=record_files, ) + LOGGER.debug('Updated fft fields to: %s', hep_record['_fft']) return hep_record diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 256dd508..caff462d 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -10,6 +10,7 @@ from __future__ import absolute_import, division, print_function import os +import pprint import re from operator import itemgetter from itertools import groupby @@ -467,3 +468,6 @@ def __getattr__(self, key): def __setattr__(self, key, value): self[key] = value + + def __str__(self): + return pprint.pformat(self) diff --git a/setup.py b/setup.py index 4d1518a8..16ce46f0 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ 'inspire-schemas~=46.0', 'inspire-dojson~=46.0', 'Scrapy>=1.1.0', + 'scrapy-crawl-once~=0.1,>=0.1.1', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', 'scrapyd-client>=1.0.1', diff --git a/tests/Dockerfile.hepcrawl_base b/tests/Dockerfile.hepcrawl_base index eb91b69f..8db9f43e 100644 --- a/tests/Dockerfile.hepcrawl_base +++ b/tests/Dockerfile.hepcrawl_base @@ -26,10 +26,10 @@ RUN yum install -y epel-release && \ python-virtualenv && \ yum clean all -RUN mkdir /code /hepcrawl_venv +RUN mkdir /code /hepcrawl_venv /var/lib/scrapy RUN useradd test -RUN chown -R test:test /code /hepcrawl_venv +RUN chown -R test:test /code /hepcrawl_venv /var/lib/scrapy ADD ./docker_entrypoint.sh /docker_entrypoint.sh ADD ./fix_rights /fix_rights diff --git a/tests/fix_rights b/tests/fix_rights index 98677b2c..ecf219b0 100755 Binary files a/tests/fix_rights and b/tests/fix_rights differ diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 0f58b17d..22025020 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -20,6 +20,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) @@ -51,6 +52,8 @@ def set_up_local_environment(): } } + clean_dir() + @pytest.mark.parametrize( 'expected_results', @@ -84,3 +87,52 @@ def test_arxiv(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'arxiv', + 'fixtures', + 'arxiv_smoke_record.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_arxiv_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=1, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + 
**set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/functional/desy/fixtures/desy_local_records_expected.json b/tests/functional/desy/fixtures/desy_local_records_expected.json index 1dc784b9..dc7baf23 100644 --- a/tests/functional/desy/fixtures/desy_local_records_expected.json +++ b/tests/functional/desy/fixtures/desy_local_records_expected.json @@ -10,7 +10,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -19,7 +19,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -78,7 +78,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -87,7 +87,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -146,7 +146,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -155,7 +155,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -214,7 +214,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -223,7 +223,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -1754,7 +1754,7 @@ "format": ".pdf", "filename": "dummy", "version": 1, - "path": "/tmp/file_urls/full/c011422ef40ef111a72bd72092066dd3c1cc7a39.pdf", + "path": "/tmp/file_urls/full/0df3efe7842cf285ae0eeed845cca003dd755674.pdf", "type": "Main" }, { @@ -1763,7 +1763,7 @@ "format": ".txt", "filename": "test_fft_1", "version": 1, - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main" }, { @@ -1772,7 +1772,7 @@ "format": ".txt", "filename": "test_fft_2", "version": 1, - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main" } ], diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index d7286f7e..c3e7ca4f 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -13,6 +13,7 @@ import copy import hashlib +import os from time import sleep import pytest @@ -76,6 +77,29 @@ def _generate_md5_hash(file_path): assert file_1_hash == file_2_hash +def assert_ffts_content_matches_expected(record): + for fft_field in record.get('_fft', []): + assert_fft_content_matches_expected(fft_field) + + +def assert_fft_content_matches_expected(fft_field): + expected_file_name = get_file_name_from_fft(fft_field) + assert_files_equal(expected_file_name, fft_field['path']) + + +def get_file_name_from_fft(fft_field): + file_path = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + fft_field['filename'] + fft_field['format'], + test_suite='functional', + ) + return file_path + + def get_ftp_settings(): netrc_location = get_test_suite_path( 'desy', @@ -120,6 +144,7 @@ def cleanup(): sleep(10) yield + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) clean_dir('/tmp/file_urls') clean_dir('/tmp/DESY') @@ -180,26 +205,3 @@ def test_desy( for record in gotten_results: assert_ffts_content_matches_expected(record) - - -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) - - -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - assert_files_equal(expected_file_name, fft_field['path']) - - -def get_file_name_from_fft(fft_field): - file_path = get_test_suite_path( - 'desy', - 'fixtures', - 'ftp_server', - 'DESY', - 'FFT', - fft_field['filename'] + fft_field['format'], - test_suite='functional', - ) - return file_path diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 42f691c9..493837ec 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -28,13 +28,15 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record @pytest.fixture(scope="function") -def 
set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'wsp', 'fixtures', @@ -43,7 +45,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). sleep(10) yield { @@ -55,7 +58,8 @@ def set_up_ftp_environment(): } } - clean_dir(path='/tmp/WSP/') + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @pytest.fixture(scope="function") @@ -72,7 +76,7 @@ def set_up_local_environment(): 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'package_path': package_location, + 'local_package_dir': package_location, } } @@ -80,7 +84,8 @@ def set_up_local_environment(): def remove_generated_files(package_location): - clean_dir(path='/tmp/WSP/') + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) for dir_name in dirs: @@ -103,8 +108,10 @@ def remove_generated_files(package_location): 'smoke', ] ) -def test_wsp_ftp(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -112,18 +119,78 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_ftp_crawl_twice(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=2, + crawler_instance=crawler, + project=ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=2, + crawler_instance=crawler, + project=ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] + + @pytest.mark.parametrize( 'expected_results', [ @@ -138,7 +205,9 @@ def 
test_wsp_ftp(set_up_ftp_environment, expected_results): ] ) def test_wsp_local_package_path(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -153,6 +222,63 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] + + assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_local_package_path_crawl_twice( + set_up_local_environment, + expected_results, +): + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index 96bf9af1..d5ff6331 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -13,8 +13,10 @@ from hepcrawl.spiders import alpha_spider -from hepcrawl.testlib.fixtures import fake_response_from_file - +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture def results(): diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index 3bb3698c..e8e64962 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -12,7 +12,10 @@ import pytest from hepcrawl.spiders import aps_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 1f4155c9..21a5fd99 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest @@ -16,7 +21,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,10 +52,16 @@ def _get_processed_record(item, spider): ) ) + assert parsed_items pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [ + _get_processed_record(parsed_item, spider) + for parsed_item in parsed_items + ] + + clean_dir() def test_page_nr(many_results): diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index 329a2a49..709c6d9a 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -17,7 +17,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,8 +47,9 @@ def _get_processed_item(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_abstracts(results): diff --git a/tests/unit/test_pipelines.py b/tests/unit/test_pipelines.py index 050df092..08b81319 100644 --- a/tests/unit/test_pipelines.py +++ b/tests/unit/test_pipelines.py @@ -21,7 +21,10 @@ from hepcrawl.spiders import arxiv_spider from hepcrawl.pipelines import InspireAPIPushPipeline -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,7 +47,9 @@ def json_spider_record(tmpdir): ) parsed_record = items.next() assert parsed_record - return spider, parsed_record + yield spider, parsed_record + + clean_dir() @pytest.fixture diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..0eccd4fc 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -19,7 +19,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import pos_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -55,7 +58,9 @@ def record(scrape_pos_page_body): parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record - return parsed_record + yield parsed_record + + clean_dir() def test_titles(record): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 291d00d0..f14144fd 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest import os @@ -18,7 +23,16 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import wsp_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) + + +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + yield + clean_dir() def create_spider(): @@ -44,7 +58,10 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return (pipeline.process_item(record, spider) for record in records) + return ( + pipeline.process_item(record, spider) + for record in records + ) def get_one_record(response_file_name): @@ -67,17 +84,25 @@ def override_generated_fields(record): [ get_one_record('world_scientific/sample_ws_record.xml'), ( - "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have recently been used as light absorbers in hybrid" - " organic-inorganic solid-state solar cells, with efficiencies above 15%. To date, it is essential to" - " add Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the hole transport materials (HTM) to get" - " a higher conductivity. However, the detrimental effect of high LiTFSI concentration on the charge transport" - ", DOS in the conduction band of the TiO$_{2}$ substrate and device stability results in an overall " - "compromise for a satisfactory device. Using a higher mobility hole conductor to avoid lithium salt " - "is an interesting alternative. Herein, we successfully made an efficient perovskite solar cell by " - "applying a hole conductor PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in the absence of" - " LiTFSI. Under AM 1.5 illumination of 100 mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " - "comparable to the efficiency of 12.3% with the addition of 1.3 mM LiTFSI. An unsealed device without " - "Li$^{+}$ shows interestingly a promising stability." + "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have " + "recently been used as light absorbers in hybrid" + " organic-inorganic solid-state solar cells, with " + "efficiencies above 15%. To date, it is essential to add " + "Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the " + "hole transport materials (HTM) to get a higher conductivity. " + "However, the detrimental effect of high LiTFSI concentration " + "on the charge transport, DOS in the conduction band of the " + "TiO$_{2}$ substrate and device stability results in an " + "overall compromise for a satisfactory device. Using a higher " + "mobility hole conductor to avoid lithium salt is an " + "interesting alternative. Herein, we successfully made an " + "efficient perovskite solar cell by applying a hole conductor " + "PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in " + "the absence of LiTFSI. Under AM 1.5 illumination of 100 " + "mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " + "comparable to the efficiency of 12.3% with the addition of " + "1.3 mM LiTFSI. An unsealed device without Li$^{+}$ shows " + "interestingly a promising stability." 
), ], ], @@ -98,7 +123,10 @@ def test_abstract(generated_record, expected_abstract): get_one_record('world_scientific/sample_ws_record.xml'), [{ 'source': 'WSP', - 'title': 'High-efficient Solid-state Perovskite Solar Cell Without Lithium Salt in the Hole Transport Material', + 'title': ( + 'High-efficient Solid-state Perovskite Solar Cell Without ' + 'Lithium Salt in the Hole Transport Material' + ), }], ], ], @@ -291,12 +319,18 @@ def test_publication_info(generated_record, expected_publication_info): [ get_one_record('world_scientific/sample_ws_record.xml'), { - 'authors': ["BI, DONGQIN", "BOSCHLOO, GERRIT", "HAGFELDT, ANDERS"], + 'authors': [ + "BI, DONGQIN", + "BOSCHLOO, GERRIT", + "HAGFELDT, ANDERS", + ], 'affiliation': ( - 'Department of Chemistry-Angstrom Laboratory, Uppsala University, Box 532, SE 751 20 Uppsala, Sweden' + 'Department of Chemistry-Angstrom Laboratory, Uppsala ' + 'University, Box 532, SE 751 20 Uppsala, Sweden' ), 'xref_affiliation': ( - 'Physics Department, Brookhaven National Laboratory, Upton, NY 11973, USA' + 'Physics Department, Brookhaven National Laboratory, ' + 'Upton, NY 11973, USA' ), }, ], @@ -314,11 +348,14 @@ def test_authors(generated_record, expected_authors): for index, name in enumerate(expected_authors['authors']): assert generated_record['authors'][index]['full_name'] == name assert expected_authors['affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff in generated_record['authors'][index]['affiliations'] ] if index == 1: assert expected_authors['xref_affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff + in generated_record['authors'][index]['affiliations'] ] @@ -413,7 +450,10 @@ def test_pipeline_record(generated_record): 'abstracts': [ { 'source': 'WSP', - 'value': u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa.', + 'value': ( + u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, ' + 'bla blaaa blaa. Bla blaaa blaa.' + ), }, ], 'acquisition_source': { @@ -426,7 +466,10 @@ def test_pipeline_record(generated_record): { 'affiliations': [ { - 'value': u'Department, University, City, City_code 123456, C. R. Country_2', + 'value': ( + u'Department, University, City, City_code 123456, ' + 'C. R. Country_2' + ), }, ], 'full_name': u'author_surname_2, author_name_1',
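
The unit-test updates above all converge on one pattern: after each test, ``clean_dir()`` (which now defaults to ``CRAWL_ONCE_PATH``) removes the crawl-once database so one test's requests are not treated as already crawled by the next. A minimal sketch of that pattern for a hypothetical new test module (module and test names are illustrative only)::

    import pytest

    from hepcrawl.testlib.fixtures import clean_dir


    @pytest.fixture(scope='function', autouse=True)
    def cleanup():
        # Run the test body first, then wipe the crawl-once DB it may have
        # produced, mirroring tests/unit/test_world_scientific.py above.
        yield
        clean_dir()


    def test_something():
        # Placeholder body; every test in the module gets the cleanup above.
        assert True
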