From e9fb27a01a797b4073a515a94cd7c36779034482 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:32:51 +0200 Subject: [PATCH] wsp: add temporary folder to the crawlings * Adds: creates a temporary folder for unzipping the crawled files, and deletes this temporary folder in the `InspireCeleryPushPipeline.close_spider` method. Addresses #161 Signed-off-by: Spiros Delviniotis --- docker-compose.test.yml | 1 - hepcrawl/pipelines.py | 5 +++++ hepcrawl/spiders/wsp_spider.py | 25 ++++++++++++++++++------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 7ffe0122..7c4c2668 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -25,7 +25,6 @@ services: - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/ - ${PWD}:/code/ - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - - /tmp/WSP:/tmp/WSP functional_wsp: <<: *service_base diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 62ba867c..1590d697 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -17,6 +17,7 @@ import datetime import os +import shutil import requests @@ -184,6 +185,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index d7efaf41..1f0786d3 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -13,6 +13,8 @@ import os import urlparse +import tempfile +import shutil from scrapy import Request from scrapy.spiders import XMLFeedSpider @@ -71,16 +73,23 @@ class WorldScientificSpider(Jats, XMLFeedSpider): 'rapid-communications' ] - def __init__(self, 
package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs): + def __init__( + self, + package_path=None, + ftp_folder="WSP", + ftp_host=None, + ftp_netrc=None, + tmp_dir=None, + *args, + **kwargs + ): """Construct WSP spider.""" super(WorldScientificSpider, self).__init__(*args, **kwargs) self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.local_tmp_folder = "/tmp/WSP" + self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') self.package_path = package_path - if not os.path.exists(self.local_tmp_folder): - os.makedirs(self.local_tmp_folder) def start_requests(self): """List selected folder on remote FTP and yield zip files.""" @@ -103,9 +112,10 @@ def start_requests(self): # Cast to byte-string for scrapy compatibility remote_file = str(remote_file) ftp_params["ftp_local_filename"] = os.path.join( - self.local_tmp_folder, + self.tmp_dir, os.path.basename(remote_file) ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) yield Request( str(remote_url), @@ -119,6 +129,7 @@ def handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -129,8 +140,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file),