From 5f6f873f401b9be907900c779350bd61401a70a9 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Fri, 18 Aug 2017 14:32:51 +0200 Subject: [PATCH] wsp: add temporary folder to the crawlings * Adds: creates a temporary folder to unzip the crawled files into, and deletes this temporary folder in the `InspireCeleryPushPipeline.close_spider` method. Addresses #161 Signed-off-by: Spiros Delviniotis --- hepcrawl/pipelines.py | 5 +++++ hepcrawl/spiders/wsp_spider.py | 13 +++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 8cd31c0e..4ff2b3c9 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -16,6 +16,7 @@ from __future__ import absolute_import, division, print_function import os +import shutil import requests @@ -175,6 +176,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 058e6cc0..bc569bb3 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -78,6 +78,7 @@ def __init__( ftp_folder="/WSP", ftp_host=None, ftp_netrc=None, + tmp_dir=None, *args, **kwargs ): @@ -86,10 +87,8 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" + self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) def start_requests(self): """List selected folder on remote FTP and yield new zip files.""" @@ -116,9 +115,10 @@ def start_requests(self): # Cast to byte-string for scrapy 
compatibility remote_file = str(remote_file) ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, + self.tmp_dir, os.path.basename(remote_file) ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) yield Request( str(remote_url), @@ -132,6 +132,7 @@ def handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -142,8 +143,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.tmp_dir) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file),