Skip to content

Commit

Permalink
wsp: add temporary folder to the crawlings
Browse files Browse the repository at this point in the history
* Adds: creates a temporary folder into which the crawled files are unzipped, and deletes this
	temporary folder in the `InspireCeleryPushPipeline.close_spider` method.

Addresses inspirehep#161

Signed-off-by: Spiros Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis authored and david-caro committed Sep 20, 2017
1 parent 98a7796 commit a758819
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
5 changes: 5 additions & 0 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from __future__ import absolute_import, division, print_function

import os
import shutil

import requests

Expand Down Expand Up @@ -175,6 +176,10 @@ def close_spider(self, spider):
"""Post results to BROKER API."""
from celery.utils.log import get_task_logger
logger = get_task_logger(__name__)

if hasattr(spider, 'tmp_dir'):
shutil.rmtree(path=spider.tmp_dir, ignore_errors=True)

if 'SCRAPY_JOB' in os.environ and self.count > 0:
task_endpoint = spider.settings[
'API_PIPELINE_TASK_ENDPOINT_MAPPING'
Expand Down
13 changes: 7 additions & 6 deletions hepcrawl/spiders/wsp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def __init__(
ftp_folder="/WSP",
ftp_host=None,
ftp_netrc=None,
tmp_dir=None,
*args,
**kwargs
):
Expand All @@ -86,10 +87,8 @@ def __init__(
self.ftp_folder = ftp_folder
self.ftp_host = ftp_host
self.ftp_netrc = ftp_netrc
self.target_folder = "/tmp/WSP"
self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_')
self.package_path = package_path
if not os.path.exists(self.target_folder):
os.makedirs(self.target_folder)

def start_requests(self):
"""List selected folder on remote FTP and yield new zip files."""
Expand All @@ -116,9 +115,10 @@ def start_requests(self):
# Cast to byte-string for scrapy compatibility
remote_file = str(remote_file)
ftp_params["ftp_local_filename"] = os.path.join(
self.target_folder,
self.tmp_dir,
os.path.basename(remote_file)
)

remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
yield Request(
str(remote_url),
Expand All @@ -132,6 +132,7 @@ def handle_package_ftp(self, response):
zip_filepath = response.body
zip_target_folder, dummy = os.path.splitext(zip_filepath)
xml_files = unzip_xml_files(zip_filepath, zip_target_folder)

for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
Expand All @@ -142,8 +143,8 @@ def handle_package_file(self, response):
"""Handle a local zip package and yield every XML."""
self.log("Visited file %s" % response.url)
zip_filepath = urlparse.urlsplit(response.url).path
zip_target_folder, dummy = os.path.splitext(zip_filepath)
xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
xml_files = unzip_xml_files(zip_filepath, self.tmp_dir)

for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
Expand Down

0 comments on commit a758819

Please sign in to comment.