wsp: add temporary folder for crawled files
* Creates a temporary folder into which the crawled files are unzipped, and deletes it in the `InspireCeleryPushPipeline.close_spider` method.

Addresses inspirehep#161

Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 18, 2017
1 parent 68c1361 commit da7a3b3
Showing 3 changed files with 22 additions and 8 deletions.
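
In outline, the change swaps the hard-coded /tmp/WSP folder for a per-spider temporary directory that the pipeline removes on shutdown. A minimal sketch of the pattern, with hypothetical ExampleSpider/ExamplePipeline names standing in for the real classes shown in the diffs below:

import shutil
import tempfile


class ExampleSpider(object):
    """Hypothetical stand-in for WorldScientificSpider."""

    def __init__(self, tmp_dir=None):
        # Each instance gets its own unique scratch directory.
        self.tmp_dir = tmp_dir or tempfile.mkdtemp(prefix='wsp_')


class ExamplePipeline(object):
    """Hypothetical stand-in for InspireCeleryPushPipeline."""

    def close_spider(self, spider):
        # Remove the scratch directory on shutdown; ignore_errors=True
        # makes this a no-op if it is already gone.
        shutil.rmtree(spider.tmp_dir, ignore_errors=True)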
1 change: 0 additions & 1 deletion docker-compose.test.yml
@@ -25,7 +25,6 @@ services:
       - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
       - ${PWD}:/code/
       - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
-      - /tmp/WSP:/tmp/WSP
 
   functional_wsp:
     <<: *service_base
4 changes: 4 additions & 0 deletions hepcrawl/pipelines.py
@@ -17,6 +17,7 @@
 
 import datetime
 import os
+import shutil
 
 import requests
 
@@ -184,6 +185,9 @@ def close_spider(self, spider):
         """Post results to BROKER API."""
         from celery.utils.log import get_task_logger
         logger = get_task_logger(__name__)
+
+        shutil.rmtree(path=spider.tmp_dir, ignore_errors=True)
+
         if 'SCRAPY_JOB' in os.environ and self.count > 0:
             task_endpoint = spider.settings[
                 'API_PIPELINE_TASK_ENDPOINT_MAPPING'
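
Note that with ignore_errors=True, shutil.rmtree silently does nothing when the path does not exist, so close_spider stays safe even for a crawl that never downloaded a package. A quick standalone illustration:

import shutil

# Does not raise, even though the path is missing.
shutil.rmtree('/nonexistent/path', ignore_errors=True)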
25 changes: 18 additions & 7 deletions hepcrawl/spiders/wsp_spider.py
@@ -13,6 +13,8 @@
 
 import os
 import urlparse
+import tempfile
+import shutil
 
 from scrapy import Request
 from scrapy.spiders import XMLFeedSpider
@@ -71,16 +73,23 @@ class WorldScientificSpider(Jats, XMLFeedSpider):
         'rapid-communications'
     ]
 
-    def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
+    def __init__(
+        self,
+        package_path=None,
+        ftp_folder="WSP",
+        ftp_host=None,
+        ftp_netrc=None,
+        tmp_dir=None,
+        *args,
+        **kwargs
+    ):
         """Construct WSP spider."""
         super(WorldScientificSpider, self).__init__(*args, **kwargs)
         self.ftp_folder = ftp_folder
         self.ftp_host = ftp_host
         self.ftp_netrc = ftp_netrc
-        self.local_tmp_folder = "/tmp/WSP"
+        self.tmp_dir = tmp_dir or tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_')
         self.package_path = package_path
-        if not os.path.exists(self.local_tmp_folder):
-            os.makedirs(self.local_tmp_folder)
 
     def start_requests(self):
         """List selected folder on remote FTP and yield zip files."""
@@ -103,9 +112,10 @@ def start_requests(self):
             # Cast to byte-string for scrapy compatibility
             remote_file = str(remote_file)
             ftp_params["ftp_local_filename"] = os.path.join(
-                self.local_tmp_folder,
+                self.tmp_dir,
                 os.path.basename(remote_file)
             )
+
             remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
             yield Request(
                 str(remote_url),
@@ -119,6 +129,7 @@ def handle_package_ftp(self, response):
         zip_filepath = response.body
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
+
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
@@ -129,8 +140,8 @@ def handle_package_file(self, response):
         """Handle a local zip package and yield every XML."""
         self.log("Visited file %s" % response.url)
         zip_filepath = urlparse.urlsplit(response.url).path
-        zip_target_folder, dummy = os.path.splitext(zip_filepath)
-        xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
+        xml_files = unzip_xml_files(zip_filepath, self.tmp_dir)
+
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
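
Taken together, every unzip target now lands under spider.tmp_dir, and the pipeline wipes that whole tree in one call. A hypothetical end-to-end check, assuming the spider can be instantiated outside a running crawl (the zip path is illustrative only):

import os
import shutil

from hepcrawl.spiders.wsp_spider import WorldScientificSpider

spider = WorldScientificSpider(package_path='records.zip')
assert os.path.isdir(spider.tmp_dir)  # mkdtemp created it eagerly

# What InspireCeleryPushPipeline.close_spider now does on shutdown:
shutil.rmtree(spider.tmp_dir, ignore_errors=True)
assert not os.path.exists(spider.tmp_dir)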
