diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3f68131f..d7efaf41 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -77,37 +77,33 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" + self.local_tmp_folder = "/tmp/WSP" self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) + if not os.path.exists(self.local_tmp_folder): + os.makedirs(self.local_tmp_folder) def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" + """List selected folder on remote FTP and yield zip files.""" if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder - ) + local_files_paths = local_list_files(self.package_path) - for file_path in new_files_paths: + for file_path in local_files_paths: yield Request("file://{0}".format(file_path), callback=self.handle_package_file) else: ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) - new_files_paths = ftp_list_files( + remote_files_paths = ftp_list_files( self.ftp_folder, - self.target_folder, server=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] ) - for remote_file in new_files_paths: + for remote_file in remote_files_paths: # Cast to byte-string for scrapy compatibility remote_file = str(remote_file) ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, + self.local_tmp_folder, os.path.basename(remote_file) ) remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 4ad9db3c..d291d9f1 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -57,7 +57,7 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): +def ftp_list_files(server_folder, server, user, password, passive_mode=False): """List files from given FTP's server folder to target folder.""" session_factory = ftputil.session.session_factory( base_class=ftplib.FTP, @@ -67,23 +67,19 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: file_names = host.listdir(os.path.join(host.curdir, '/', server_folder)) - return list_missing_files(server_folder, target_folder, file_names) + return list_files(server_folder, file_names) -def local_list_files(local_folder, target_folder): +def local_list_files(local_folder): file_names = os.listdir(local_folder) - return list_missing_files(local_folder, target_folder, file_names) + return list_files(local_folder, file_names) -def list_missing_files(remote_folder, target_folder, file_names): - missing_files = [] - for file_name in file_names: - destination_file = os.path.join(target_folder, file_name) - source_file = os.path.join(remote_folder, file_name) - if not os.path.exists(destination_file): - missing_files.append(source_file) - - return missing_files +def list_files(remote_folder, file_names): + return [ + os.path.join(remote_folder, file_name) + for file_name in file_names + ] def get_first(iterable, default=None): diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index a97f77ef..ba88b9f8 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -55,7 +55,6 @@ def set_up_ftp_environment(): } } - clean_dir() clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))