Skip to content

Commit

Permalink
wsp: refactored to support scrapy-crawl-once
Browse files Browse the repository at this point in the history
* Refactored the `WSP` spider and the `utils` module so that they no longer check whether records have already been crawled.

Addresses inspirehep#161

Signed-off-by: Spiros Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis committed Aug 17, 2017
1 parent eae96e4 commit b258ce7
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 27 deletions.
22 changes: 9 additions & 13 deletions hepcrawl/spiders/wsp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,37 +77,33 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc
self.ftp_folder = ftp_folder
self.ftp_host = ftp_host
self.ftp_netrc = ftp_netrc
self.target_folder = "/tmp/WSP"
self.local_tmp_folder = "/tmp/WSP"
self.package_path = package_path
if not os.path.exists(self.target_folder):
os.makedirs(self.target_folder)
if not os.path.exists(self.local_tmp_folder):
os.makedirs(self.local_tmp_folder)

def start_requests(self):
"""List selected folder on remote FTP and yield new zip files."""
"""List selected folder on remote FTP and yield zip files."""
if self.package_path:
new_files_paths = local_list_files(
self.package_path,
self.target_folder
)
local_files_paths = local_list_files(self.package_path)

for file_path in new_files_paths:
for file_path in local_files_paths:
yield Request("file://{0}".format(file_path), callback=self.handle_package_file)
else:
ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

new_files_paths = ftp_list_files(
remote_files_paths = ftp_list_files(
self.ftp_folder,
self.target_folder,
server=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)

for remote_file in new_files_paths:
for remote_file in remote_files_paths:
# Cast to byte-string for scrapy compatibility
remote_file = str(remote_file)
ftp_params["ftp_local_filename"] = os.path.join(
self.target_folder,
self.local_tmp_folder,
os.path.basename(remote_file)
)
remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
Expand Down
22 changes: 9 additions & 13 deletions hepcrawl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False):
return ftp_host, connection_params


def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False):
def ftp_list_files(server_folder, server, user, password, passive_mode=False):
"""List files from given FTP's server folder to target folder."""
session_factory = ftputil.session.session_factory(
base_class=ftplib.FTP,
Expand All @@ -67,23 +67,19 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive

with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
return list_missing_files(server_folder, target_folder, file_names)
return list_files(server_folder, file_names)


def local_list_files(local_folder, target_folder):
def local_list_files(local_folder):
file_names = os.listdir(local_folder)
return list_missing_files(local_folder, target_folder, file_names)
return list_files(local_folder, file_names)


def list_missing_files(remote_folder, target_folder, file_names):
missing_files = []
for file_name in file_names:
destination_file = os.path.join(target_folder, file_name)
source_file = os.path.join(remote_folder, file_name)
if not os.path.exists(destination_file):
missing_files.append(source_file)

return missing_files
def list_files(remote_folder, file_names):
return [
os.path.join(remote_folder, file_name)
for file_name in file_names
]


def get_first(iterable, default=None):
Expand Down
1 change: 0 additions & 1 deletion tests/functional/wsp/test_wsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def set_up_ftp_environment():
}
}

clean_dir()
clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))


Expand Down

0 comments on commit b258ce7

Please sign in to comment.