wsp: adapt to new middleware and refactor
Signed-off-by: David Caro <[email protected]>
david-caro committed Sep 20, 2017
1 parent 3110b0f commit bd72a4f
Showing 2 changed files with 114 additions and 64 deletions.
104 changes: 63 additions & 41 deletions hepcrawl/spiders/wsp_spider.py
@@ -49,6 +49,19 @@ class WorldScientificSpider(Jats, XMLFeedSpider):
``WorldScientificSpider.parse_node()``.
Args:
local_package_dir(str): path to the local directory holding the zip
files to parse and extract records from; if set, all the ftp
options are ignored.
ftp_folder(str): remote folder on the ftp server to get the zip files
from.
ftp_host(str): host name of the ftp server to connect to.
ftp_netrc(str): path to the netrc file containing the authentication
settings for the ftp.
target_folder(str): path to the temporary local directory to download
the files to.
Example:
To run a crawl, you need to pass FTP connection information via
``ftp_host`` and ``ftp_netrc``::
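For illustration, a minimal local-package run with the new arguments might look roughly like the sketch below; it uses the generic Scrapy CrawlerProcess entry point rather than anything introduced by this commit, and the paths are placeholders:

    from scrapy.crawler import CrawlerProcess

    from hepcrawl.spiders.wsp_spider import WorldScientificSpider

    process = CrawlerProcess()
    # With local_package_dir set, the spider skips FTP entirely and parses
    # zip packages already on disk, extracting them into target_folder.
    process.crawl(
        WorldScientificSpider,
        local_package_dir='/data/wsp_packages',
        target_folder='/tmp/wsp_extracted',
    )
    process.start()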
@@ -80,11 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider):

def __init__(
self,
package_path=None,
local_package_dir=None,
ftp_folder="WSP",
ftp_host=None,
ftp_netrc=None,
tmp_dir=None,
target_folder=None,
*args,
**kwargs
):
@@ -93,53 +106,62 @@ def __init__(
self.ftp_folder = ftp_folder
self.ftp_host = ftp_host
self.ftp_netrc = ftp_netrc
self.tmp_dir = (
tmp_dir or
self.target_folder = (
target_folder or
tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_')
)
self.package_path = package_path
self.local_package_dir = local_package_dir

def start_requests(self):
"""List selected folder on remote FTP and yield new zip files."""
if self.package_path:
new_files_paths = local_list_files(
self.package_path,
self.target_folder
def _get_local_requests(self):
new_files_paths = local_list_files(
self.local_package_dir,
self.target_folder
)

for file_path in new_files_paths:
yield Request(
"file://{0}".format(file_path),
callback=self.handle_package_file,
)

for file_path in new_files_paths:
yield Request(
"file://{0}".format(file_path),
callback=self.handle_package_file,
)
else:
ftp_host, ftp_params = ftp_connection_info(
self.ftp_host,
self.ftp_netrc,
def _get_remote_requests(self):
ftp_host, ftp_params = ftp_connection_info(
self.ftp_host,
self.ftp_netrc,
)

new_files_paths = ftp_list_files(
self.ftp_folder,
destination_folder=self.target_folder,
ftp_host=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)

for remote_file in new_files_paths:
# Cast to byte-string for scrapy compatibility
remote_file = str(remote_file)
ftp_params["ftp_local_filename"] = os.path.join(
self.target_folder,
os.path.basename(remote_file)
)

new_files_paths = ftp_list_files(
self.ftp_folder,
destination_folder=self.target_folder,
ftp_host=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
yield Request(
str(remote_url),
meta=ftp_params,
callback=self.handle_package_ftp
)

for remote_file in new_files_paths:
# Cast to byte-string for scrapy compatibility
remote_file = str(remote_file)
ftp_params["ftp_local_filename"] = os.path.join(
self.tmp_dir,
os.path.basename(remote_file)
)

remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file)
yield Request(
str(remote_url),
meta=ftp_params,
callback=self.handle_package_ftp
)
def start_requests(self):
"""List selected folder on remote FTP and yield new zip files."""
if self.local_package_dir:
requests_iter = self._get_local_requests()
else:
requests_iter = self._get_remote_requests()

for request in requests_iter:
yield request

def handle_package_ftp(self, response):
"""Handle a zip package and yield every XML found."""
@@ -158,7 +180,7 @@ def handle_package_file(self, response):
"""Handle a local zip package and yield every XML."""
self.log("Visited file %s" % response.url)
zip_filepath = urlparse.urlsplit(response.url).path
xml_files = unzip_xml_files(zip_filepath, self.tmp_dir)
xml_files = unzip_xml_files(zip_filepath, self.target_folder)

for xml_file in xml_files:
yield Request(
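The ftp_netrc argument used by _get_remote_requests above points at a standard netrc file. As a rough sketch of that format (the host, path, and credentials are placeholders, and this is not necessarily how ftp_connection_info reads it), the standard library can parse such a file like this:

    import netrc

    # A netrc file holds one entry per host, for example:
    #   machine ftp.example.com
    #   login someuser
    #   password somepassword
    auth = netrc.netrc('/path/to/netrc')
    login, account, password = auth.authenticators('ftp.example.com')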
74 changes: 51 additions & 23 deletions tests/functional/wsp/test_wsp.py
@@ -28,13 +28,15 @@

def override_generated_fields(record):
record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216'
record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad'
record['acquisition_source']['submission_number'] = (
u'5652c7f6190f11e79e8000224dabeaad'
)

return record


@pytest.fixture(scope="function")
def set_up_ftp_environment():
def ftp_environment():
netrc_location = get_test_suite_path(
'wsp',
'fixtures',
@@ -43,7 +45,8 @@ def set_up_ftp_environment():
test_suite='functional',
)

# The test must wait until the docker environment is up (takes about 10 seconds).
# The test must wait until the docker environment is up (takes about 10
# seconds).
sleep(10)

yield {
@@ -73,7 +76,7 @@ def set_up_local_environment():
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
'package_path': package_location,
'local_package_dir': package_location,
}
}

@@ -105,23 +108,29 @@ def remove_generated_files(package_location):
'smoke',
]
)
def test_wsp_ftp(set_up_ftp_environment, expected_results):
crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
def test_wsp_ftp(ftp_environment, expected_results):
crawler = get_crawler_instance(
ftp_environment.get('CRAWLER_HOST_URL'),
)

results = CeleryMonitor.do_crawl(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=100,
events_limit=1,
crawler_instance=crawler,
project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
project=ftp_environment.get('CRAWLER_PROJECT'),
spider='WSP',
settings={},
**set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
**ftp_environment.get('CRAWLER_ARGUMENTS')
)

gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]
gotten_results = [
override_generated_fields(result) for result in results
]
expected_results = [
override_generated_fields(expected) for expected in expected_results
]

assert gotten_results == expected_results

@@ -139,34 +148,42 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results):
'crawl_twice',
]
)
def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results):
crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
def test_wsp_ftp_crawl_twice(ftp_environment, expected_results):
crawler = get_crawler_instance(
ftp_environment.get('CRAWLER_HOST_URL'),
)

results = CeleryMonitor.do_crawl(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=20,
events_limit=2,
crawler_instance=crawler,
project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
project=ftp_environment.get('CRAWLER_PROJECT'),
spider='WSP',
settings={},
**set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
**ftp_environment.get('CRAWLER_ARGUMENTS')
)

gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]
gotten_results = [
override_generated_fields(result) for result in results
]
expected_results = [
override_generated_fields(expected) for expected in expected_results
]

assert gotten_results == expected_results

results = CeleryMonitor.do_crawl(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=20,
events_limit=2,
crawler_instance=crawler,
project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
project=ftp_environment.get('CRAWLER_PROJECT'),
spider='WSP',
settings={},
**set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
**ftp_environment.get('CRAWLER_ARGUMENTS')
)

gotten_results = [override_generated_fields(result) for result in results]
@@ -188,7 +205,9 @@ def test_wsp_ftp_crawl_twice(set_up_ftp_environment, expected_results):
]
)
def test_wsp_local_package_path(set_up_local_environment, expected_results):
crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
crawler = get_crawler_instance(
set_up_local_environment.get('CRAWLER_HOST_URL')
)

results = CeleryMonitor.do_crawl(
app=celery_app,
@@ -203,7 +222,9 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
)

gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]
expected_results = [
override_generated_fields(expected) for expected in expected_results
]

assert gotten_results == expected_results

@@ -221,8 +242,13 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
'crawl_twice',
]
)
def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results):
crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
def test_wsp_local_package_path_crawl_twice(
set_up_local_environment,
expected_results,
):
crawler = get_crawler_instance(
set_up_local_environment.get('CRAWLER_HOST_URL')
)

results = CeleryMonitor.do_crawl(
app=celery_app,
@@ -236,7 +262,9 @@ def test_wsp_local_package_path_crawl_twice(set_up_local_environment, expected_results):
)

gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]
expected_results = [
override_generated_fields(expected) for expected in expected_results
]

assert gotten_results == expected_results

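The functional tests above assume the dockerised crawler environment (scrapyd, the celery app, and the FTP fixture) is already up; the fixtures simply sleep until it is reachable. A rough way to run just these tests once that environment exists, the test path coming from this diff and everything else being a plain pytest invocation:

    import pytest

    # Select only the WSP functional tests; equivalent to running
    # `pytest tests/functional/wsp/test_wsp.py` from the repository root.
    pytest.main(['tests/functional/wsp/test_wsp.py'])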
