tests: adds WSP functional test for local package
* Adds WSP functional test for local package path.
* Refactored the existing WSP functional test (added setup/teardown fixtures).
* Refactored `utils.ftp_list_files` to `utils.list_files` for reusability.
* Fixed WSP local package crawling mechanism.

Closes inspirehep#106

Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis authored and david-caro committed May 9, 2017
1 parent e74f9bb commit 76c79cc
Showing 4 changed files with 87 additions and 20 deletions.
1 change: 1 addition & 0 deletions docker-compose.test.yml
@@ -23,6 +23,7 @@ services:
       - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
       - ${PWD}:/code/
       - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
+      - /tmp/WSP:/tmp/WSP # the tmp folder that keeps the history of the crawled records
     links:
       - rabbitmq
       - celery
14 changes: 11 additions & 3 deletions hepcrawl/spiders/wsp_spider.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -23,6 +23,7 @@
 from ..utils import (
     ftp_list_files,
     ftp_connection_info,
+    local_list_files,
     get_license,
     unzip_xml_files,
 )
@@ -88,7 +89,13 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc
     def start_requests(self):
         """List selected folder on remote FTP and yield new zip files."""
         if self.package_path:
-            yield Request(self.package_path, callback=self.handle_package_file)
+            dummy, new_files = local_list_files(
+                self.package_path,
+                self.target_folder
+            )
+
+            for _file in new_files:
+                yield Request("file://{0}".format(_file), callback=self.handle_package_file)
         else:
             ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

@@ -116,7 +123,7 @@ def start_requests(self):

     def handle_package_ftp(self, response):
         """Handle a zip package and yield every XML found."""
-        self.log("Visited %s" % response.url)
+        self.log("Visited url %s" % response.url)
         zip_filepath = response.body
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
@@ -128,6 +135,7 @@ def handle_package_ftp(self, response):

     def handle_package_file(self, response):
         """Handle a local zip package and yield every XML."""
+        self.log("Visited file %s" % response.url)
         zip_filepath = urlparse.urlsplit(response.url).path
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
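For orientation, a hedged standalone sketch of the new local-package flow shown in the diff above (it assumes Scrapy's Request class and the local_list_files helper added to utils.py below; spider stands in for the WSP spider instance, whose package_path and target_folder attributes are set in the collapsed __init__):

    from scrapy.http import Request
    from hepcrawl.utils import local_list_files

    def start_local_requests(spider):
        # local_list_files returns (all_files, missing_files); only packages
        # not yet present in the target folder are crawled again.
        dummy, new_files = local_list_files(spider.package_path, spider.target_folder)
        for _file in new_files:
            # Scrapy requests take URLs, so local paths are wrapped as
            # file:// URIs and routed to the same zip handler.
            yield Request("file://{0}".format(_file), callback=spider.handle_package_file)

This wrapping is also why handle_package_file recovers the local path with urlparse.urlsplit(response.url).path before unzipping.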
29 changes: 20 additions & 9 deletions hepcrawl/utils.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -67,14 +67,25 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive

     with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
         files = host.listdir(host.curdir + '/' + server_folder)
-        missing_files = []
-        all_files = []
-        for filename in files:
-            destination_file = os.path.join(target_folder, filename)
-            source_file = os.path.join(server_folder, filename)
-            if not os.path.exists(destination_file):
-                missing_files.append(source_file)
-            all_files.append(source_file)
+        return list_files(server_folder, target_folder, files)
+
+
+def local_list_files(local_folder, target_folder):
+    """List files from given package folder to target folder."""
+    files = os.listdir(local_folder)
+    return list_files(local_folder, target_folder, files)
+
+
+def list_files(remote_folder, target_folder, files):
+    missing_files = []
+    all_files = []
+    for filename in files:
+        destination_file = os.path.join(target_folder, filename)
+        source_file = os.path.join(remote_folder, filename)
+        if not os.path.exists(destination_file):
+            missing_files.append(source_file)
+        all_files.append(source_file)

     return all_files, missing_files

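To make the refactor concrete, a hedged usage example with hypothetical paths and file names: both ftp_list_files and local_list_files now delegate to the shared list_files, which returns the pair (all_files, missing_files) so callers can skip packages that were already fetched.

    from hepcrawl.utils import local_list_files

    # Suppose /data/WSP holds A.zip and B.zip, while /tmp/WSP already holds
    # A.zip from an earlier crawl; only B.zip is then reported as missing.
    all_files, missing_files = local_list_files('/data/WSP', '/tmp/WSP')
    # all_files     -> ['/data/WSP/A.zip', '/data/WSP/B.zip']
    # missing_files -> ['/data/WSP/B.zip']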
63 changes: 55 additions & 8 deletions tests/functional/WSP/test_wsp.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016, 2017 CERN.
+# Copyright (C) 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -14,6 +14,7 @@
 import pytest
 import json
 import os
+import shutil

 from scrapyd_api import ScrapydAPI
 from time import sleep
@@ -50,14 +51,14 @@ def expected_results():
     return expected_data


-@pytest.fixture(scope="module")
-def set_up_environment():
+@pytest.fixture(scope="function")
+def set_up_ftp_environment():
     netrc_location = os.path.join(os.path.dirname(
         os.path.realpath(__file__)),
         'fixtures/ftp_server/.netrc'
     )

-    return {
+    yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
@@ -66,9 +67,55 @@ def set_up_environment():
         }
     }

+    cleaner()
+
+
+@pytest.fixture(scope="function")
+def set_up_local_environment():
+    package_location = os.path.join(os.path.dirname(
+        os.path.realpath(__file__)),
+        'fixtures/ftp_server/WSP/'
+    )
+
+    yield {
+        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
+        'CRAWLER_PROJECT': 'hepcrawl',
+        'CRAWLER_ARGUMENTS': {
+            'package_path': package_location,
+        }
+    }
+
+    cleaner()
+    cleaner(package_location + 'IDAQPv20i01-03160015-1510863')
+
+
+def cleaner(path='/tmp/WSP/'):
+    shutil.rmtree(path, ignore_errors=True)
+
+
+def test_wsp_ftp(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
+
+    # The test must wait until the docker environment is up (takes about 5 seconds).
+    sleep(5)
+
+    results = CeleryMonitor.do_crawl(
+        app=app,
+        monitor_timeout=5,
+        monitor_iter_limit=100,
+        crawler_instance=crawler,
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    assert [override_generated_fields(result) for result in results] == \
+        [override_generated_fields(expected) for expected in expected_results]


-def test_wsp_normal_set_of_records(set_up_environment, expected_results):
-    crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
+def test_wsp_local_package_path(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))

     # The test must wait until the docker environment is up (takes about 5 seconds).
     sleep(5)
@@ -78,10 +125,10 @@ def test_wsp_normal_set_of_records(set_up_environment, expected_results):
         monitor_timeout=5,
         monitor_iter_limit=100,
         crawler_instance=crawler,
-        project=set_up_environment.get('CRAWLER_PROJECT'),
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='WSP',
         settings={},
-        **set_up_environment.get('CRAWLER_ARGUMENTS')
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
     )

     assert [override_generated_fields(result) for result in results] == \
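The setup/teardown refactor relies on pytest's yield-fixture pattern: everything before the yield is setup, the yielded value becomes the test's argument, and everything after the yield runs as teardown. A minimal self-contained sketch (hypothetical fixture name and path):

    import shutil

    import pytest

    @pytest.fixture(scope="function")
    def crawl_environment():
        yield {'CRAWLER_HOST_URL': 'http://scrapyd:6800'}  # setup ends here
        # Teardown runs after every single test because the scope is
        # "function", so /tmp/WSP never leaks records between tests.
        shutil.rmtree('/tmp/WSP', ignore_errors=True)

    def test_uses_environment(crawl_environment):
        assert crawl_environment['CRAWLER_HOST_URL'].startswith('http')

Switching the fixtures from scope="module" to scope="function" is what makes the cleaner() calls run after each test rather than once per module.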
