tests: adds WSP functional test for local package #107

Merged
1 change: 1 addition & 0 deletions docker-compose.test.yml
@@ -23,6 +23,7 @@ services:
- ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
- ${PWD}:/code/
- ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
- /tmp/WSP:/tmp/WSP # the tmp folder that keeps the history of the crawled records
links:
- rabbitmq
- celery
18 changes: 13 additions & 5 deletions hepcrawl/spiders/wsp_spider.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
# Copyright (C) 2015, 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
@@ -23,6 +23,7 @@
from ..utils import (
ftp_list_files,
ftp_connection_info,
local_list_files,
get_license,
unzip_xml_files,
)
@@ -88,19 +89,25 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc
def start_requests(self):
"""List selected folder on remote FTP and yield new zip files."""
if self.package_path:
yield Request(self.package_path, callback=self.handle_package_file)
new_files_paths = local_list_files(
self.package_path,
self.target_folder
)

for file_path in new_files_paths:
yield Request("file://{0}".format(file_path), callback=self.handle_package_file)
else:
ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

dummy, new_files = ftp_list_files(
new_files_paths = ftp_list_files(
self.ftp_folder,
self.target_folder,
server=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)

for remote_file in new_files:
for remote_file in new_files_paths:
# Cast to byte-string for scrapy compatibility
remote_file = str(remote_file)
ftp_params["ftp_local_filename"] = os.path.join(
@@ -116,7 +123,7 @@ def start_requests(self):

def handle_package_ftp(self, response):
"""Handle a zip package and yield every XML found."""
self.log("Visited %s" % response.url)
self.log("Visited url %s" % response.url)
zip_filepath = response.body
zip_target_folder, dummy = os.path.splitext(zip_filepath)
xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
@@ -128,6 +135,7 @@ def handle_package_ftp(self, response):

def handle_package_file(self, response):
"""Handle a local zip package and yield every XML."""
self.log("Visited file %s" % response.url)
zip_filepath = urlparse.urlsplit(response.url).path
zip_target_folder, dummy = os.path.splitext(zip_filepath)
xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
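
With the new `local_list_files` call, a `package_path` run no longer yields a single Request for the path itself: `start_requests` lists the folder and emits one `file://` Request per package that is not yet present in the spider's `target_folder` (presumably the `/tmp/WSP` directory mounted in docker-compose.test.yml). A rough standalone sketch of that branch, not the hepcrawl code itself:

```python
# Rough sketch of what the new local branch in start_requests() does:
# list the package folder, skip anything already present in target_folder,
# and turn the rest into file:// URLs for Scrapy to fetch.
import os


def local_package_urls(package_path, target_folder):
    """Yield file:// URLs for packages that have not been crawled yet."""
    for file_name in os.listdir(package_path):
        source_file = os.path.join(package_path, file_name)
        destination_file = os.path.join(target_folder, file_name)
        if not os.path.exists(destination_file):
            yield "file://{0}".format(source_file)
```
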
30 changes: 19 additions & 11 deletions hepcrawl/utils.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016 CERN.
# Copyright (C) 2015, 2016, 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
@@ -66,16 +66,24 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive
encrypt_data_channel=True)

with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
files = host.listdir(host.curdir + '/' + server_folder)
missing_files = []
all_files = []
for filename in files:
destination_file = os.path.join(target_folder, filename)
source_file = os.path.join(server_folder, filename)
if not os.path.exists(destination_file):
missing_files.append(source_file)
all_files.append(source_file)
return all_files, missing_files
file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
return list_missing_files(server_folder, target_folder, file_names)


def local_list_files(local_folder, target_folder):
file_names = os.listdir(local_folder)
return list_missing_files(local_folder, target_folder, file_names)


def list_missing_files(remote_folder, target_folder, file_names):
missing_files = []
for file_name in file_names:
destination_file = os.path.join(target_folder, file_name)
source_file = os.path.join(remote_folder, file_name)
if not os.path.exists(destination_file):
missing_files.append(source_file)

return missing_files


def get_first(iterable, default=None):
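
The refactor above also changes the contract of `ftp_list_files`: instead of returning an `(all_files, missing_files)` pair it now returns only the missing files, and both the FTP and local code paths share `list_missing_files`, which keeps a file only if no counterpart exists in `target_folder` yet. A hedged usage sketch, with the directories and file names invented for illustration:

```python
# Usage sketch for the new helper; the function name comes from the diff above,
# the directory contents are made up for illustration.
import os
import tempfile

from hepcrawl.utils import local_list_files

package_dir = tempfile.mkdtemp()  # pretend this holds the downloaded .zip packages
target_dir = tempfile.mkdtemp()   # tracks which packages were already crawled

for name in ('new.zip', 'old.zip'):
    open(os.path.join(package_dir, name), 'w').close()
open(os.path.join(target_dir, 'old.zip'), 'w').close()  # 'old.zip' was crawled before

# Only 'new.zip' is reported, because 'old.zip' already exists in target_dir.
print(local_list_files(package_dir, target_dir))
```
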
82 changes: 69 additions & 13 deletions tests/functional/WSP/test_wsp.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016, 2017 CERN.
# Copyright (C) 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
@@ -14,6 +14,7 @@
import pytest
import json
import os
import shutil

from scrapyd_api import ScrapydAPI
from time import sleep
@@ -50,14 +51,14 @@ def expected_results():
return expected_data


@pytest.fixture(scope="module")
def set_up_environment():
netrc_location = os.path.join(os.path.dirname(
os.path.realpath(__file__)),
'fixtures/ftp_server/.netrc'
@pytest.fixture(scope="function")
def set_up_ftp_environment():
netrc_location = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.path.join('fixtures', 'ftp_server', '.netrc')
)

return {
yield {
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
@@ -66,9 +67,44 @@ def set_up_environment():
}
}

clean_dir()


@pytest.fixture(scope="function")
def set_up_local_environment():
package_location = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.path.join('fixtures', 'ftp_server', 'WSP')
)

yield {
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
'package_path': package_location,
}
}

remove_generated_files(package_location)


def remove_generated_files(package_location):
clean_dir()

_, dirs, files = next(os.walk(package_location))
for dir_name in dirs:
clean_dir(os.path.join(package_location, dir_name))
for file_name in files:
if not file_name.endswith('.zip'):
os.unlink(os.path.join(package_location, file_name))


def clean_dir(path='/tmp/WSP/'):
shutil.rmtree(path, ignore_errors=True)

def test_wsp_normal_set_of_records(set_up_environment, expected_results):
crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))

def test_wsp_ftp(set_up_ftp_environment, expected_results):
crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))

# The test must wait until the docker environment is up (takes about 10 seconds).
sleep(10)
@@ -78,13 +114,33 @@ def test_wsp_normal_set_of_records(set_up_environment, expected_results):
monitor_timeout=5,
monitor_iter_limit=100,
crawler_instance=crawler,
project=set_up_environment.get('CRAWLER_PROJECT'),
project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
spider='WSP',
settings={},
**set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
)

gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]

assert gotten_results == expected_results


def test_wsp_local_package_path(set_up_local_environment, expected_results):
crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))

results = CeleryMonitor.do_crawl(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=100,
crawler_instance=crawler,
project=set_up_local_environment.get('CRAWLER_PROJECT'),
spider='WSP',
settings={},
**set_up_environment.get('CRAWLER_ARGUMENTS')
**set_up_local_environment.get('CRAWLER_ARGUMENTS')
)

gottern_results = [override_generated_fields(result) for result in results]
gotten_results = [override_generated_fields(result) for result in results]
expected_results = [override_generated_fields(expected) for expected in expected_results]

assert gottern_results == expected_results
assert gotten_results == expected_results
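
One detail that is easy to miss in the fixture hunks: both fixtures went from `scope="module"` with a plain `return` to `scope="function"` with `yield`, so the cleanup code placed after the `yield` (`clean_dir()` and `remove_generated_files()`) now runs as teardown after every test, whereas the old `return`-based fixture had no teardown at all. A generic, hepcrawl-independent sketch of that pytest pattern:

```python
# Minimal illustration of the yield-fixture teardown pattern used above;
# the names here are invented and nothing is hepcrawl-specific.
import os
import shutil
import tempfile

import pytest


@pytest.fixture(scope="function")
def scratch_dir():
    path = tempfile.mkdtemp()
    yield path                                # value handed to the test
    shutil.rmtree(path, ignore_errors=True)   # teardown: runs once the test finishes


def test_scratch_dir_is_usable(scratch_dir):
    assert os.path.isdir(scratch_dir)
```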