Merge pull request #107 from spirosdelviniotis/hepcrawl_wsp_local_package_test

tests: adds WSP functional test for local package
david-caro authored May 11, 2017
2 parents e25033e + 94af814 commit 0c0fdf1
Showing 4 changed files with 102 additions and 29 deletions.
1 change: 1 addition & 0 deletions docker-compose.test.yml
@@ -23,6 +23,7 @@ services:
       - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
       - ${PWD}:/code/
       - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
+      - /tmp/WSP:/tmp/WSP # the tmp folder that keeps the history of the crawled records
     links:
       - rabbitmq
       - celery
18 changes: 13 additions & 5 deletions hepcrawl/spiders/wsp_spider.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -23,6 +23,7 @@
 from ..utils import (
     ftp_list_files,
     ftp_connection_info,
+    local_list_files,
     get_license,
     unzip_xml_files,
 )
@@ -88,19 +89,25 @@ def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc
     def start_requests(self):
         """List selected folder on remote FTP and yield new zip files."""
         if self.package_path:
-            yield Request(self.package_path, callback=self.handle_package_file)
+            new_files_paths = local_list_files(
+                self.package_path,
+                self.target_folder
+            )
+
+            for file_path in new_files_paths:
+                yield Request("file://{0}".format(file_path), callback=self.handle_package_file)
         else:
             ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
 
-            dummy, new_files = ftp_list_files(
+            new_files_paths = ftp_list_files(
                 self.ftp_folder,
                 self.target_folder,
                 server=ftp_host,
                 user=ftp_params['ftp_user'],
                 password=ftp_params['ftp_password']
             )
 
-            for remote_file in new_files:
+            for remote_file in new_files_paths:
                 # Cast to byte-string for scrapy compatibility
                 remote_file = str(remote_file)
                 ftp_params["ftp_local_filename"] = os.path.join(
@@ -116,7 +123,7 @@ def start_requests(self):
 
     def handle_package_ftp(self, response):
         """Handle a zip package and yield every XML found."""
-        self.log("Visited %s" % response.url)
+        self.log("Visited url %s" % response.url)
         zip_filepath = response.body
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
@@ -128,6 +135,7 @@ def handle_package_ftp(self, response):
 
     def handle_package_file(self, response):
         """Handle a local zip package and yield every XML."""
+        self.log("Visited file %s" % response.url)
         zip_filepath = urlparse.urlsplit(response.url).path
         zip_target_folder, dummy = os.path.splitext(zip_filepath)
         xml_files = unzip_xml_files(zip_filepath, zip_target_folder)
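With this change the spider has two entry points: given a package_path it lists a local directory of zip packages, otherwise it falls back to the original FTP listing. A minimal sketch of scheduling both modes through scrapyd, in the spirit of the functional tests below (the endpoint URL and paths are illustrative assumptions, not part of this diff):

from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://scrapyd:6800')  # assumed scrapyd endpoint

# Local mode: start_requests() calls local_list_files() and yields one
# file:// request per zip package not yet present in the target folder.
scrapyd.schedule('hepcrawl', 'WSP', package_path='/data/packages/WSP')

# FTP mode: without package_path the spider lists the remote folder via
# ftp_list_files(), using credentials read from the given .netrc file.
scrapyd.schedule('hepcrawl', 'WSP', ftp_host='ftp_server', ftp_netrc='/path/to/.netrc')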
30 changes: 19 additions & 11 deletions hepcrawl/utils.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -66,16 +66,24 @@ def ftp_list_files(server_folder, target_folder, server, user, password, passive
         encrypt_data_channel=True)
 
     with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
-        files = host.listdir(host.curdir + '/' + server_folder)
-        missing_files = []
-        all_files = []
-        for filename in files:
-            destination_file = os.path.join(target_folder, filename)
-            source_file = os.path.join(server_folder, filename)
-            if not os.path.exists(destination_file):
-                missing_files.append(source_file)
-            all_files.append(source_file)
-        return all_files, missing_files
+        file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
+        return list_missing_files(server_folder, target_folder, file_names)
+
+
+def local_list_files(local_folder, target_folder):
+    file_names = os.listdir(local_folder)
+    return list_missing_files(local_folder, target_folder, file_names)
+
+
+def list_missing_files(remote_folder, target_folder, file_names):
+    missing_files = []
+    for file_name in file_names:
+        destination_file = os.path.join(target_folder, file_name)
+        source_file = os.path.join(remote_folder, file_name)
+        if not os.path.exists(destination_file):
+            missing_files.append(source_file)
+
+    return missing_files
 
 
 def get_first(iterable, default=None):
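The refactor above routes both listings through a single filter: list_missing_files() keeps only the files that do not already exist under target_folder, which is what lets a re-crawl skip packages fetched on a previous run. A short usage sketch (folder and file names are hypothetical):

from hepcrawl.utils import list_missing_files, local_list_files

# Suppose /tmp/WSP already contains WSP-2017-01.zip from an earlier crawl;
# only the path of the still-missing package is returned.
missing = list_missing_files(
    remote_folder='/remote/WSP',
    target_folder='/tmp/WSP',
    file_names=['WSP-2017-01.zip', 'WSP-2017-02.zip'],
)
# -> ['/remote/WSP/WSP-2017-02.zip']

# local_list_files() applies the same filter to a directory on disk.
new_packages = local_list_files('/data/packages/WSP', '/tmp/WSP')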
82 changes: 69 additions & 13 deletions tests/functional/WSP/test_wsp.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016, 2017 CERN.
+# Copyright (C) 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -14,6 +14,7 @@
 import pytest
 import json
 import os
+import shutil
 
 from scrapyd_api import ScrapydAPI
 from time import sleep
@@ -50,14 +51,14 @@ def expected_results():
     return expected_data
 
 
-@pytest.fixture(scope="module")
-def set_up_environment():
-    netrc_location = os.path.join(os.path.dirname(
-        os.path.realpath(__file__)),
-        'fixtures/ftp_server/.netrc'
+@pytest.fixture(scope="function")
+def set_up_ftp_environment():
+    netrc_location = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        os.path.join('fixtures', 'ftp_server', '.netrc')
     )
 
-    return {
+    yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
         'CRAWLER_ARGUMENTS': {
@@ -66,9 +67,44 @@ def set_up_environment():
         }
     }
 
+    clean_dir()
+
+
+@pytest.fixture(scope="function")
+def set_up_local_environment():
+    package_location = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        os.path.join('fixtures', 'ftp_server', 'WSP')
+    )
+
+    yield {
+        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
+        'CRAWLER_PROJECT': 'hepcrawl',
+        'CRAWLER_ARGUMENTS': {
+            'package_path': package_location,
+        }
+    }
+
+    remove_generated_files(package_location)
+
+
+def remove_generated_files(package_location):
+    clean_dir()
+
+    _, dirs, files = next(os.walk(package_location))
+    for dir_name in dirs:
+        clean_dir(os.path.join(package_location, dir_name))
+    for file_name in files:
+        if not file_name.endswith('.zip'):
+            os.unlink(os.path.join(package_location, file_name))
+
+
+def clean_dir(path='/tmp/WSP/'):
+    shutil.rmtree(path, ignore_errors=True)
+
 
-def test_wsp_normal_set_of_records(set_up_environment, expected_results):
-    crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
+def test_wsp_ftp(set_up_ftp_environment, expected_results):
+    crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))
 
     # The test must wait until the docker environment is up (takes about 10 seconds).
     sleep(10)
@@ -78,13 +114,33 @@ def test_wsp_normal_set_of_records(set_up_environment, expected_results):
         monitor_timeout=5,
         monitor_iter_limit=100,
         crawler_instance=crawler,
-        project=set_up_environment.get('CRAWLER_PROJECT'),
+        project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
         spider='WSP',
         settings={},
-        **set_up_environment.get('CRAWLER_ARGUMENTS')
+        **set_up_ftp_environment.get('CRAWLER_ARGUMENTS')
+    )
+
+    gotten_results = [override_generated_fields(result) for result in results]
+    expected_results = [override_generated_fields(expected) for expected in expected_results]
+
+    assert gotten_results == expected_results
+
+
+def test_wsp_local_package_path(set_up_local_environment, expected_results):
+    crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))
+
+    results = CeleryMonitor.do_crawl(
+        app=celery_app,
+        monitor_timeout=5,
+        monitor_iter_limit=100,
+        crawler_instance=crawler,
+        project=set_up_local_environment.get('CRAWLER_PROJECT'),
+        spider='WSP',
+        settings={},
+        **set_up_local_environment.get('CRAWLER_ARGUMENTS')
     )
 
-    gottern_results = [override_generated_fields(result) for result in results]
+    gotten_results = [override_generated_fields(result) for result in results]
     expected_results = [override_generated_fields(expected) for expected in expected_results]
 
-    assert gottern_results == expected_results
+    assert gotten_results == expected_results
