From 6d060783e2c8e8c5e3a32606174e5d661fb8ae49 Mon Sep 17 00:00:00 2001
From: Spiros Delviniotis
Date: Fri, 7 Apr 2017 17:13:18 +0200
Subject: [PATCH] tests: add WSP functional tests

* Adds: logger to `InspireCeleryPushPipeline.close_spider` so that
  outgoing celery tasks to Inspire can be inspected.
* Adds: disabling of passive FTP mode for the WSP spider.
* Adds: dockerized environment for functional tests (Dockerfile for
  hepcrawl, docker-compose files).
* Adds: docker FTP server with the fixtures needed for the WSP
  functional tests.
* Adds: mocked celery task to catch outgoing tasks to Inspire.
* Adds: WSP functional test.
* Adds: WSP functional test to Travis.
* Adds: dockerized execution of unit tests and docs build on Travis.

Signed-off-by: Spiros Delviniotis
---
 .coveragerc                                        |  14 ++
 .travis.yml                                        |  35 ++--
 Dockerfile                                         |  34 ++++
 docker-compose.deps.yml                            |  21 +++
 docker-compose.test.yml                            |  80 +++++++++
 docker_entrypoint.sh                               |  27 +++
 hepcrawl/pipelines.py                              |   3 +
 hepcrawl/scrapy.cfg                                |   2 +-
 hepcrawl/settings.py                               |   9 +-
 hepcrawl/spiders/wsp_spider.py                     |   2 +
 hepcrawl/utils.py                                  |  17 +-
 pytest.ini                                         |   4 +-
 setup.py                                           |   3 +-
 tests/__init__.py                                  |   8 +
 tests/functional/WSP/__init__.py                   |   8 +
 tests/functional/WSP/fixtures/ftp_server/.netrc    |   3 +
 .../WSP/IDAQPv20i01-03160015-1510863.zip           | Bin 0 -> 3396 bytes
 .../WSP/fixtures/ftp_server/pureftpd.passwd        |   1 +
 .../WSP/fixtures/wsp_smoke_records.json            | 100 +++++++++++
 tests/functional/WSP/test_wsp.py                   | 158 ++++++++++++++++++
 tests/functional/__init__.py                       |   8 +
 tests/functional/scrapyd_coverage_runner.conf      |  12 ++
 tests/functional/scrapyd_coverage_runner.py        |  34 ++++
 tests/functional/tasks.py                          |  55 ++++++
 24 files changed, 617 insertions(+), 21 deletions(-)
 create mode 100644 .coveragerc
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.deps.yml
 create mode 100644 docker-compose.test.yml
 create mode 100755 docker_entrypoint.sh
 create mode 100644 tests/__init__.py
 create mode 100644 tests/functional/WSP/__init__.py
 create mode 100644 tests/functional/WSP/fixtures/ftp_server/.netrc
 create mode 100644 tests/functional/WSP/fixtures/ftp_server/WSP/IDAQPv20i01-03160015-1510863.zip
 create mode 100644 tests/functional/WSP/fixtures/ftp_server/pureftpd.passwd
 create mode 100644 tests/functional/WSP/fixtures/wsp_smoke_records.json
 create mode 100644 tests/functional/WSP/test_wsp.py
 create mode 100644 tests/functional/__init__.py
 create mode 100644 tests/functional/scrapyd_coverage_runner.conf
 create mode 100644 tests/functional/scrapyd_coverage_runner.py
 create mode 100644 tests/functional/tasks.py

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..e63e8392
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+[run]
+parallel = True
+omit =
+    /hepcrawl_venv/lib/python2.7/site-packages/*
+
diff --git a/.travis.yml b/.travis.yml
index e092404c..288e75f7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of hepcrawl.
-# Copyright (C) 2015, 2016 CERN.
+# Copyright (C) 2015, 2016, 2017 CERN.
 #
 # hepcrawl is a free software; you can redistribute it and/or modify it
 # under the terms of the Revised BSD License; see LICENSE file for
@@ -11,29 +11,42 @@
 sudo: false
 
 language: python
 
-cache:
-  - pip
+services:
+  - docker
 
 python:
   - '2.7'
 
 env:
-  - EXTRAS=docs,tests
+  global:
+    - EXTRAS=docs,tests
+    - DOCKER_COMPOSE_VERSION=1.9.0
+    - DOCKER_DATA="$HOME/hepcrawl_docker_data"
+  matrix:
+    - SUITE=unit
+    - SUITE=functional_wsp
+
+matrix:
+  fast_finish: true
 
 before_install:
-  - "travis_retry pip install --upgrade pip setuptools py"
-  - "travis_retry pip install twine wheel coveralls check-manifest"
+  - travis_retry pip install twine wheel coveralls check-manifest
+  - sudo rm -f /usr/local/bin/docker-compose
+  - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
+  - chmod +x docker-compose
+  - sudo mv docker-compose /usr/local/bin
 
 install:
-  - "travis_retry pip install -e .[${EXTRAS}]"
+  - travis_retry docker-compose -f docker-compose.deps.yml run --rm pip
 
 script:
-  - pip freeze
-  - sphinx-build -qnNW docs docs/_build/html
-  - python setup.py test
-  - python setup.py sdist && ls dist/*
+  - travis_retry docker-compose -f docker-compose.test.yml run --rm ${SUITE}
 
 after_success:
+  - docker-compose -f docker-compose.test.yml kill -s SIGTERM
+  - docker-compose -f docker-compose.test.yml rm -f
+  - coverage combine
+  - sed -i 's@\"/code/@'"\"$(pwd)/"'@g' .coverage
   - coveralls
 
 notifications:
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..2a3ebc11
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2015, 2016, 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+FROM centos
+
+RUN yum install -y epel-release && \
+    yum update -y && \
+    yum install -y \
+        file \
+        gcc \
+        libffi-devel \
+        libxml2-devel \
+        libxslt-devel \
+        make \
+        openssl-devel \
+        poppler-utils \
+        python-pip \
+        python-virtualenv && \
+    yum clean all
+
+RUN mkdir /code
+
+ADD /docker_entrypoint.sh /docker_entrypoint.sh
+ENTRYPOINT ["/docker_entrypoint.sh"]
+
+WORKDIR /code
+
+CMD true
diff --git a/docker-compose.deps.yml b/docker-compose.deps.yml
new file mode 100644
index 00000000..2a043632
--- /dev/null
+++ b/docker-compose.deps.yml
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2015, 2016, 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+version: '2'
+
+services:
+  pip:
+    build:
+      context: ${PWD}
+      dockerfile: Dockerfile
+    image: hepcrawl_base
+    command: bash -c "pip install -e .[all] && pip freeze"
+    volumes:
+      - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
+      - ${PWD}:/code/
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
new file mode 100644
index 00000000..355d6e4f
--- /dev/null
+++ b/docker-compose.test.yml
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+version: '2'
+
+services:
+  functional_wsp:
+    image: hepcrawl_base  # the hepcrawl_base image is built by the pip service of docker-compose.deps.yml
+    environment: &env_variables
+      - APP_BROKER_URL=amqp://guest:guest@rabbitmq:5672//
+      - APP_CELERY_RESULT_BACKEND=amqp://guest:guest@rabbitmq:5672//
+      - APP_CRAWLER_HOST_URL=http://scrapyd:6800
+      - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=tests.functional.tasks.submit_results
+      - COVERAGE_PROCESS_START=/code/.coveragerc
+    command: py.test -vv tests/functional/WSP/test_wsp.py
+    volumes: &static_volume
+      - ${DOCKER_DATA}/tmp/hepcrawl_venv:/hepcrawl_venv/
+      - ${PWD}:/code/
+      - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
+    links:
+      - rabbitmq
+      - celery
+      - scrapyd
+      - ftp_server
+
+  unit:
+    image: hepcrawl_base
+    environment: *env_variables
+    command: bash -c "py.test tests/unit && sphinx-build -nNW docs docs/_build/html && python setup.py sdist && ls dist/*"
+    volumes: *static_volume
+
+  doc:
+    image: hepcrawl_base
+    environment: *env_variables
+    command: bash -c "sphinx-build -qnNW docs docs/_build/html && python setup.py sdist && ls dist/*"
+    volumes: *static_volume
+
+  celery:
+    image: hepcrawl_base
+    environment: *env_variables
+    command: celery worker --events --app tests.functional.tasks --loglevel=debug
+    volumes: *static_volume
+    links:
+      - rabbitmq
+      - ftp_server
+
+  scrapyd:
+    image: hepcrawl_base
+    environment: *env_variables
+    command: bash -c "rm -f twistd.pid && exec scrapyd"
+    volumes: *static_volume
+    links:
+      - celery
+      - ftp_server
+      - rabbitmq
+    depends_on:
+      - scrapyd_deploy
+
+  scrapyd_deploy:
+    image: hepcrawl_base
+    environment: *env_variables
+    command: bash -c "sleep 10 && scrapyd-deploy"  # give scrapyd time to come up before deploying
+    volumes: *static_volume
+
+  ftp_server:
+    image: stilliard/pure-ftpd:hardened
+    environment:
+      - PUBLICHOST=localhost
+    volumes:
+      - ${PWD}/tests/functional/WSP/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
+      - ${PWD}/tests/functional/WSP/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd
+
+  rabbitmq:
+    image: rabbitmq
diff --git a/docker_entrypoint.sh b/docker_entrypoint.sh
new file mode 100755
index 00000000..108250a9
--- /dev/null
+++ b/docker_entrypoint.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+set -e
+
+VENV_PATH=/hepcrawl_venv
+
+if [ ! -f "$VENV_PATH/bin/activate" ]; then
+    virtualenv "$VENV_PATH"
+    source "$VENV_PATH"/bin/activate
+    pip install --upgrade pip
+    pip install --upgrade setuptools wheel
+else
+    source "$VENV_PATH"/bin/activate
+fi
+
+find \( -name __pycache__ -o -name '*.pyc' \) -delete
+
+exec "$@"
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 43877022..6287947f 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -228,6 +228,8 @@ def open_spider(self, spider):
     def close_spider(self, spider):
         """Post results to BROKER API."""
+        from celery.utils.log import get_task_logger
+        logger = get_task_logger(__name__)
         if 'SCRAPY_JOB' in os.environ and self.count > 0:
             task_endpoint = spider.settings[
                 'API_PIPELINE_TASK_ENDPOINT_MAPPING'
             ].get(
@@ -235,6 +237,7 @@
                 spider.name,
                 spider.settings['API_PIPELINE_TASK_ENDPOINT_DEFAULT'],
             )
+            logger.info('Triggering celery task: %s.'
% task_endpoint) self.celery.send_task( task_endpoint, kwargs=self._prepare_payload(spider), diff --git a/hepcrawl/scrapy.cfg b/hepcrawl/scrapy.cfg index 10700c5c..adffa153 100644 --- a/hepcrawl/scrapy.cfg +++ b/hepcrawl/scrapy.cfg @@ -1,5 +1,5 @@ # This file is part of hepcrawl. -# Copyright (C) 2015, 2016 CERN. +# Copyright (C) 2015, 2016, 2017 CERN. # # hepcrawl is a free software; you can redistribute it and/or modify it # under the terms of the Revised BSD License; see LICENSE file for diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index bec246e8..dffb5077 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -100,17 +100,20 @@ # INSPIRE Push Pipeline settings # ============================== API_PIPELINE_URL = "http://localhost:5555/api/task/async-apply" -API_PIPELINE_TASK_ENDPOINT_DEFAULT = "inspire_crawler.tasks.submit_results" +API_PIPELINE_TASK_ENDPOINT_DEFAULT = os.environ.get( + "APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT", + "inspire_crawler.tasks.submit_results" +) API_PIPELINE_TASK_ENDPOINT_MAPPING = {} # e.g. {'my_spider': 'special.task'} # Celery # ====== BROKER_URL = os.environ.get( "APP_BROKER_URL", - "amqp://guest:guest@localhost:5672//") + "amqp://guest:guest@rabbitmq:5672//") CELERY_RESULT_BACKEND = os.environ.get( "APP_CELERY_RESULT_BACKEND", - "amqp://guest:guest@localhost:5672//") + "amqp://guest:guest@rabbitmq:5672//") CELERY_ACCEPT_CONTENT = ['json', 'msgpack', 'yaml'] CELERY_TIMEZONE = 'Europe/Amsterdam' CELERY_DISABLE_RATE_LIMITS = True diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index ff089488..1fb07f38 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -91,6 +91,7 @@ def start_requests(self): yield Request(self.package_path, callback=self.handle_package_file) else: ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + dummy, new_files = ftp_list_files( self.ftp_folder, self.target_folder, @@ -98,6 +99,7 @@ def start_requests(self): user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] ) + for remote_file in new_files: # Cast to byte-string for scrapy compatibility remote_file = str(remote_file) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 29689b4d..6940c3c0 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -7,6 +7,8 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
+from __future__ import absolute_import, print_function + import os import re from operator import itemgetter @@ -17,6 +19,8 @@ from urlparse import urlparse import ftputil +import ftputil.session +import ftplib import requests from scrapy import Selector @@ -40,7 +44,7 @@ def unzip_xml_files(filename, target_folder): return xml_files -def ftp_connection_info(ftp_host, netrc_file): +def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): """Return ftp connection info from netrc and optional host address.""" if not ftp_host: ftp_host = netrc(netrc_file).hosts.keys()[0] @@ -48,13 +52,20 @@ def ftp_connection_info(ftp_host, netrc_file): connection_params = { "ftp_user": logininfo[0], "ftp_password": logininfo[2], + "ftp_passive": passive_mode, } return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password): +def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): """List files from given FTP's server folder to target folder.""" - with ftputil.FTPHost(server, user, password) as host: + session_factory = ftputil.session.session_factory( + base_class=ftplib.FTP, + port=21, + use_passive_mode=passive_mode, + encrypt_data_channel=True) + + with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: files = host.listdir(host.curdir + '/' + server_folder) missing_files = [] all_files = [] diff --git a/pytest.ini b/pytest.ini index a4a766fd..fe0eeadf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- # # This file is part of hepcrawl. -# Copyright (C) 2015, 2016 CERN. +# Copyright (C) 2015, 2016, 2017 CERN. # # hepcrawl is a free software; you can redistribute it and/or modify it # under the terms of the Revised BSD License; see LICENSE file for # more details. [pytest] -addopts = --pep8 --ignore=docs --cov=hepcrawl --cov-report=term-missing +addopts = --pep8 --ignore=docs --cov-config .coveragerc --cov=hepcrawl --cov-report=term-missing pep8ignore = tests/* ALL *.py E501 diff --git a/setup.py b/setup.py index 6c7f1db3..d064a673 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of hepcrawl. -# Copyright (C) 2015, 2016 CERN. +# Copyright (C) 2015, 2016, 2017 CERN. # # hepcrawl is a free software; you can redistribute it and/or modify it # under the terms of the Revised BSD License; see LICENSE file for @@ -29,6 +29,7 @@ 'furl>=0.4.95', 'ftputil>=3.3.1', 'python-dateutil>=2.4.2', + 'python-scrapyd-api>=2.0.1', ] tests_require = [ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e8c02e63 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. diff --git a/tests/functional/WSP/__init__.py b/tests/functional/WSP/__init__.py new file mode 100644 index 00000000..e8c02e63 --- /dev/null +++ b/tests/functional/WSP/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. 
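For reference, the FTP helpers reworked in hepcrawl/utils.py above are driven by the WSP spider's start_requests as in the minimal sketch below; the host name, netrc path and folder names are illustrative, not taken from this patch:

    from hepcrawl.utils import ftp_connection_info, ftp_list_files

    # Resolve credentials from a netrc file; with the new passive_mode flag
    # left at its default, the returned params carry "ftp_passive": False.
    ftp_host, ftp_params = ftp_connection_info(
        'ftp_server',       # illustrative host; falls back to the first netrc entry if empty
        '/path/to/.netrc',  # illustrative netrc location
    )

    # List the remote folder and collect the files not yet present locally;
    # the session factory connects in active mode unless passive_mode=True.
    dummy, new_files = ftp_list_files(
        'WSP',       # folder on the FTP server
        '/tmp/WSP',  # illustrative local target folder compared against the listing
        server=ftp_host,
        user=ftp_params['ftp_user'],
        password=ftp_params['ftp_password'],
        passive_mode=ftp_params['ftp_passive'],
    )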
diff --git a/tests/functional/WSP/fixtures/ftp_server/.netrc b/tests/functional/WSP/fixtures/ftp_server/.netrc new file mode 100644 index 00000000..59a152f7 --- /dev/null +++ b/tests/functional/WSP/fixtures/ftp_server/.netrc @@ -0,0 +1,3 @@ +machine ftp_server +login bob +password bob diff --git a/tests/functional/WSP/fixtures/ftp_server/WSP/IDAQPv20i01-03160015-1510863.zip b/tests/functional/WSP/fixtures/ftp_server/WSP/IDAQPv20i01-03160015-1510863.zip new file mode 100644 index 0000000000000000000000000000000000000000..f102174cbc4ddd2d73216b5354a667804297f040 GIT binary patch literal 3396 zcma);c{tQ<7srROWQ!O{b~4C=F(N`jVTL9uWXYCY9y5j-6B>-Nlbweo%h;8%4bs>K zB??ittb;VNFG;;TUC+~dRquOset-P#`?}8g-sk-DIbUN#AOi>p*bDNVB>4XSpWxm* zxH;ORkn&K7yc-(r>m&zJR51QGzw3wmW53KtL!gRk5UBD=#gobq2n4G1ufhQUX-{GK z>K7>xBLLui5CBm7O@*H{4R_u>Q5dF5#cg8hPZemJB|rXN9REISd>no zYpjJOnq#an}&rPH5DTg{^cB42sdC@{a$fbC$9&6wY3BvWgSPtnO@_(kky+f`RL z%!vRLYsy#@<;&Iy%_Bu1jDy%XL6_^N<|ThihXz z^tsA&iuFnAAG5ROqf%=H0jYONy!?;j{k6nM1H}S~WrH5KHCJS-awjkv`DXA8cZ{_ zaWo7-T)x5ClwWO})PDyvnA{^va;|Uacp#NYFjZ1Gn~4tPf}N$DE^!$0p#3?ha}8%` zs$85kK7CH!ovbxL0z~S*u_EH$!fxA}^j9&k7qD@_;u%a-H-hKpERCW+cWiePsUVgG${!s)1AZ^#ukc?D1}4Qk@V!D&O4agqn607)&X_ zXC6Pbt66!|@oh^G2jZm+Dd+y8_tMtzV5|Un)7&$9U9O3h)?b}ySU%21E7XGv0&Tka zZ_M*(WY=VK5-{S)_{b9LQHDr;P2o0B>23^?Q_!ajA&`DnZsh7 zFko{by^rfdMk1za3{+4-t)jx?kJ(IeC75L+hN8WOGp^Oi2G{Zo(uRnGosmSoFbJ6)`Q}`L)yCa#8t|hLSd3OQqpgQH6NJC6QUdUFYss zU%S)3_^i0{byr$GQd3}}SbdPURm?M#U+2ZhgbJ-<(TvW+>vCf?g3Ycz>vLPIc5MP{ zjWFla8k_b5-qdk*Z6n1DrJTB(y?Ski;D}t$jA4t=jJ3X^s6yV)`Pr`TjD3>Dlw_3h zw~-$NEW*#dNC4>P=CM~!7)fF#rkw0L8Z%4oO}CDuy?l1dU;RZ*S9_5wJm!wWQ2e5q z8;09%l-)_p=CbPIS;-g38!{9h&U2;K{dLVyV6O_F^`}UU84O+(QG!f}dP&%(tqO>A ze4IlD_GDHGKv>5U%TFlGT5H$ar#=!pNNMc_)!gUrs{$$tiNv_Mmy%M2mD`x27Dez= zWpA^lw(EtFeD!Q|B{6Tf7XhAH-lh>l{^qY~5G&82di(YTz{sWN#o|-P(|iFMkIop~ zJwC8Zwt5j6Iw)=LB9&4nYg{dA_w=0Cv}zzYSGdu)(FEo&#Q&&#PH@$!u-ZcMQai~C zV*}w7L2?oo;+{aq18Rxo!ffFtC@+)33Yjb1DO>j}p_#hi_ohuMJiYE*1;{A52>0kF zLeX*7$wbtW8r;-FKYSWxBs3Fd^X3vXJmd0W;j1wSjb6u_tN3IWFM8+8wqf*0Pk4KO z0MJ*NaeAzr7&s=8;PX0M##?Cay(a$S+RPYVSEMs}l}@2rW0b84acfwgiX{rZ@zrTA z&R0KG>&a3*y#qIKO~j61=c0QjHp@y*lsA=?nV`ri(I6_D;wn|3w|eY4bLckH;?@$# zsxbYgX%+a7nJ*#pnPR)Z@38(6?!N_V707?VS|cP0F6^+p$g~G*RgT|;^`62HuvUTW z!#c_AJFK(Lh@*OSvJH<)GcttHJMb;929LCQvd3InA?A5mhe#j~{@DyWqK+87Q-SLT zD@~<&1!@RO)$K^8AiJN#LGC&_TB2mri@7?b;OX_Z5+7Aa0=A8ytA0xkH#Qb$bnTqv zxH~6i7Iu2>ci}LJhy_}mapGy!TL3wZ3j`Mea(x&1Ay$8I-0f9j1S~Zf2JSv|G!;G( zUZlpFYKeN8KRY-2lrBWH;m`;>Q{riV1g;V%tdu{3<=m}AgvS)KG4~n2?&Oo{C$MzB z#|G%kI}?{r>OPe=KOq>~-e1C9oIbn48BsHz@jA_~E9>4t5_fw)vo=?6&8iClMnO|q zOkW#GX81U*K(MF9B+F~N$wY?`tazF{)SLk#;B}kF$o+YRaTWM<^~|cb4KctbcidpX zZS;@a0;tWyuvT%WtieORmOIp{0!6Z^#fgP?t;-hR<7ANqYuvpSayEZZb?)smAk#bm z@V$<1d~1`a;H{={YH8T}y%jVv7g|Hn^*X_>hww@nrkj#UUfptP4|bLqveE-eK}WPr zntU(HA{)w}mUXOzQ1TqT7P}YwTp>c=g=egepP@-QtbjPI54-Zh{G1FXFVB>?^=5!Y z&~2Hp+Q&bih`HFFGmZ+YP1dyb!!A&>%=GT6^s7NXER(V7u1O!vE^LyrY@eE{g9h1* z2IPS1y6&ug=WHi3oQ2487(Ie@cdyuZ>y%QyExyyD%#>_&5mXsWn6qG!F)xWnZ&ITH}KwneW52wHk`X;FsSnu`;KNqbi$Wc~LsNHFren zK5wLm77ewkOFj=I>kgEP&WyNyw1qb^e7=d2OJRWDLv(nRss`X&SaJ#>3TsgcXZxrJ za%Am!qBus&=_i{*3*zN!KLpiOF#AV)-b0((-W`OelFYoyTdr?@#__S=MM4@-Xdj_R zlN{K06eSUxZHR~3Rfc+8$C2((6U~eoljRH+{+hPNl*1h|S~4qP>5R;|7knZz--%qn z%C%cME_21kW>>mCx8R>@!|NI*l(DfJXbcjChJ2;oozE0P+jM|und&5IYaSRuyFUi|e|cS_ClY;M0kdr@M*3@|`6$QNC(Iuh z9Yg90ByQkG&-BGMbe+rl__Zigd%;tEa-(4b&+Kj?cFBf9L;zx=MIQ$6PSwtqPI}KP z=RqW5i}yfWcg(Ia4FSzK%5!o3LT0(?lCdv<8YU01P3{L5W8Bp+8&f;mhY!d5Z87h- zP~W9iMOIRAuxqodaXXc;(KQ|LVw8Ajt*O7L`S0IcY-?B*q4 zBjwzz(A83*ZHMoFTB_{0JZQ~CHr`MlA?bYLGd857*8=`;_x;U@bK%I^`=?)d@$dD& 
za^v6YA-}Bu0nLAV^M4e7?asd`isCARynj>ir#JTT|NjQx-%Q_PfAG=0ZS(WsKZyRl or@lA-t4;Myaldg9rr%Ti>5XrVGwziG0E~NI1swq3&$fU252tRr+yDRo literal 0 HcmV?d00001 diff --git a/tests/functional/WSP/fixtures/ftp_server/pureftpd.passwd b/tests/functional/WSP/fixtures/ftp_server/pureftpd.passwd new file mode 100644 index 00000000..275a727c --- /dev/null +++ b/tests/functional/WSP/fixtures/ftp_server/pureftpd.passwd @@ -0,0 +1 @@ +bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: diff --git a/tests/functional/WSP/fixtures/wsp_smoke_records.json b/tests/functional/WSP/fixtures/wsp_smoke_records.json new file mode 100644 index 00000000..0a287fa0 --- /dev/null +++ b/tests/functional/WSP/fixtures/wsp_smoke_records.json @@ -0,0 +1,100 @@ +[{ + "refereed": true, + "acquisition_source": { + "source": "WSP", + "method": "hepcrawl", + "submission_number": "a5f2858a26a211e790b40242ac120005", + "datetime": "2017-04-21T14:56:10.981309" + }, + "copyright": [{ + "url": "article", + "holder": "Copyright Holder" + }], + "public_notes": [{ + "source": "hepcrawl", + "value": "Communicated by J. John" + }], + "number_of_pages": 6, + "authors": [{ + "affiliations": [{ + "value": "Department, University, City, City_code 123456, C. R. Country_2" + }], + "full_name": "author_surname_2, author_name_1" + }], + "titles": [{ + "source": "WSP", + "title": "Article-title\u2019s" + }], + "dois": [{ + "source": "hepcrawl", + "value": "10.1142/S0219025717500060" + }], + "publication_info": [{ + "journal_volume": "30", + "year": 2017, + "journal_issue": "01", + "artid": "1750006", + "journal_title": "This is a journal title 2" + }], + "document_type": ["article"], + "abstracts": [{ + "source": "WSP", + "value": "Abstract L\u00e9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa." + }], + "imprints": [{ + "date": "2017-03-30T00:00:00" + }], + "citeable": true +}, { + "refereed": true, + "acquisition_source": { + "source": "WSP", + "method": "hepcrawl", + "submission_number": "a5f2858a26a211e790b40242ac120005", + "datetime": "2017-04-21T14:56:11.026428" + }, + "copyright": [{ + "url": "article", + "holder": "Copyright Holder" + }], + "public_notes": [{ + "source": "hepcrawl", + "value": "Communicated by B. O. Bob" + }], + "number_of_pages": 21, + "authors": [{ + "affiliations": [{ + "value": "Department, University, City, City_code 123456, C. R. Country_1" + }], + "full_name": "author_surname_1, author_name_1" + }, { + "affiliations": [{ + "value": "Department, University, City, City_code 123456, C. R. Country_1" + }], + "full_name": "author_surname_2, author_name_2" + }], + "titles": [{ + "source": "WSP", + "title": "Article-title L\u00e9vy char" + }], + "dois": [{ + "source": "hepcrawl", + "value": "10.1142/S0219025717500023" + }], + "publication_info": [{ + "journal_volume": "20", + "year": 2017, + "journal_issue": "01", + "artid": "1750002", + "journal_title": "This is a journal title 1" + }], + "document_type": ["article"], + "abstracts": [{ + "source": "WSP", + "value": "Abstract L\u00e9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa L2-bla blaaa blaa, Bla\u2019s bla, bla blaaa blaa." + }], + "imprints": [{ + "date": "2017-03-30T00:00:00" + }], + "citeable": true +}] diff --git a/tests/functional/WSP/test_wsp.py b/tests/functional/WSP/test_wsp.py new file mode 100644 index 00000000..3d26cdb0 --- /dev/null +++ b/tests/functional/WSP/test_wsp.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. 
+# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for WSP spider""" + +from __future__ import absolute_import, print_function, unicode_literals + +import pytest +import json +import os + +from itertools import islice +from scrapyd_api import ScrapydAPI +from time import sleep + +from tests.functional.tasks import app + + +class CeleryMonitor(object): + def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100): + self.results = [] + self.recv = None + self.app = app + self.connection = None + self.monitor_timeout = monitor_timeout + self.monitor_iter_limit = monitor_iter_limit + + def __enter__(self): + state = self.app.events.State() + + def announce_succeeded_tasks(event): + state.event(event) + task = state.tasks.get(event['uuid']) + print('TASK SUCCEEDED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + tasks = app.AsyncResult(task.id) + for task in tasks.result: + self.results.append(task) + self.recv.should_stop = True + + def announce_failed_tasks(event): + state.event(event) + task = state.tasks.get(event['uuid']) + print('TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + self.results.append(task.info()) + self.recv.should_stop = True + + self.app.control.enable_events() + self.connection = self.app.connection() + self.recv = self.app.events.Receiver(self.connection, handlers={ + 'task-succeeded': announce_succeeded_tasks, + 'task-failed': announce_failed_tasks, + }) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + events_iter = self.recv.itercapture(limit=None, timeout=self.monitor_timeout, wakeup=True) + self._wait_for_results(events_iter) + self.connection.__exit__() + + def _wait_for_results(self, events_iter): + any(islice(events_iter, self.monitor_iter_limit)) + + @classmethod + def do_crawl(cls, + app, + monitor_timeout, + monitor_iter_limit, + crawler_instance, + project='hepcrawl', + spider='WSP', + settings=None, + **crawler_arguments): + + if settings is None: + settings = {} + + with cls(app, monitor_timeout=monitor_timeout, monitor_iter_limit=monitor_iter_limit) as my_monitor: + crawler_instance.schedule( + project=project, + spider=spider, + settings=settings or {}, + **crawler_arguments + ) + + return my_monitor.results + + +def get_crawler_instance(crawler_host, *args, **kwargs): + """Return current crawler instance.""" + return ScrapydAPI( + crawler_host, + *args, + **kwargs + ) + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.fixture(scope="module") +def expected_results(): + file_name = 'fixtures/wsp_smoke_records.json' + responses_dir = os.path.dirname(os.path.realpath(__file__)) + response_file = os.path.join(responses_dir, file_name) + + with open(response_file) as fd: + expected_data = json.load(fd) + + return expected_data + + +@pytest.fixture(scope="module") +def set_up_environment(): + netrc_location = os.path.join(os.path.dirname( + os.path.realpath(__file__)), + 'fixtures/ftp_server/.netrc' + ) + + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'ftp_host': 'ftp_server', + 'ftp_netrc': netrc_location, + } + } + + +def test_wsp_normal_set_of_records(set_up_environment, 
expected_results): + crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL')) + + # The test must wait until the docker environment is up (takes about 5 seconds). + sleep(5) + + results = CeleryMonitor.do_crawl( + app=app, + monitor_timeout=5, + monitor_iter_limit=100, + crawler_instance=crawler, + project=set_up_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_environment.get('CRAWLER_ARGUMENTS') + ) + + assert [override_generated_fields(result) for result in results] == \ + [override_generated_fields(expected) for expected in expected_results] diff --git a/tests/functional/__init__.py b/tests/functional/__init__.py new file mode 100644 index 00000000..e8c02e63 --- /dev/null +++ b/tests/functional/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. diff --git a/tests/functional/scrapyd_coverage_runner.conf b/tests/functional/scrapyd_coverage_runner.conf new file mode 100644 index 00000000..3851724e --- /dev/null +++ b/tests/functional/scrapyd_coverage_runner.conf @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + + +[scrapyd] +runner = tests.functional.scrapyd_coverage_runner diff --git a/tests/functional/scrapyd_coverage_runner.py b/tests/functional/scrapyd_coverage_runner.py new file mode 100644 index 00000000..bbb891ec --- /dev/null +++ b/tests/functional/scrapyd_coverage_runner.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + + +from scrapyd.runner import main + +import coverage + + +cov = coverage.Coverage() + + +def start_coverage(): + cov.start() + coverage.process_startup() + + +def save_coverage(): + cov.stop() + cov.save() + + +if __name__ == '__main__': + print("\n--------------- CUSTOM SCRAPYD RUNNER ----------------\n") + + start_coverage() + main() + save_coverage() diff --git a/tests/functional/tasks.py b/tests/functional/tasks.py new file mode 100644 index 00000000..bdef104b --- /dev/null +++ b/tests/functional/tasks.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2015, 2016, 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. 
+ +"""Celery tasks for dealing with crawler.""" + +from __future__ import absolute_import, print_function, unicode_literals + +import json + +from six.moves.urllib.parse import urlparse + +from celery import Celery + + +class Config(object): + CELERY_RESULT_BACKEND = "amqp://guest:guest@rabbitmq:5672//" + BROKER_URL = "amqp://guest:guest@rabbitmq:5672//" + CELERY_ALWAYS_EAGER = True + CELERY_CACHE_BACKEND = 'memory' + CELERY_EAGER_PROPAGATES_EXCEPTIONS = True + + +app = Celery() +app.config_from_object(Config) + + +@app.task +def submit_results(job_id, errors, log_file, results_uri, results_data=None): + """Receive the submission of the results of a crawl job.""" + + def _extract_results_data(results_path): + results_data = [] + with open(results_path) as records: + lines = ( + line.strip() for line in records if line.strip() + ) + + for line in lines: + record = json.loads(line) + results_data.append(record) + + return results_data + + results_path = urlparse(results_uri).path + + if results_data is None: + results_data = _extract_results_data(results_path) + + return results_data