From 81735b8987933e7ac14a66103775337b20b3b079 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Tue, 15 Aug 2017 10:38:58 +0200 Subject: [PATCH] pos: add functional test Signed-off-by: Spiros Delviniotis --- .travis.yml | 1 + docker-compose.test.yml | 16 +++ hepcrawl/spiders/pos_spider.py | 39 +++----- .../pos/fixtures/https_server/conf/proxy.conf | 17 ++++ .../fixtures/https_server/conf/ssl/cert.key | 28 ++++++ .../fixtures/https_server/conf/ssl/cert.pem | 19 ++++ .../records/PoS(LATTICE 2013)001.html | 55 +++++++++++ .../pos/fixtures/oai_harvested/pos_record.xml | 33 +++++++ .../functional/pos/fixtures/pos_records.json | 57 +++++++++++ tests/functional/pos/test_pos.py | 97 +++++++++++++++++++ tests/unit/test_pos.py | 4 +- 11 files changed, 338 insertions(+), 28 deletions(-) create mode 100644 tests/functional/pos/fixtures/https_server/conf/proxy.conf create mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.key create mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem create mode 100644 tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html create mode 100644 tests/functional/pos/fixtures/oai_harvested/pos_record.xml create mode 100644 tests/functional/pos/fixtures/pos_records.json create mode 100644 tests/functional/pos/test_pos.py diff --git a/.travis.yml b/.travis.yml index 7682b90c..6bc66b84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ env: - SUITE=functional_arxiv - SUITE=functional_desy - SUITE=functional_cds + - SUITE=functional_pos matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 65bb864b..a1e93998 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -56,6 +56,13 @@ services: links: - scrapyd + functional_pos: + <<: *service_base + command: py.test -vv tests/functional/pos + links: + - scrapyd + - server.local + unit: <<: *service_base command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*" @@ -96,5 +103,14 @@ services: - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd + server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl + - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/ + ports: + - 443:443 + rabbitmq: image: rabbitmq diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index da0faf73..ab5083ed 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,6 +41,7 @@ class POSSpider(StatefulSpider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' + # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, @@ -83,24 +84,18 @@ def scrape_conference_paper(self, response): response=response, ) - # TODO Yield request for Conference page - proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - proceedings_identifier = proceedings_identifier.split('=')[1] - pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) - self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) + # # Yield request for Conference page + # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + # proceedings_identifier = proceedings_identifier.split('=')[1] + # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) # yield Request(pos_url, callback=self.scrape_proceedings) - yield self.build_conference_paper_item(response) + return self.build_conference_paper_item(response) - def scrape_proceedings(self, response): - # TODO create proceedings record - # TODO document_type = proceeding - # TODO title = template(“Proceedings, ”) - # TODO subtitle = template(“<place>, <date>”) - # TODO publication_info.journal_title = “PoS” - # TODO publication_info.journal_volume = identifier - - pass + # def scrape_proceedings(self, response): + # # create proceedings record + # import pytest + # pytest.set_trace() def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" @@ -174,7 +169,7 @@ def _get_journal_artid(identifier): def _get_ext_systems_number(node): return [ { - 'institute': 'pos', + 'institute': 'PoS', 'value': node.xpath('.//identifier/text()').extract_first() }, ] @@ -201,18 +196,10 @@ def _get_authors(node): # To be refactored ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append( - { - 'value': affiliation - } - ) + auth_dict['affiliations'].append({'value': affiliation}) # Todo probably to remove else: - auth_dict['affiliations'] = [ - { - 'value': affiliation - }, - ] + auth_dict['affiliations'] = [{'value': affiliation}, ] if auth_dict: authors.append(auth_dict) return authors diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf new file mode 100644 index 00000000..f4235640 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -0,0 +1,17 @@ +server { + listen 443 ssl; + server_name localhost; + + ssl on; + ssl_protocols TLSv1 TLSv1.1 TLSv1.2; + ssl_certificate ssl/cert.pem; + ssl_certificate_key ssl/cert.key; + + location ~ /contribution { + if ($args ~* "^id=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } +} diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key new file mode 100755 index 00000000..19e1df68 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQChhBiOoipMRRcc +E5waKrGB01/QtRpfIGp5KmJfnif05dR05wWojHO6EtabZ2qbXtcSuyQ0vRNpbZUU +OzcriFOMk8dujDzuKMkegsq/LE4PyN/R5JZtf34NyGG7v70K6Uq7RV4PUzk+zoum +1McMUBk1QlGP/E9RsDlSPv9XOblUpicPDuwhCwPf4zi6jporgXjDJ/iUuh+bexxv +40R7f2dCWkiHYiNiLNLTwXdYkaWBcc3HoTq9FEZZhYDhWRjX0/TuINmMr5lbUvr6 +UYRABOS4VeUyHpb/e7OH9WXQxzR76LuQFfQDSgs0GxXw1KG58aq+P0ni2E77C4Iu +odQ8iT+jAgMBAAECggEBAIqJeFrXY7p5xIGznEChgBHgUR3+SPlxH4KARVLIoHMh +s2L2SVcx6Y2f3O38/Wb5KTcKx9polz7l3Go3BHJVg3xfwT7kENsipqeB/g+OHALU +BI7PJ+wR3/hIePQGWUsDobMRo8U3WDG0DfryJS09gvG4yabb/tkNc41FNdUGUR31 +7VInQFqv2/jZ/2A3s3DZ0Cns9vJuLhmf7629k3MVCuU7Rh0rStnVCA70kjgKzOfP ++26fnfd/MmrQYbaukw04+cwcwifGkF5Jis80qTWsgdF82rkzpwJLDo0Jd2HZFuOa +AHkWK2QiMzb6PS2Uo7Zarax9E+W2TLahANXZQQ32NAkCgYEAzKw7XbEwzWG/T7yX +EgNIAN7YtcGYr9sfHlVJ8bWYK7GZBbCkKDlGU+YGRE++plh/jtXYjsIFElWtv01Y +UpqBdWf7p8mXdtVoq6YyL5WuQVMwpjKHvegTXXwAoreEXZeKr1LKC11B14h+8wsR +D5uf0GVmdw12nSrzeu3Q4oSgss8CgYEAygU++fItIYuPtZfrC8qDcyEiOLQmAHtX +eTnEHOPy8ik+bdwF5Rg0nzxLu3RZ47ykGdEOzpGRO4B9V1EevwSEzX6VO7latMUS +cLKb3Y0bXm6qQcWG+LAlvyaHfAH0oN47xfScLDiUm6BKd4Eo9kpkgaQzSgUfFZNQ +6DHiA3Emau0CgYEAyel7Y3GjMGomvrXQ3x9HkDxH0/7Z71qe92CyYvZ/2VMKH9fk +Ch5+p9P8CLYW4anapQGH80WqlSzbDCd0Y4EzB6z+UceJWd0stnFtfw4N6znze3HM +AegJ+qaTRfL/bQlL8qwc0Fs+0i9A9enL+fbQEVmHXRl2E5TEwFgOQvkOQ3cCgYAA +4bD6qkHkKZXA9x7BeGrGb9iUYsTfr6ocD1J5xczjnaZ2GEW2UDq6jyrNcJ6LzeDx +c+YapKv7lH33iZUWxFBIDUtdbVul+k4wS7c+akU6TkVT8Ca8oxgnE2X39pI4uX+N +R5n+32hWnYZ1qwygtoZlwm+u3QLbtz7dJIqV9UJzqQKBgQCL8Xo9LA0Dm7ZsdDDI +I93YsjCELvBsonymmD1MTpk7uIA+qH8LAih+Vhonc17NtpXuas8eqc8ntuNLAgON +Tylvk32uaRqquHWl6MT7bwaaK7pD8KuOIUJdl5SEc+DDUcB2A2XLg7Yv08Dus8A7 +6J5oH8YJ3hqmVGZzbOo75IFerg== +-----END PRIVATE KEY----- diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem new file mode 100755 index 00000000..1418c1bb --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDATCCAemgAwIBAgIJAJRKy2TWwZqTMA0GCSqGSIb3DQEBCwUAMBcxFTATBgNV +BAMMDGh0dHBzX3NlcnZlcjAeFw0xNzA4MTQxNDQ1MTFaFw0yMDA2MDMxNDQ1MTFa +MBcxFTATBgNVBAMMDGh0dHBzX3NlcnZlcjCCASIwDQYJKoZIhvcNAQEBBQADggEP +ADCCAQoCggEBAKGEGI6iKkxFFxwTnBoqsYHTX9C1Gl8gankqYl+eJ/Tl1HTnBaiM +c7oS1ptnapte1xK7JDS9E2ltlRQ7NyuIU4yTx26MPO4oyR6Cyr8sTg/I39Hklm1/ +fg3IYbu/vQrpSrtFXg9TOT7Oi6bUxwxQGTVCUY/8T1GwOVI+/1c5uVSmJw8O7CEL +A9/jOLqOmiuBeMMn+JS6H5t7HG/jRHt/Z0JaSIdiI2Is0tPBd1iRpYFxzcehOr0U +RlmFgOFZGNfT9O4g2YyvmVtS+vpRhEAE5LhV5TIelv97s4f1ZdDHNHvou5AV9ANK +CzQbFfDUobnxqr4/SeLYTvsLgi6h1DyJP6MCAwEAAaNQME4wHQYDVR0OBBYEFAfu +RxroDak/yro7MbRfDogKVDmBMB8GA1UdIwQYMBaAFAfuRxroDak/yro7MbRfDogK +VDmBMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAF5M/Gz6JDC1HoSm +6HFLBB9ul9TQQI3RhohwreCYyeZ866WrvqZfle+lxcgVburYCSyi5paFpvNK3DH2 +J0A2fDAMekZGcaJ7O5Zx0evTCwXoxDOhS+xO5IlGTXWCEKLeLkU27WJiLC9cTbFr +kfjL14IMnsioRzUz4a+aX5JllqnEccCDlHjSk1w5YvOvt6GC6Bvenouja2apPes/ +oJJpFwZVO0epqOQo1ndRGbt5NLv6YgZlvdFXWoKNKohzdfDV/RbW9BrbpyKSxFTm +usrmVcZTQpSf69zbnEVO8N3N6c1zNdETPON1ZGLW1O1MXWkQDZniH6LduXN/Oob7 +vYqvXlw= +-----END CERTIFICATE----- diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html new file mode 100644 index 00000000..e080cb39 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html @@ -0,0 +1,55 @@ +<!DOCTYPE html + PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US"> +<head> +<title>PoS(LATTICE 2013)001 + + + + + + + + + +

PoS(LATTICE 2013)001

+ +
+ +
+ Title + Heavy Flavour Physics Review +
+ +
+ Conference + 31st International Symposium on Lattice Field Theory LATTICE 2013 +
+ +
+ Authors +
+A. El-Khadra
+
+ + +
+ Contribution + pdf +
+ + + + +
+ + + + + diff --git a/tests/functional/pos/fixtures/oai_harvested/pos_record.xml b/tests/functional/pos/fixtures/oai_harvested/pos_record.xml new file mode 100644 index 00000000..f65dfb9e --- /dev/null +++ b/tests/functional/pos/fixtures/oai_harvested/pos_record.xml @@ -0,0 +1,33 @@ + + +2015-01-29T13:44:13Z + +https://pos.sissa.it/cgi-bin/oai/oai-script-spires-extended.cgi + + + +
+ oai:pos.sissa.it:LATTICE 2013/001 + 2014-04-28 + conference:LATTICE 2013 + group:9 +
+ + + Heavy Flavour Physics Review + Aida El-KhadraINFN and Università di Firenze + M. T. MacDonaldU of PecsLattice Field Theory + 31st International Symposium on Lattice Field Theory LATTICE 2013; Plenary sessions + Sissa Medialab + 2014-03-19T21:09:30Z + Text + application/pdf + PoS(LATTICE 2013)001 + en + LATTICE 2013 (31st International Symposium on Lattice Field Theory LATTICE 2013) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike + + +
+
+
diff --git a/tests/functional/pos/fixtures/pos_records.json b/tests/functional/pos/fixtures/pos_records.json new file mode 100644 index 00000000..ee8b88af --- /dev/null +++ b/tests/functional/pos/fixtures/pos_records.json @@ -0,0 +1,57 @@ +[ + { + "acquisition_source": { + "source": "pos", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "license": [ + { + "url": "https://creativecommons.org/licenses/by-nc-sa/3.0", + "license": "CC-BY-NC-SA-3.0" + } + ], + "titles": [ + { + "source": "Sissa Medialab", + "title": "Heavy Flavour Physics Review" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "INFN and Universit\u00e0 di Firenze" + } + ], + "full_name": "El-Khadra, Aida" + }, + { + "affiliations": [ + { + "value": "U of Pecs" + } + ], + "full_name": "MacDonald, M.T." + } + ], + "publication_info": [ + { + "journal_volume": "LATTICE 2013", + "year": 2014, + "artid": "001", + "journal_title": "PoS" + } + ], + "document_type": [ + "conference paper" + ], + "imprints": [ + { + "date": "2014-03-19" + } + ], + "citeable": true + } +] diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py new file mode 100644 index 00000000..582575bb --- /dev/null +++ b/tests/functional/pos/test_pos.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for PoS spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.fixture(scope="function") +def set_up_oai_environment(): + package_location = get_test_suite_path( + 'pos', + 'fixtures', + 'oai_harvested', + 'pos_record.xml', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). + sleep(10) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'source_file': 'file://' + package_location, + 'base_conference_paper_url': 'https://server.local/contribution?id=', + } + } + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'pos', + 'fixtures', + 'pos_records.json', + ), + ], + ids=[ + 'conference_paper_record_only', + ] +) +def test_pos_conference_paper_record( + set_up_oai_environment, + expected_results, +): + crawler = get_crawler_instance(set_up_oai_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=1, + crawler_instance=crawler, + project=set_up_oai_environment.get('CRAWLER_PROJECT'), + spider='pos', + settings={}, + **set_up_oai_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + +# TODO create test that receives conference paper record AND proceedings record. + + +# TODO create test that receives proceedings record ONLY. diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 94705248..6eb82940 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -66,7 +66,7 @@ def generated_record(scrape_pos_page_body): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - parsed_item = request.callback(response) + parsed_item = request.callback(response).next() parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record @@ -155,7 +155,7 @@ def test_pipeline_record(generated_record): 'acquisition_source': { 'datetime': '2017-08-10T16:03:59.091110', 'method': 'hepcrawl', - 'source': 'PoS', + 'source': 'pos', 'submission_number': 'scrapy_job' }, 'authors': [