From 92d420f998d2013f0f2383498adf7c1e33a5b16c Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 10 Aug 2017 16:20:06 +0200 Subject: [PATCH 01/14] pos: first implementation Signed-off-by: Spiros Delviniotis Signed-off-by: David Caro --- hepcrawl/spiders/pos_spider.py | 217 ++++++++++-------- hepcrawl/tohep.py | 4 +- .../pos/sample_proceedings_page.html | 134 +++++++++++ tests/unit/test_pos.py | 112 +++++++-- 4 files changed, 350 insertions(+), 117 deletions(-) create mode 100644 tests/unit/responses/pos/sample_proceedings_page.html diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index c388a487..da0faf73 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -32,108 +32,109 @@ class POSSpider(StatefulSpider): """POS/Sissa crawler. Extracts from metadata: - * title - * article-id - * conf-acronym - * authors - * affiliations - * publication-date - * publisher - * license - * language - * link + todo:: be added... Example: :: - $ scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml + $ scrapy crawl PoS \\ + -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ - name = 'PoS' - pos_base_url = "https://pos.sissa.it/contribution?id=" - - def __init__(self, source_file=None, **kwargs): + name = 'pos' + + def __init__( + self, + source_file=None, + base_conference_paper_url='https://pos.sissa.it/contribution?id=', + base_proceedings_url='https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=', + # TODO to be changed without question in the url + # TODO make valid CA certificate + **kwargs + ): """Construct POS spider.""" super(POSSpider, self).__init__(**kwargs) self.source_file = source_file + self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url + self.BASE_PROCEEDINGS_URL = base_proceedings_url def start_requests(self): yield Request(self.source_file) def parse(self, response): """Get PDF information.""" + self.log('Got record from: {response.url}'.format(**vars())) + node = response.selector node.remove_namespaces() for record in node.xpath('.//record'): identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first() if identifier: # Probably all links lead to same place, so take first - pos_url = "{0}{1}".format(self.pos_base_url, identifier) - request = Request(pos_url, callback=self.scrape_pos_page) + conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) + request = Request(conference_paper_url, callback=self.scrape_conference_paper) request.meta["url"] = response.url request.meta["record"] = record.extract() yield request - def scrape_pos_page(self, response): + def scrape_conference_paper(self, response): """Parse a page for PDF link.""" - response.meta["pos_pdf_url"] = response.selector.xpath( - "//a[contains(text(),'pdf')]/@href" - ).extract_first() - response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, response.meta["pos_pdf_url"]) response.meta["pos_url"] = response.url - return self.build_item(response) + response.meta["conference_paper_pdf_url"] = self._get_conference_paper_pdf_url( + response=response, + ) - def build_item(self, response): - """Parse an PoS XML exported file into a HEP record.""" - text = response.meta["record"] - node = Selector(text=text, type="xml") - node.remove_namespaces() - record = HEPLoader(item=HEPRecord(), selector=node) - record.add_xpath('title', '//metadata/pex-dc/title/text()') - record.add_xpath('source', '//metadata/pex-dc/publisher/text()') + # TODO Yield request for 
Conference page + proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + proceedings_identifier = proceedings_identifier.split('=')[1] + pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) + # yield Request(pos_url, callback=self.scrape_proceedings) - record.add_value('external_system_numbers', self._get_ext_systems_number(node)) + yield self.build_conference_paper_item(response) - license = get_licenses( - license_text=node.xpath( - ".//metadata/pex-dc/rights/text()" - ).extract_first(), - ) - record.add_value('license', license) + def scrape_proceedings(self, response): + # TODO create proceedings record + # TODO document_type = proceeding + # TODO title = template(“Proceedings, ”) + # TODO subtitle = template(“<place>, <date>”) + # TODO publication_info.journal_title = “PoS” + # TODO publication_info.journal_volume = identifier - date, year = self._get_date(node) - if date: - record.add_value('date_published', date) - if year: - record.add_value('journal_year', int(year)) + pass - identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() - record.add_value('urls', response.meta['pos_url']) - if response.meta['pos_pdf_url']: - record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']}) - if identifier: - pbn = re.split('[()]', identifier) - if len(pbn) == 3: - conf_acronym = pbn[1] - article_id = pbn[2] - record.add_value('journal_title', pbn[0]) - record.add_value('journal_volume', conf_acronym) - record.add_value('journal_artid', article_id) - else: - record.add_value('pubinfo_freetext', identifier) + def build_conference_paper_item(self, response): + """Parse an PoS XML exported file into a HEP record.""" + meta = response.meta + xml_record = meta.get('record') + node = Selector( + text=xml_record, + type="xml" + ) + node.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=node + ) - language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() - if language: - record.add_value('language', language) + license_text = node.xpath('.//metadata/pex-dc/rights/text()').extract_first() + record.add_value('license', get_licenses(license_text=license_text)) - authors = self._get_authors(node) - if authors: - record.add_value('authors', authors) + date, year = self._get_date(node=node) + record.add_value('date_published', date) + record.add_value('journal_year', year) - extra_data = self._get_extra_data(node) - if extra_data: - record.add_value('extra_data', extra_data) + identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() + record.add_value('journal_title', self._get_journal_title(identifier=identifier)) + record.add_value('journal_volume', self._get_journal_volume(identifier=identifier)) + record.add_value('journal_artid', self._get_journal_artid(identifier=identifier)) - record.add_value('collections', ['HEP', 'ConferencePaper']) + record.add_xpath('title', '//metadata/pex-dc/title/text()') + record.add_xpath('source', '//metadata/pex-dc/publisher/text()') + record.add_value('external_system_numbers', self._get_ext_systems_number(node=node)) + record.add_value('language', self._get_language(node=node)) + record.add_value('authors', self._get_authors(node=node)) + record.add_value('collections', ['conferencepaper']) + record.add_value('urls', meta.get('pos_url')) parsed_item = ParsedItem( record=record.load_item(), @@ -142,50 
+143,76 @@ def build_item(self, response): return parsed_item - def _get_ext_systems_number(self, node): + def _get_conference_paper_pdf_url(self, response): + conference_paper_pdf_url = response.selector.xpath( + "//a[contains(text(),'pdf')]/@href", + ).extract_first() + + return urljoin( + self.BASE_CONFERENCE_PAPER_URL, + conference_paper_pdf_url, + ) + + @staticmethod + def _get_language(node): + language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() + return language if language != 'en' else None + + @staticmethod + def _get_journal_title(identifier): + return re.split('[()]', identifier)[0] + + @staticmethod + def _get_journal_volume(identifier): + return re.split('[()]', identifier)[1] + + @staticmethod + def _get_journal_artid(identifier): + return re.split('[()]', identifier)[2] + + @staticmethod + def _get_ext_systems_number(node): return [ { - 'institute': 'PoS', - 'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first() - }, - { - 'institute': 'PoS', + 'institute': 'pos', 'value': node.xpath('.//identifier/text()').extract_first() }, ] - def _get_date(self, node): - """Get article date.""" - date = '' - year = '' + @staticmethod + def _get_date(node): full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first() date = create_valid_date(full_date) - if date: - year = date[0:4] + year = int(date[0:4]) + return date, year - def _get_authors(self, node): + @staticmethod + def _get_authors(node): # To be refactored """Get article authors.""" - author_selectors = node.xpath('.//metadata/pex-dc/creator') authors = [] - for selector in author_selectors: + creators = node.xpath('.//metadata/pex-dc/creator') + for creator in creators: auth_dict = {} - author = Selector(text=selector.extract()) - auth_dict['raw_name'] = \ - get_first(author.xpath('.//name//text()').extract(), default='') + author = Selector(text=creator.extract()) + auth_dict['raw_name'] = get_first( + author.xpath('.//name//text()').extract(), + default='', + ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append({'value': affiliation}) + auth_dict['affiliations'].append( + { + 'value': affiliation + } + ) + # Todo probably to remove else: - auth_dict['affiliations'] = [{'value': affiliation}, ] + auth_dict['affiliations'] = [ + { + 'value': affiliation + }, + ] if auth_dict: authors.append(auth_dict) return authors - - def _get_extra_data(self, node): - """Get info to help selection - not for INSPIRE record""" - extra_data = {} - - section = node.xpath(".//metadata/pex-dc/description/text()").extract_first() - extra_data['section'] = section.split(';', 1)[-1].strip() - return extra_data diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 980a030d..727600f8 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -92,7 +92,7 @@ def _normalize_hepcrawl_record(item, source): item['titles'] = [{ 'title': item.pop('title', ''), 'subtitle': item.pop('subtitle', ''), - 'source': source, + 'source': item.pop('source', source), }] item['abstracts'] = [{ @@ -242,7 +242,7 @@ def _filter_affiliation(affiliations): for author in crawler_record.get('authors', []): builder.add_author(builder.make_author( - author['full_name'], + full_name=author['full_name'], affiliations=_filter_affiliation(author['affiliations']), )) diff --git a/tests/unit/responses/pos/sample_proceedings_page.html b/tests/unit/responses/pos/sample_proceedings_page.html new file mode 100644 index 00000000..669e77b4 --- 
/dev/null +++ b/tests/unit/responses/pos/sample_proceedings_page.html @@ -0,0 +1,134 @@ +<!DOCTYPE html> +<html> + <head> + <title>31st International Symposium on Lattice Field Theory LATTICE 2013 + + + + + Main Image + + + +

31st International Symposium on Lattice Field Theory LATTICE 2013

+ + + + +
LATTICE 2013 - (other lattice conferences)
+
29 July – 3 August, 2013
Mainz, Germany
+ +
+

The annual lattice symposium brings together a global community of researchers
+ from theoretical particle physics and beyond, who employ numerical and
+ computational methods to study the properties of strongly interacting physical
+ systems, above all Quantum Chromodynamics (QCD), the theory describing the
+ interactions of quarks and gluons. Topics include studies of the spectrum and
+ structure of hadrons, lattice studies of matter under extreme conditions,
+ hadronic contributions to weak decay amplitudes, as well as recent
+ developments in simulation algorithms and computer hardware. The 2013
+ conference in Mainz was attended by over 500 participants from all over the
+ globe, making it the biggest in this series so far.
+

+

This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth
+ G. Wilson (June 8, 1936 - June 15, 2013).
+

+
+
conference main image
+
+ + + + + + + + + + + + + + + + + + +
Sessions
Preface
Plenary sessions
Algorithms and Machines
Applications beyond QCD
Physics beyond the Standard Model
Chiral Symmetry
Non-zero Temperature and Density
Hadron Spectroscopy and Interactions
Hadron Structure
Standard Model Parameters and Renormalization
Theoretical Developments
Vacuum Structure and Confinement
Weak Decays and Matrix Elements
Special Session: Coding Efforts
Posters
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Preface
Foreword
+ PoS(LATTICE 2013)503 + pdf + H. Wittig +
Ken Wilson Obituary
+ PoS(LATTICE 2013)504 + pdf + A. Kronfeld +
Plenary sessions
Heavy Flavour Physics Review
+ PoS(LATTICE 2013)001 + pdf + A. El-Khadra +
Charmonium, $D_s$ and $D_s^*$ from overlap fermion on domain wall fermion configurations
+ PoS(LATTICE 2013)500 + pdf + Y.b. Yang, Y. Chen, A. Alexandru, S.J. Dong, T. Draper, M. Gong, F. Lee, A. Li, K.F. Liu, Z. Liu, M. Lujan and N. Mathur +
+
+ + + + + + + + + + diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 7ed1dee6..94705248 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -25,7 +25,13 @@ ) -@pytest.fixture +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-08-10T16:03:59.091110' + + return record + + +@pytest.fixture(scope='session') def scrape_pos_page_body(): return pkg_resources.resource_string( __name__, @@ -37,9 +43,14 @@ def scrape_pos_page_body(): ) -@pytest.fixture -def record(scrape_pos_page_body): +@pytest.fixture(scope='session') +def generated_record(scrape_pos_page_body): """Return results generator from the PoS spider.""" + # environmental variables needed for the pipelines payload + os.environ['SCRAPY_JOB'] = 'scrapy_job' + os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri' + os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file' + crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) request = spider.parse( @@ -52,6 +63,7 @@ def record(scrape_pos_page_body): **{'encoding': 'utf-8'} ) assert response + pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) parsed_item = request.callback(response) @@ -63,43 +75,43 @@ def record(scrape_pos_page_body): clean_dir() -def test_titles(record): +def test_titles(generated_record): """Test extracting title.""" expected_titles = [ { - 'source': 'PoS', + 'source': 'Sissa Medialab', 'title': 'Heavy Flavour Physics Review', } ] - assert 'titles' in record - assert record['titles'] == expected_titles + assert 'titles' in generated_record + assert generated_record['titles'] == expected_titles @pytest.mark.xfail(reason='License texts are not normalized and converted to URLs') -def test_license(record): +def test_license(generated_record): """Test extracting license information.""" expected_license = [{ 'license': 'CC BY-NC-SA 3.0', 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0', }] - assert record['license'] == expected_license + assert generated_record['license'] == expected_license -def test_collections(record): +def test_collections(generated_record): """Test extracting collections.""" expected_document_type = ['conference paper'] - assert record.get('citeable') - assert record.get('document_type') == expected_document_type + assert generated_record.get('citeable') + assert generated_record.get('document_type') == expected_document_type -def test_language(record): +def test_language(generated_record): """Test extracting language.""" - assert 'language' not in record + assert 'language' not in generated_record -def test_publication_info(record): +def test_publication_info(generated_record): """Test extracting dois.""" expected_pub_info = [{ 'artid': '001', @@ -108,13 +120,13 @@ def test_publication_info(record): 'year': 2014, }] - assert 'publication_info' in record + assert 'publication_info' in generated_record - pub_info = record['publication_info'] + pub_info = generated_record['publication_info'] assert pub_info == expected_pub_info -def test_authors(record): +def test_authors(generated_record): """Test authors.""" expected_authors = [ { @@ -127,12 +139,72 @@ def test_authors(record): } ] - assert 'authors' in record + assert 'authors' in generated_record - result_authors = record['authors'] + result_authors = generated_record['authors'] assert len(result_authors) == len(expected_authors) # here we are making sure order is kept for author, expected_author in zip(result_authors, expected_authors): assert author == 
expected_author + + +def test_pipeline_record(generated_record): + expected = { + 'acquisition_source': { + 'datetime': '2017-08-10T16:03:59.091110', + 'method': 'hepcrawl', + 'source': 'PoS', + 'submission_number': 'scrapy_job' + }, + 'authors': [ + { + 'affiliations': [ + { + 'value': u'INFN and Universit\xe0 di Firenze' + } + ], + 'full_name': u'El-Khadra, Aida' + }, + { + 'affiliations': [ + { + 'value': u'U of Pecs' + } + ], + 'full_name': u'MacDonald, M.T.' + } + ], + 'citeable': True, + 'document_type': [ + 'conference paper' + ], + 'imprints': [ + { + 'date': '2014-03-19' + } + ], + 'license': [ + { + 'license': 'CC-BY-NC-SA-3.0', + 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0' + } + ], + 'publication_info': [ + { + 'artid': u'001', + 'journal_title': u'PoS', + 'journal_volume': u'LATTICE 2013', + 'year': 2014 + } + ], + 'titles': [ + { + 'source': u'Sissa Medialab', + 'title': u'Heavy Flavour Physics Review' + } + ] + } + + assert override_generated_fields(generated_record) == expected From 81735b8987933e7ac14a66103775337b20b3b079 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Tue, 15 Aug 2017 10:38:58 +0200 Subject: [PATCH 02/14] pos: add functional test Signed-off-by: Spiros Delviniotis --- .travis.yml | 1 + docker-compose.test.yml | 16 +++ hepcrawl/spiders/pos_spider.py | 39 +++----- .../pos/fixtures/https_server/conf/proxy.conf | 17 ++++ .../fixtures/https_server/conf/ssl/cert.key | 28 ++++++ .../fixtures/https_server/conf/ssl/cert.pem | 19 ++++ .../records/PoS(LATTICE 2013)001.html | 55 +++++++++++ .../pos/fixtures/oai_harvested/pos_record.xml | 33 +++++++ .../functional/pos/fixtures/pos_records.json | 57 +++++++++++ tests/functional/pos/test_pos.py | 97 +++++++++++++++++++ tests/unit/test_pos.py | 4 +- 11 files changed, 338 insertions(+), 28 deletions(-) create mode 100644 tests/functional/pos/fixtures/https_server/conf/proxy.conf create mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.key create mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem create mode 100644 tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html create mode 100644 tests/functional/pos/fixtures/oai_harvested/pos_record.xml create mode 100644 tests/functional/pos/fixtures/pos_records.json create mode 100644 tests/functional/pos/test_pos.py diff --git a/.travis.yml b/.travis.yml index 7682b90c..6bc66b84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ env: - SUITE=functional_arxiv - SUITE=functional_desy - SUITE=functional_cds + - SUITE=functional_pos matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 65bb864b..a1e93998 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -56,6 +56,13 @@ services: links: - scrapyd + functional_pos: + <<: *service_base + command: py.test -vv tests/functional/pos + links: + - scrapyd + - server.local + unit: <<: *service_base command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*" @@ -96,5 +103,14 @@ services: - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd + server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl + - 
${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/ + ports: + - 443:443 + rabbitmq: image: rabbitmq diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index da0faf73..ab5083ed 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,6 +41,7 @@ class POSSpider(StatefulSpider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' + # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, @@ -83,24 +84,18 @@ def scrape_conference_paper(self, response): response=response, ) - # TODO Yield request for Conference page - proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - proceedings_identifier = proceedings_identifier.split('=')[1] - pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) - self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) + # # Yield request for Conference page + # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + # proceedings_identifier = proceedings_identifier.split('=')[1] + # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) # yield Request(pos_url, callback=self.scrape_proceedings) - yield self.build_conference_paper_item(response) + return self.build_conference_paper_item(response) - def scrape_proceedings(self, response): - # TODO create proceedings record - # TODO document_type = proceeding - # TODO title = template(“Proceedings, ”) - # TODO subtitle = template(“<place>, <date>”) - # TODO publication_info.journal_title = “PoS” - # TODO publication_info.journal_volume = identifier - - pass + # def scrape_proceedings(self, response): + # # create proceedings record + # import pytest + # pytest.set_trace() def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" @@ -174,7 +169,7 @@ def _get_journal_artid(identifier): def _get_ext_systems_number(node): return [ { - 'institute': 'pos', + 'institute': 'PoS', 'value': node.xpath('.//identifier/text()').extract_first() }, ] @@ -201,18 +196,10 @@ def _get_authors(node): # To be refactored ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append( - { - 'value': affiliation - } - ) + auth_dict['affiliations'].append({'value': affiliation}) # Todo probably to remove else: - auth_dict['affiliations'] = [ - { - 'value': affiliation - }, - ] + auth_dict['affiliations'] = [{'value': affiliation}, ] if auth_dict: authors.append(auth_dict) return authors diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf new file mode 100644 index 00000000..f4235640 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -0,0 +1,17 @@ +server { + listen 443 ssl; + server_name localhost; + + ssl on; + ssl_protocols TLSv1 TLSv1.1 TLSv1.2; + ssl_certificate ssl/cert.pem; + ssl_certificate_key ssl/cert.key; + + location ~ /contribution { + if ($args ~* "^id=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } +} diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key new file mode 100755 index 00000000..19e1df68 --- /dev/null +++ 
b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQChhBiOoipMRRcc +E5waKrGB01/QtRpfIGp5KmJfnif05dR05wWojHO6EtabZ2qbXtcSuyQ0vRNpbZUU +OzcriFOMk8dujDzuKMkegsq/LE4PyN/R5JZtf34NyGG7v70K6Uq7RV4PUzk+zoum +1McMUBk1QlGP/E9RsDlSPv9XOblUpicPDuwhCwPf4zi6jporgXjDJ/iUuh+bexxv +40R7f2dCWkiHYiNiLNLTwXdYkaWBcc3HoTq9FEZZhYDhWRjX0/TuINmMr5lbUvr6 +UYRABOS4VeUyHpb/e7OH9WXQxzR76LuQFfQDSgs0GxXw1KG58aq+P0ni2E77C4Iu +odQ8iT+jAgMBAAECggEBAIqJeFrXY7p5xIGznEChgBHgUR3+SPlxH4KARVLIoHMh +s2L2SVcx6Y2f3O38/Wb5KTcKx9polz7l3Go3BHJVg3xfwT7kENsipqeB/g+OHALU +BI7PJ+wR3/hIePQGWUsDobMRo8U3WDG0DfryJS09gvG4yabb/tkNc41FNdUGUR31 +7VInQFqv2/jZ/2A3s3DZ0Cns9vJuLhmf7629k3MVCuU7Rh0rStnVCA70kjgKzOfP ++26fnfd/MmrQYbaukw04+cwcwifGkF5Jis80qTWsgdF82rkzpwJLDo0Jd2HZFuOa +AHkWK2QiMzb6PS2Uo7Zarax9E+W2TLahANXZQQ32NAkCgYEAzKw7XbEwzWG/T7yX +EgNIAN7YtcGYr9sfHlVJ8bWYK7GZBbCkKDlGU+YGRE++plh/jtXYjsIFElWtv01Y +UpqBdWf7p8mXdtVoq6YyL5WuQVMwpjKHvegTXXwAoreEXZeKr1LKC11B14h+8wsR +D5uf0GVmdw12nSrzeu3Q4oSgss8CgYEAygU++fItIYuPtZfrC8qDcyEiOLQmAHtX +eTnEHOPy8ik+bdwF5Rg0nzxLu3RZ47ykGdEOzpGRO4B9V1EevwSEzX6VO7latMUS +cLKb3Y0bXm6qQcWG+LAlvyaHfAH0oN47xfScLDiUm6BKd4Eo9kpkgaQzSgUfFZNQ +6DHiA3Emau0CgYEAyel7Y3GjMGomvrXQ3x9HkDxH0/7Z71qe92CyYvZ/2VMKH9fk +Ch5+p9P8CLYW4anapQGH80WqlSzbDCd0Y4EzB6z+UceJWd0stnFtfw4N6znze3HM +AegJ+qaTRfL/bQlL8qwc0Fs+0i9A9enL+fbQEVmHXRl2E5TEwFgOQvkOQ3cCgYAA +4bD6qkHkKZXA9x7BeGrGb9iUYsTfr6ocD1J5xczjnaZ2GEW2UDq6jyrNcJ6LzeDx +c+YapKv7lH33iZUWxFBIDUtdbVul+k4wS7c+akU6TkVT8Ca8oxgnE2X39pI4uX+N +R5n+32hWnYZ1qwygtoZlwm+u3QLbtz7dJIqV9UJzqQKBgQCL8Xo9LA0Dm7ZsdDDI +I93YsjCELvBsonymmD1MTpk7uIA+qH8LAih+Vhonc17NtpXuas8eqc8ntuNLAgON +Tylvk32uaRqquHWl6MT7bwaaK7pD8KuOIUJdl5SEc+DDUcB2A2XLg7Yv08Dus8A7 +6J5oH8YJ3hqmVGZzbOo75IFerg== +-----END PRIVATE KEY----- diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem new file mode 100755 index 00000000..1418c1bb --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDATCCAemgAwIBAgIJAJRKy2TWwZqTMA0GCSqGSIb3DQEBCwUAMBcxFTATBgNV +BAMMDGh0dHBzX3NlcnZlcjAeFw0xNzA4MTQxNDQ1MTFaFw0yMDA2MDMxNDQ1MTFa +MBcxFTATBgNVBAMMDGh0dHBzX3NlcnZlcjCCASIwDQYJKoZIhvcNAQEBBQADggEP +ADCCAQoCggEBAKGEGI6iKkxFFxwTnBoqsYHTX9C1Gl8gankqYl+eJ/Tl1HTnBaiM +c7oS1ptnapte1xK7JDS9E2ltlRQ7NyuIU4yTx26MPO4oyR6Cyr8sTg/I39Hklm1/ +fg3IYbu/vQrpSrtFXg9TOT7Oi6bUxwxQGTVCUY/8T1GwOVI+/1c5uVSmJw8O7CEL +A9/jOLqOmiuBeMMn+JS6H5t7HG/jRHt/Z0JaSIdiI2Is0tPBd1iRpYFxzcehOr0U +RlmFgOFZGNfT9O4g2YyvmVtS+vpRhEAE5LhV5TIelv97s4f1ZdDHNHvou5AV9ANK +CzQbFfDUobnxqr4/SeLYTvsLgi6h1DyJP6MCAwEAAaNQME4wHQYDVR0OBBYEFAfu +RxroDak/yro7MbRfDogKVDmBMB8GA1UdIwQYMBaAFAfuRxroDak/yro7MbRfDogK +VDmBMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAF5M/Gz6JDC1HoSm +6HFLBB9ul9TQQI3RhohwreCYyeZ866WrvqZfle+lxcgVburYCSyi5paFpvNK3DH2 +J0A2fDAMekZGcaJ7O5Zx0evTCwXoxDOhS+xO5IlGTXWCEKLeLkU27WJiLC9cTbFr +kfjL14IMnsioRzUz4a+aX5JllqnEccCDlHjSk1w5YvOvt6GC6Bvenouja2apPes/ +oJJpFwZVO0epqOQo1ndRGbt5NLv6YgZlvdFXWoKNKohzdfDV/RbW9BrbpyKSxFTm +usrmVcZTQpSf69zbnEVO8N3N6c1zNdETPON1ZGLW1O1MXWkQDZniH6LduXN/Oob7 +vYqvXlw= +-----END CERTIFICATE----- diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html new file mode 100644 index 00000000..e080cb39 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html @@ -0,0 +1,55 @@ 
+<!DOCTYPE html + PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US"> +<head> +<title>PoS(LATTICE 2013)001 + + + + + + + + + +

PoS(LATTICE 2013)001

+ +
+ +
+ Title + Heavy Flavour Physics Review +
+ + + +
+ Authors +
+A. El-Khadra
+
+ + +
+ Contribution + pdf +
+ + + + +
+ + + + + diff --git a/tests/functional/pos/fixtures/oai_harvested/pos_record.xml b/tests/functional/pos/fixtures/oai_harvested/pos_record.xml new file mode 100644 index 00000000..f65dfb9e --- /dev/null +++ b/tests/functional/pos/fixtures/oai_harvested/pos_record.xml @@ -0,0 +1,33 @@ + + +2015-01-29T13:44:13Z + +https://pos.sissa.it/cgi-bin/oai/oai-script-spires-extended.cgi + + + +
+ oai:pos.sissa.it:LATTICE 2013/001 + 2014-04-28 + conference:LATTICE 2013 + group:9 +
+ + + Heavy Flavour Physics Review + Aida El-KhadraINFN and Università di Firenze + M. T. MacDonaldU of PecsLattice Field Theory + 31st International Symposium on Lattice Field Theory LATTICE 2013; Plenary sessions + Sissa Medialab + 2014-03-19T21:09:30Z + Text + application/pdf + PoS(LATTICE 2013)001 + en + LATTICE 2013 (31st International Symposium on Lattice Field Theory LATTICE 2013) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike + + +
+
+
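The expected record in the next fixture follows from the OAI record above mainly by splitting the PoS identifier into publication info. Below is a minimal illustrative sketch of that parsing, assuming the re.split-based helpers added in this patch series (_get_journal_title, _get_journal_volume, _get_journal_artid); the identifier value is copied from the fixture, and the variable names are only for illustration:

    import re

    # PoS identifiers have the shape 'journal(volume)artid'.
    identifier = 'PoS(LATTICE 2013)001'
    parts = re.split('[()]', identifier)  # ['PoS', 'LATTICE 2013', '001']
    journal_title, journal_volume, journal_artid = parts[0], parts[1], parts[2]
    assert (journal_title, journal_volume, journal_artid) == ('PoS', 'LATTICE 2013', '001')

This is why the expected JSON below carries journal_title "PoS", journal_volume "LATTICE 2013" and artid "001" for this record.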
diff --git a/tests/functional/pos/fixtures/pos_records.json b/tests/functional/pos/fixtures/pos_records.json new file mode 100644 index 00000000..ee8b88af --- /dev/null +++ b/tests/functional/pos/fixtures/pos_records.json @@ -0,0 +1,57 @@ +[ + { + "acquisition_source": { + "source": "pos", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "license": [ + { + "url": "https://creativecommons.org/licenses/by-nc-sa/3.0", + "license": "CC-BY-NC-SA-3.0" + } + ], + "titles": [ + { + "source": "Sissa Medialab", + "title": "Heavy Flavour Physics Review" + } + ], + "authors": [ + { + "affiliations": [ + { + "value": "INFN and Universit\u00e0 di Firenze" + } + ], + "full_name": "El-Khadra, Aida" + }, + { + "affiliations": [ + { + "value": "U of Pecs" + } + ], + "full_name": "MacDonald, M.T." + } + ], + "publication_info": [ + { + "journal_volume": "LATTICE 2013", + "year": 2014, + "artid": "001", + "journal_title": "PoS" + } + ], + "document_type": [ + "conference paper" + ], + "imprints": [ + { + "date": "2014-03-19" + } + ], + "citeable": true + } +] diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py new file mode 100644 index 00000000..582575bb --- /dev/null +++ b/tests/functional/pos/test_pos.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for PoS spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.fixture(scope="function") +def set_up_oai_environment(): + package_location = get_test_suite_path( + 'pos', + 'fixtures', + 'oai_harvested', + 'pos_record.xml', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). 
+ sleep(10) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'source_file': 'file://' + package_location, + 'base_conference_paper_url': 'https://server.local/contribution?id=', + } + } + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'pos', + 'fixtures', + 'pos_records.json', + ), + ], + ids=[ + 'conference_paper_record_only', + ] +) +def test_pos_conference_paper_record( + set_up_oai_environment, + expected_results, +): + crawler = get_crawler_instance(set_up_oai_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=1, + crawler_instance=crawler, + project=set_up_oai_environment.get('CRAWLER_PROJECT'), + spider='pos', + settings={}, + **set_up_oai_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + +# TODO create test that receives conference paper record AND proceedings record. + + +# TODO create test that receives proceedings record ONLY. diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 94705248..6eb82940 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -66,7 +66,7 @@ def generated_record(scrape_pos_page_body): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - parsed_item = request.callback(response) + parsed_item = request.callback(response).next() parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record @@ -155,7 +155,7 @@ def test_pipeline_record(generated_record): 'acquisition_source': { 'datetime': '2017-08-10T16:03:59.091110', 'method': 'hepcrawl', - 'source': 'PoS', + 'source': 'pos', 'submission_number': 'scrapy_job' }, 'authors': [ From 6c6e4ba53a20f481111b9e529023aa6ed936c6db Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Mon, 21 Aug 2017 13:44:30 +0200 Subject: [PATCH 03/14] pos: add proceedings support Addresses #159 Signed-off-by: Spiros Delviniotis --- hepcrawl/spiders/pos_spider.py | 86 +++++++++--- .../pos/fixtures/https_server/conf/proxy.conf | 8 ++ .../fixtures/https_server/records/187.html | 125 ++++++++++++++++++ .../records/PoS(LATTICE 2013)001.html | 88 ++++++------ ...> pos_conference_proceedings_records.json} | 23 ++++ tests/functional/pos/test_pos.py | 7 +- 6 files changed, 279 insertions(+), 58 deletions(-) create mode 100644 tests/functional/pos/fixtures/https_server/records/187.html rename tests/functional/pos/fixtures/{pos_records.json => pos_conference_proceedings_records.json} (68%) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index ab5083ed..1f2404e4 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,7 +41,6 @@ class POSSpider(StatefulSpider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' - # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, @@ -73,8 +72,9 @@ def parse(self, response): # Probably all links lead to same place, so take first conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) request = Request(conference_paper_url, callback=self.scrape_conference_paper) - request.meta["url"] = response.url - request.meta["record"] = record.extract() + 
request.meta['url'] = response.url + request.meta['record'] = record.extract() + request.meta['identifier'] = identifier yield request def scrape_conference_paper(self, response): @@ -84,18 +84,48 @@ def scrape_conference_paper(self, response): response=response, ) - # # Yield request for Conference page - # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - # proceedings_identifier = proceedings_identifier.split('=')[1] - # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) - # yield Request(pos_url, callback=self.scrape_proceedings) + # Scrape proceedings record + pos_url = self._get_proceedings_url(response) + self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) + meta = { + 'identifier': response.meta.get('identifier'), + } + yield Request( + pos_url, + callback=self.scrape_proceedings, + meta=meta, + ) + + yield self.build_conference_paper_item(response) + + def scrape_proceedings(self, response): + node = Selector( + text=response.body, + type='html', + ) + node.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=node + ) + + record.add_value('collections', ['proceeding']) + record.add_value('title', self._get_proceedings_title(node=node)) + record.add_value('subtitle', self._get_proceedings_date_place(node=node)) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + self._get_journal_volume( + identifier=response.meta.get('identifier'), + ) + ) - return self.build_conference_paper_item(response) + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) - # def scrape_proceedings(self, response): - # # create proceedings record - # import pytest - # pytest.set_trace() + return parsed_item def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" @@ -148,6 +178,13 @@ def _get_conference_paper_pdf_url(self, response): conference_paper_pdf_url, ) + def _get_proceedings_url(self, response): + internal_url = response.selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_identifier = internal_url.split('/')[1] + return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + @staticmethod def _get_language(node): language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() @@ -169,7 +206,7 @@ def _get_journal_artid(identifier): def _get_ext_systems_number(node): return [ { - 'institute': 'PoS', + 'institute': 'pos', 'value': node.xpath('.//identifier/text()').extract_first() }, ] @@ -196,10 +233,27 @@ def _get_authors(node): # To be refactored ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append({'value': affiliation}) + auth_dict['affiliations'].append( + { + 'value': affiliation + } + ) # Todo probably to remove else: - auth_dict['affiliations'] = [{'value': affiliation}, ] + auth_dict['affiliations'] = [ + { + 'value': affiliation + }, + ] if auth_dict: authors.append(auth_dict) return authors + + @staticmethod + def _get_proceedings_title(node): + return node.xpath('//h1/text()').extract_first() + + @staticmethod + def _get_proceedings_date_place(node): + date_place = node.xpath("//div[@class='conference_date']/text()").extract() + return ''.join(date_place) diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf index 
f4235640..1591cbcd 100644 --- a/tests/functional/pos/fixtures/https_server/conf/proxy.conf +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -14,4 +14,12 @@ server { rewrite ^.*$ /$mid.html permanent; } } + + location ~ /cgi-bin/reader/conf.cgi { + if ($args ~* "^confid=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } } diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/https_server/records/187.html new file mode 100644 index 00000000..0d86221a --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/187.html @@ -0,0 +1,125 @@ + + + + 31st International Symposium on Lattice Field Theory LATTICE 2013 + + + + + Main Image + + + +

31st International Symposium on Lattice Field Theory LATTICE 2013

+ + + +
LATTICE 2013 - (other lattice conferences)
+
29 July – 3 August, 2013
Mainz, Germany
+
+

The annual lattice symposium brings together a global community of researchers
+ from theoretical particle physics and beyond, who employ numerical and
+ computational methods to study the properties of strongly interacting physical
+ systems, above all Quantum Chromodynamics (QCD), the theory describing the
+ interactions of quarks and gluons. Topics include studies of the spectrum and
+ structure of hadrons, lattice studies of matter under extreme conditions,
+ hadronic contributions to weak decay amplitudes, as well as recent
+ developments in simulation algorithms and computer hardware. The 2013
+ conference in Mainz was attended by over 500 participants from all over the
+ globe, making it the biggest in this series so far.
+

+

This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth
+ G. Wilson (June 8, 1936 - June 15, 2013).
+

+
+
conference main image
+
+ + + + + + + + + + + + + + + + + + +
Sessions
Preface
Plenary sessions
Algorithms and Machines
Applications beyond QCD
Physics beyond the Standard Model
Chiral Symmetry
Non-zero Temperature and Density
Hadron Spectroscopy and Interactions
Hadron Structure
Standard Model Parameters and Renormalization
Theoretical Developments
Vacuum Structure and Confinement
Weak Decays and Matrix Elements
Special Session: Coding Efforts
Posters
+ + + + + + + + + + + + + + + + + + + + +
Preface
Foreword
+ PoS(LATTICE 2013)503 + pdf + H. Wittig +
Ken Wilson Obituary
+ PoS(LATTICE 2013)504 + pdf + A. Kronfeld +
Plenary sessions
Heavy Flavour Physics Review
+ PoS(LATTICE 2013)001 + pdf + A. El-Khadra +
New Developments for Lattice Field Theory at Non-Zero Density
+ PoS(LATTICE 2013)002 + pdf + C. Gattringer +
+
+ + + + + + + + + diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html index e080cb39..5ed0c148 100644 --- a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html +++ b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html @@ -1,55 +1,65 @@ - - + + PoS(LATTICE 2013)001 + + + + + + + + - +Main Image - - +

PoS(LATTICE 2013)001

+ + + -
- -
- Title - Heavy Flavour Physics Review -
- - - -
- Authors -
-A. El-Khadra
-
- - -
- Contribution - pdf -
+
+

Heavy Flavour Physics Review

+

A. El-Khadra

+

in 31st International Symposium on Lattice Field Theory LATTICE 2013

+

Contribution: pdf

- -
- - + + + + + + + - + \ No newline at end of file diff --git a/tests/functional/pos/fixtures/pos_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json similarity index 68% rename from tests/functional/pos/fixtures/pos_records.json rename to tests/functional/pos/fixtures/pos_conference_proceedings_records.json index ee8b88af..3605ad13 100644 --- a/tests/functional/pos/fixtures/pos_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -1,5 +1,28 @@ [ { + "publication_info": [ + { + "journal_volume": "LATTICE 2013", + "journal_title": "PoS" + } + ], + "document_type": [ + "article" + ], + "titles": [ + { + "source": "pos", + "title": "31st International Symposium on Lattice Field Theory LATTICE 2013" + } + ], + "acquisition_source": { + "source": "pos", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + } + }, + { "acquisition_source": { "source": "pos", "method": "hepcrawl", diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 582575bb..c122b68e 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -50,6 +50,7 @@ def set_up_oai_environment(): 'CRAWLER_ARGUMENTS': { 'source_file': 'file://' + package_location, 'base_conference_paper_url': 'https://server.local/contribution?id=', + 'base_proceedings_url': 'https://server.local/cgi-bin/reader/conf.cgi?confid=', } } @@ -60,14 +61,14 @@ def set_up_oai_environment(): expected_json_results_from_file( 'pos', 'fixtures', - 'pos_records.json', + 'pos_conference_proceedings_records.json', ), ], ids=[ - 'conference_paper_record_only', + 'smoke', ] ) -def test_pos_conference_paper_record( +def test_pos_conference_paper_record_and_proceedings_record( set_up_oai_environment, expected_results, ): From 9f5e750ec6074cb9364955916680e09966c931a6 Mon Sep 17 00:00:00 2001 From: David Caro Date: Mon, 21 Aug 2017 20:39:55 +0200 Subject: [PATCH 04/14] pos: refactor Signed-off-by: David Caro --- hepcrawl/spiders/pos_spider.py | 342 +++++++++++++++++++++++---------- 1 file changed, 241 insertions(+), 101 deletions(-) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 1f2404e4..283cfcf8 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -28,30 +28,60 @@ ) +DEFAULT_BASE_URL = 'https://pos.sissa.it' +DEFAULT_BASE_CONFERENCE_PAPER_URL = ( + DEFAULT_BASE_URL + '/contribution?id=' +) +DEFAULT_BASE_PROCEEDINGS_URL = ( + DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid=' +) + + +class PoSExtractionException(Exception): + pass + + class POSSpider(StatefulSpider): """POS/Sissa crawler. - Extracts from metadata: - todo:: be added... + From PoS we create two types of records, a conference paper record, and a + conference proceedings record. + + The bulk of the records comes from oaiharvest, and this spider crawls the + files generated by it. + + For the conference paper record we have to scrape also the html page of the + record on the PoS website to get the pdf link. (see + `DEFAULT_BASE_CONFERENCE_PAPER_URL`) + + Then, from that same page, we get the internal conference id. + + With that conference id, then we scrape the conference proceedings page, + and extract the information to create the proceedings record. 
(see + `DEFAULT_BASE_PROCEEDINGS_URL`) + + To do that and because each needs the information of the previous, the + spider must use the callbacks system provided by scrapy through the + :ref:`scrapy.html.response.Response` callback parameter, and chain the + parser functions. + + The deduplication of the conference proceedings papers is left for the + `HepcrawlCrawlOnceMiddleware` middleware. Example: :: - $ scrapy crawl PoS \\ - -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml + -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml" """ name = 'pos' def __init__( self, source_file=None, - base_conference_paper_url='https://pos.sissa.it/contribution?id=', - base_proceedings_url='https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=', - # TODO to be changed without question in the url - # TODO make valid CA certificate + base_conference_paper_url=DEFAULT_BASE_CONFERENCE_PAPER_URL, + base_proceedings_url=DEFAULT_BASE_PROCEEDINGS_URL, **kwargs ): - """Construct POS spider.""" super(POSSpider, self).__init__(**kwargs) self.source_file = source_file self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url @@ -61,105 +91,157 @@ def start_requests(self): yield Request(self.source_file) def parse(self, response): - """Get PDF information.""" self.log('Got record from: {response.url}'.format(**vars())) - node = response.selector - node.remove_namespaces() - for record in node.xpath('.//record'): - identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first() - if identifier: - # Probably all links lead to same place, so take first - conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) - request = Request(conference_paper_url, callback=self.scrape_conference_paper) - request.meta['url'] = response.url - request.meta['record'] = record.extract() - request.meta['identifier'] = identifier - yield request - - def scrape_conference_paper(self, response): - """Parse a page for PDF link.""" - response.meta["pos_url"] = response.url - response.meta["conference_paper_pdf_url"] = self._get_conference_paper_pdf_url( - response=response, - ) + response.selector.remove_namespaces() + records = response.selector.xpath('.//record') + for record in records: + yield self.get_conference_paper_page_request(raw_xml=record) - # Scrape proceedings record - pos_url = self._get_proceedings_url(response) - self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) - meta = { - 'identifier': response.meta.get('identifier'), - } - yield Request( - pos_url, - callback=self.scrape_proceedings, - meta=meta, + def get_conference_paper_page_request(self, raw_xml, meta=None): + """Gets the conference paper html page, for the pdf link for the + conference paper, and later the internal conference id. 
+ """ + meta = meta or {} + + identifier = raw_xml.xpath( + './/metadata/pex-dc/identifier/text()' + ).extract_first() + conference_paper_url = "{0}{1}".format( + self.base_conference_paper_url, + identifier, + ) + meta['xml_record'] = raw_xml + + # the meta parameter will be passed over to the callback as a property + # in the response parameter + return Request( + url=conference_paper_url, + callback=self.parse_conference_paper, + meta=meta ) - yield self.build_conference_paper_item(response) + def parse_conference_paper(self, response): + xml_record = response.meta.get('xml_record') + conference_paper_url = response.url + conference_paper_pdf_url = self._get_conference_paper_pdf_url( + conference_paper_page_html=response.body, + ) - def scrape_proceedings(self, response): - node = Selector( - text=response.body, - type='html', + parsed_conference_paper = self.build_conference_paper_item( + xml_record=xml_record, + conference_paper_url=conference_paper_url, + conference_paper_pdf_url=conference_paper_pdf_url, ) - node.remove_namespaces() - record = HEPLoader( - item=HEPRecord(), - selector=node + yield parsed_conference_paper + + # prepare next callback step + response.meta['html_record'] = response.body + yield self.get_conference_proceendings_page_request( + meta=response.meta, ) - record.add_value('collections', ['proceeding']) - record.add_value('title', self._get_proceedings_title(node=node)) - record.add_value('subtitle', self._get_proceedings_date_place(node=node)) - record.add_value('journal_title', 'PoS') - record.add_value( - 'journal_volume', - self._get_journal_volume( - identifier=response.meta.get('identifier'), + def get_conference_proceendings_page_request(self, meta): + """Gets the conference proceedings page, using the indernal conference + id from the record html page retrieved before. 
+ """ + if not meta.get('html_record'): + raise PoSExtractionException( + 'PoS conference paper page was empty, current meta:\n%s' % meta ) + + proceedings_page_url = self._get_proceedings_page_url( + page_html=meta.get('html_record'), ) - parsed_item = ParsedItem( - record=record.load_item(), - record_format='hepcrawl', + page_selector = Selector( + text=meta.get('html_record'), + type='html', ) + pos_id = page_selector.xpath( + ".//metadata/pex-dc/identifier/text()" + ).extract_first() + meta['pos_id'] = pos_id - return parsed_item + return Request( + url=proceedings_page_url, + meta=meta, + callback=self.parse_conference_proceedings, + ) + + def parse_conference_proceedings(self, request): + parsed_conference_proceedings = self.build_conference_proceedings_item( + proceedings_page_html=request.body, + pos_id=request.meta['pos_id'], + ) + yield parsed_conference_proceedings + + def _get_proceedings_page_url(self, page_html): + page_selector = Selector( + text=page_html, + type="html" + ) + internal_url = page_selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_internal_id = internal_url.split('/')[1] + return '{0}{1}'.format( + self.base_proceedings_url, + proceedings_internal_id, + ) - def build_conference_paper_item(self, response): - """Parse an PoS XML exported file into a HEP record.""" - meta = response.meta - xml_record = meta.get('record') - node = Selector( + def build_conference_paper_item( + self, + xml_record, + conference_paper_url, + conference_paper_pdf_url, + ): + selector = Selector( text=xml_record, type="xml" ) - node.remove_namespaces() + selector.remove_namespaces() record = HEPLoader( item=HEPRecord(), - selector=node + selector=selector ) - license_text = node.xpath('.//metadata/pex-dc/rights/text()').extract_first() + license_text = selector.xpath( + './/metadata/pex-dc/rights/text()' + ).extract_first() record.add_value('license', get_licenses(license_text=license_text)) - date, year = self._get_date(node=node) + date, year = self._get_date(selector=selector) record.add_value('date_published', date) record.add_value('journal_year', year) - identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() - record.add_value('journal_title', self._get_journal_title(identifier=identifier)) - record.add_value('journal_volume', self._get_journal_volume(identifier=identifier)) - record.add_value('journal_artid', self._get_journal_artid(identifier=identifier)) + identifier = selector.xpath( + ".//metadata/pex-dc/identifier/text()" + ).extract_first() + record.add_value( + 'journal_title', + self._get_journal_title(identifier=identifier), + ) + record.add_value( + 'journal_volume', + self._get_journal_volume(identifier=identifier), + ) + record.add_value( + 'journal_artid', + self._get_journal_artid(identifier=identifier), + ) record.add_xpath('title', '//metadata/pex-dc/title/text()') record.add_xpath('source', '//metadata/pex-dc/publisher/text()') - record.add_value('external_system_numbers', self._get_ext_systems_number(node=node)) - record.add_value('language', self._get_language(node=node)) - record.add_value('authors', self._get_authors(node=node)) + record.add_value( + 'external_system_numbers', + self._get_ext_systems_number(selector=selector), + ) + record.add_value('language', self._get_language(selector=selector)) + record.add_value('authors', self._get_authors(selector=selector)) record.add_value('collections', ['conferencepaper']) - record.add_value('urls', meta.get('pos_url')) + 
record.add_value('urls', conference_paper_pdf_url) + record.add_value('_fulltext_url', self._get_conference_paper_pdf_url()) parsed_item = ParsedItem( record=record.load_item(), @@ -168,14 +250,63 @@ def build_conference_paper_item(self, response): return parsed_item - def _get_conference_paper_pdf_url(self, response): - conference_paper_pdf_url = response.selector.xpath( + def build_conference_proceedings_item( + self, + proceedings_page_html, + pos_id, + ): + selector = Selector( + text=proceedings_page_html, + type='html', + ) + selector.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=selector + ) + + record.add_value('collections', ['proceeding']) + record.add_value( + 'title', + self._get_proceedings_title(selector=selector), + ) + record.add_value( + 'subtitle', + self._get_proceedings_date_place(selector=selector), + ) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + self._get_journal_volume(pos_id=pos_id), + ) + + parsed_proceeding = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_proceeding + + def _get_conference_paper_pdf_url(self, conference_paper_page_html): + selector = Selector( + text=conference_paper_page_html, + type='html', + ) + conference_paper_pdf_relative_url = selector.xpath( "//a[contains(text(),'pdf')]/@href", ).extract_first() + if not conference_paper_pdf_relative_url: + raise PoSExtractionException( + ( + 'Unable to get the conference paper pdf url from the html:' + '\n%s' + ) % conference_paper_page_html + ) + return urljoin( - self.BASE_CONFERENCE_PAPER_URL, - conference_paper_pdf_url, + self.base_conference_paper_url, + conference_paper_pdf_relative_url, ) def _get_proceedings_url(self, response): @@ -186,44 +317,50 @@ def _get_proceedings_url(self, response): return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) @staticmethod - def _get_language(node): - language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() + def _get_language(selector): + language = selector.xpath( + ".//metadata/pex-dc/language/text()" + ).extract_first() return language if language != 'en' else None @staticmethod - def _get_journal_title(identifier): - return re.split('[()]', identifier)[0] + def _get_journal_title(pos_id): + return re.split('[()]', pos_id)[0] @staticmethod - def _get_journal_volume(identifier): - return re.split('[()]', identifier)[1] + def _get_journal_volume(pos_id): + return re.split('[()]', pos_id)[1] @staticmethod - def _get_journal_artid(identifier): - return re.split('[()]', identifier)[2] + def _get_journal_artid(pos_id): + return re.split('[()]', pos_id)[2] @staticmethod - def _get_ext_systems_number(node): + def _get_ext_systems_number(selector): return [ { 'institute': 'pos', - 'value': node.xpath('.//identifier/text()').extract_first() + 'value': selector.xpath( + './/identifier/text()' + ).extract_first() }, ] @staticmethod - def _get_date(node): - full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first() + def _get_date(selector): + full_date = selector.xpath( + ".//metadata/pex-dc/date/text()" + ).extract_first() date = create_valid_date(full_date) year = int(date[0:4]) return date, year @staticmethod - def _get_authors(node): # To be refactored + def _get_authors(selector): """Get article authors.""" authors = [] - creators = node.xpath('.//metadata/pex-dc/creator') + creators = selector.xpath('.//metadata/pex-dc/creator') for creator in creators: auth_dict = {} author = 
Selector(text=creator.extract()) @@ -231,14 +368,15 @@ def _get_authors(node): # To be refactored author.xpath('.//name//text()').extract(), default='', ) - for affiliation in author.xpath('.//affiliation//text()').extract(): + for affiliation in author.xpath( + './/affiliation//text()' + ).extract(): if 'affiliations' in auth_dict: auth_dict['affiliations'].append( { 'value': affiliation } ) - # Todo probably to remove else: auth_dict['affiliations'] = [ { @@ -250,10 +388,12 @@ def _get_authors(node): # To be refactored return authors @staticmethod - def _get_proceedings_title(node): - return node.xpath('//h1/text()').extract_first() + def _get_proceedings_title(selector): + return selector.xpath('//h1/text()').extract_first() @staticmethod - def _get_proceedings_date_place(node): - date_place = node.xpath("//div[@class='conference_date']/text()").extract() + def _get_proceedings_date_place(selector): + date_place = selector.xpath( + "//div[@class='conference_date']/text()" + ).extract() return ''.join(date_place) From a98cfb2fb46edf611df1afb9e36b95cb4a784696 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Mon, 21 Aug 2017 13:44:30 +0200 Subject: [PATCH 05/14] pos: add proceedings support Addresses #159 Signed-off-by: Spiros Delviniotis --- .../records/PoS(LATTICE 2013)001.html | 119 +++++++++--------- .../pos_conference_proceedings_records.json | 2 +- tests/functional/pos/test_pos.py | 10 +- 3 files changed, 62 insertions(+), 69 deletions(-) diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html index 5ed0c148..64ad7a6f 100644 --- a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html +++ b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html @@ -1,65 +1,58 @@ - -PoS(LATTICE 2013)001 - - - - - - - - - - - - -Main Image - - - -

PoS(LATTICE 2013)001

- - - - -
-

Heavy Flavour Physics Review

-

A. El-Khadra

-

in 31st International Symposium on Lattice Field Theory LATTICE 2013

-

Contribution: pdf

- - - -
- - - - - - - - - - \ No newline at end of file + + PoS(LATTICE 2013)001 + + + + + + + + + + + Main Image + + + +

PoS(LATTICE 2013)001

+ + + +
+

Heavy Flavour Physics Review

+

A. El-Khadra

+

in 31st International Symposium on Lattice Field Theory LATTICE 2013

+

Contribution: pdf

+
+ + + + + + + + + diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json index 3605ad13..d2ebb12a 100644 --- a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -1,5 +1,5 @@ [ - { + { "publication_info": [ { "journal_volume": "LATTICE 2013", diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index c122b68e..66f24831 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -32,7 +32,7 @@ def override_generated_fields(record): @pytest.fixture(scope="function") -def set_up_oai_environment(): +def set_up_environment(): package_location = get_test_suite_path( 'pos', 'fixtures', @@ -69,10 +69,10 @@ def set_up_oai_environment(): ] ) def test_pos_conference_paper_record_and_proceedings_record( - set_up_oai_environment, + set_up_environment, expected_results, ): - crawler = get_crawler_instance(set_up_oai_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL')) results = CeleryMonitor.do_crawl( app=celery_app, @@ -80,10 +80,10 @@ def test_pos_conference_paper_record_and_proceedings_record( monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_oai_environment.get('CRAWLER_PROJECT'), + project=set_up_environment.get('CRAWLER_PROJECT'), spider='pos', settings={}, - **set_up_oai_environment.get('CRAWLER_ARGUMENTS') + **set_up_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] From 21bdc5047b4aea668a4607b5ddb55dcdf74c07c2 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Tue, 22 Aug 2017 13:13:25 +0200 Subject: [PATCH 06/14] pos: fix spider Signed-off-by: Spiros Delviniotis --- hepcrawl/items.py | 4 ++ hepcrawl/spiders/pos_spider.py | 60 ++++++++++++------- hepcrawl/tohep.py | 6 ++ .../pos_conference_proceedings_records.json | 10 ++++ tests/functional/pos/test_pos.py | 24 +++++--- tests/unit/test_pos.py | 56 ++++++++++------- 6 files changed, 107 insertions(+), 53 deletions(-) diff --git a/hepcrawl/items.py b/hepcrawl/items.py index dab67dda..8bdb5478 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -318,3 +318,7 @@ class HEPRecord(scrapy.Item): thesis_supervisor = scrapy.Field() language = scrapy.Field() + + _fft = scrapy.Field() + """Used to communicate with legacy about files (to be) attached to the + record.""" diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 283cfcf8..b3574a8e 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -94,24 +94,26 @@ def parse(self, response): self.log('Got record from: {response.url}'.format(**vars())) response.selector.remove_namespaces() - records = response.selector.xpath('.//record') - for record in records: - yield self.get_conference_paper_page_request(raw_xml=record) + record_xml_selectors = response.selector.xpath('.//record') + for record_xml_selector in record_xml_selectors: + yield self.get_conference_paper_page_request( + xml_selector=record_xml_selector, + ) - def get_conference_paper_page_request(self, raw_xml, meta=None): + def get_conference_paper_page_request(self, xml_selector, meta=None): """Gets the conference paper html page, for the pdf link for the conference paper, and later the internal conference id. 
""" meta = meta or {} - identifier = raw_xml.xpath( + identifier = xml_selector.xpath( './/metadata/pex-dc/identifier/text()' ).extract_first() conference_paper_url = "{0}{1}".format( self.base_conference_paper_url, identifier, ) - meta['xml_record'] = raw_xml + meta['xml_record'] = xml_selector.extract() # the meta parameter will be passed over to the callback as a property # in the response parameter @@ -137,11 +139,11 @@ def parse_conference_paper(self, response): # prepare next callback step response.meta['html_record'] = response.body - yield self.get_conference_proceendings_page_request( + yield self.get_conference_proceedings_page_request( meta=response.meta, ) - def get_conference_proceendings_page_request(self, meta): + def get_conference_proceedings_page_request(self, meta): """Gets the conference proceedings page, using the indernal conference id from the record html page retrieved before. """ @@ -155,9 +157,10 @@ def get_conference_proceendings_page_request(self, meta): ) page_selector = Selector( - text=meta.get('html_record'), - type='html', + text=meta.get('xml_record'), + type='xml', ) + page_selector.remove_namespaces() pos_id = page_selector.xpath( ".//metadata/pex-dc/identifier/text()" ).extract_first() @@ -220,15 +223,15 @@ def build_conference_paper_item( ).extract_first() record.add_value( 'journal_title', - self._get_journal_title(identifier=identifier), + self._get_journal_title(pos_ext_identifier=identifier), ) record.add_value( 'journal_volume', - self._get_journal_volume(identifier=identifier), + self._get_journal_volume(pos_ext_identifier=identifier), ) record.add_value( 'journal_artid', - self._get_journal_artid(identifier=identifier), + self._get_journal_artid(pos_ext_identifier=identifier), ) record.add_xpath('title', '//metadata/pex-dc/title/text()') @@ -240,8 +243,13 @@ def build_conference_paper_item( record.add_value('language', self._get_language(selector=selector)) record.add_value('authors', self._get_authors(selector=selector)) record.add_value('collections', ['conferencepaper']) - record.add_value('urls', conference_paper_pdf_url) - record.add_value('_fulltext_url', self._get_conference_paper_pdf_url()) + record.add_value('urls', [conference_paper_url]) + record.add_value( + '_fft', + self._set_fft( + path=conference_paper_pdf_url, + ), + ) parsed_item = ParsedItem( record=record.load_item(), @@ -277,7 +285,7 @@ def build_conference_proceedings_item( record.add_value('journal_title', 'PoS') record.add_value( 'journal_volume', - self._get_journal_volume(pos_id=pos_id), + self._get_journal_volume(pos_ext_identifier=pos_id), ) parsed_proceeding = ParsedItem( @@ -316,6 +324,14 @@ def _get_proceedings_url(self, response): proceedings_identifier = internal_url.split('/')[1] return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + @staticmethod + def _set_fft(path): + return [ + { + 'path': path, + }, + ] + @staticmethod def _get_language(selector): language = selector.xpath( @@ -324,16 +340,16 @@ def _get_language(selector): return language if language != 'en' else None @staticmethod - def _get_journal_title(pos_id): - return re.split('[()]', pos_id)[0] + def _get_journal_title(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[0] @staticmethod - def _get_journal_volume(pos_id): - return re.split('[()]', pos_id)[1] + def _get_journal_volume(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[1] @staticmethod - def _get_journal_artid(pos_id): - return re.split('[()]', pos_id)[2] + def 
_get_journal_artid(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[2] @staticmethod def _get_ext_systems_number(selector): diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 727600f8..b333b1b7 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -384,6 +384,12 @@ def _filter_affiliation(affiliations): source=report_number.get('source') ) + for url in crawler_record.get('urls', []): + builder.add_url(url=url.get('value')) + + if crawler_record.get('_fft'): + builder.record['_fft'] = crawler_record.get('_fft') + builder.validate_record() return builder.record diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json index d2ebb12a..317dd0c9 100644 --- a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -41,6 +41,16 @@ "title": "Heavy Flavour Physics Review" } ], + "_fft": [ + { + "path": "https://server.local/187/001/pdf" + } + ], + "urls": [ + { + "value": "https://server.local/PoS(LATTICE%202013)001.html" + } + ], "authors": [ { "affiliations": [ diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 66f24831..423db2b0 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -32,7 +32,13 @@ def override_generated_fields(record): @pytest.fixture(scope="function") -def set_up_environment(): +def wait_until_services_are_up(seconds=10): + # The test must wait until the docker environment is up (takes about 10 seconds). + sleep(seconds) + + +@pytest.fixture(scope="function") +def configuration(): package_location = get_test_suite_path( 'pos', 'fixtures', @@ -41,9 +47,6 @@ def set_up_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). - sleep(10) - yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', @@ -69,10 +72,11 @@ def set_up_environment(): ] ) def test_pos_conference_paper_record_and_proceedings_record( - set_up_environment, - expected_results, + configuration, + wait_until_services_are_up, + expected_results, ): - crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL')) results = CeleryMonitor.do_crawl( app=celery_app, @@ -80,10 +84,10 @@ def test_pos_conference_paper_record_and_proceedings_record( monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_environment.get('CRAWLER_PROJECT'), + project=configuration.get('CRAWLER_PROJECT'), spider='pos', settings={}, - **set_up_environment.get('CRAWLER_ARGUMENTS') + **configuration.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] @@ -93,6 +97,8 @@ def test_pos_conference_paper_record_and_proceedings_record( # TODO create test that receives conference paper record AND proceedings record. +# 'Crawl-once' plug-in needed. # TODO create test that receives proceedings record ONLY. +# 'Crawl-once' plug-in needed. 
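For reference, the _get_journal_title, _get_journal_volume and _get_journal_artid helpers renamed above all split the same PoS external identifier with re.split('[()]', ...). A minimal illustrative sketch, not part of this patch, using the PoS(LATTICE 2013)001 identifier from the fixtures:

    import re

    def split_pos_identifier(pos_ext_identifier):
        # Mirrors _get_journal_title/_volume/_artid: split on '(' and ')'.
        parts = re.split('[()]', pos_ext_identifier)
        return {
            'journal_title': parts[0],   # 'PoS'
            'journal_volume': parts[1],  # 'LATTICE 2013'
            'journal_artid': parts[2],   # '001'
        }

    assert split_pos_identifier('PoS(LATTICE 2013)001') == {
        'journal_title': 'PoS',
        'journal_volume': 'LATTICE 2013',
        'journal_artid': '001',
    }

The unit and functional fixtures below assert exactly these publication_info values.
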
diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 6eb82940..689be197 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -32,7 +32,7 @@ def override_generated_fields(record): @pytest.fixture(scope='session') -def scrape_pos_page_body(): +def scrape_pos_conference_paper_page_body(): return pkg_resources.resource_string( __name__, os.path.join( @@ -44,7 +44,7 @@ def scrape_pos_page_body(): @pytest.fixture(scope='session') -def generated_record(scrape_pos_page_body): +def generated_conference_paper(scrape_pos_conference_paper_page_body): """Return results generator from the PoS spider.""" # environmental variables needed for the pipelines payload os.environ['SCRAPY_JOB'] = 'scrapy_job' @@ -54,7 +54,9 @@ def generated_record(scrape_pos_page_body): crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) request = spider.parse( - fake_response_from_file('pos/sample_pos_record.xml') + fake_response_from_file( + file_name=str('pos/sample_pos_record.xml'), + ) ).next() response = HtmlResponse( url=request.url, @@ -75,7 +77,7 @@ def generated_record(scrape_pos_page_body): clean_dir() -def test_titles(generated_record): +def test_titles(generated_conference_paper): """Test extracting title.""" expected_titles = [ { @@ -84,34 +86,34 @@ def test_titles(generated_record): } ] - assert 'titles' in generated_record - assert generated_record['titles'] == expected_titles + assert 'titles' in generated_conference_paper + assert generated_conference_paper['titles'] == expected_titles @pytest.mark.xfail(reason='License texts are not normalized and converted to URLs') -def test_license(generated_record): +def test_license(generated_conference_paper): """Test extracting license information.""" expected_license = [{ - 'license': 'CC BY-NC-SA 3.0', + 'license': 'CC-BY-NC-SA-3.0', 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0', }] - assert generated_record['license'] == expected_license + assert generated_conference_paper['license'] == expected_license -def test_collections(generated_record): +def test_collections(generated_conference_paper): """Test extracting collections.""" expected_document_type = ['conference paper'] - assert generated_record.get('citeable') - assert generated_record.get('document_type') == expected_document_type + assert generated_conference_paper.get('citeable') + assert generated_conference_paper.get('document_type') == expected_document_type -def test_language(generated_record): +def test_language(generated_conference_paper): """Test extracting language.""" - assert 'language' not in generated_record + assert 'language' not in generated_conference_paper -def test_publication_info(generated_record): +def test_publication_info(generated_conference_paper): """Test extracting dois.""" expected_pub_info = [{ 'artid': '001', @@ -120,13 +122,13 @@ def test_publication_info(generated_record): 'year': 2014, }] - assert 'publication_info' in generated_record + assert 'publication_info' in generated_conference_paper - pub_info = generated_record['publication_info'] + pub_info = generated_conference_paper['publication_info'] assert pub_info == expected_pub_info -def test_authors(generated_record): +def test_authors(generated_conference_paper): """Test authors.""" expected_authors = [ { @@ -139,9 +141,9 @@ def test_authors(generated_record): } ] - assert 'authors' in generated_record + assert 'authors' in generated_conference_paper - result_authors = generated_record['authors'] + result_authors = 
generated_conference_paper['authors'] assert len(result_authors) == len(expected_authors) @@ -150,7 +152,7 @@ def test_authors(generated_record): assert author == expected_author -def test_pipeline_record(generated_record): +def test_pipeline_conference_paper(generated_conference_paper): expected = { 'acquisition_source': { 'datetime': '2017-08-10T16:03:59.091110', @@ -204,7 +206,17 @@ def test_pipeline_record(generated_record): 'source': u'Sissa Medialab', 'title': u'Heavy Flavour Physics Review' } + ], + '_fft': [ + { + 'path': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf' + } + ], + 'urls': [ + { + 'value': 'https://pos.sissa.it/contribution?id=PoS%28LATTICE+2013%29001' + } ] } - assert override_generated_fields(generated_record) == expected + assert override_generated_fields(generated_conference_paper) == expected From fcbd1d6d96867ff390ec7e09eb5f8387835006b4 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 21:01:38 +0200 Subject: [PATCH 07/14] docker: added proper health dependencies Signed-off-by: David Caro --- docker-compose.test.yml | 61 +++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/docker-compose.test.yml b/docker-compose.test.yml index a1e93998..9a1df2e0 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -33,22 +33,27 @@ services: functional_wsp: <<: *service_base command: py.test -vv tests/functional/wsp - links: - - scrapyd - - ftp_server + depends_on: + scrapyd: + condition: service_healthy + ftp_server: + condition: service_healthy functional_desy: <<: *service_base command: py.test -vv tests/functional/desy - links: - - scrapyd - - ftp_server + depends_on: + scrapyd: + condition: service_healthy + ftp_server: + condition: service_healthy functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy functional_cds: <<: *service_base @@ -59,9 +64,11 @@ services: functional_pos: <<: *service_base command: py.test -vv tests/functional/pos - links: - - scrapyd - - server.local + depends_on: + scrapyd: + condition: service_healthy + http-server.local: + condition: service_healthy unit: <<: *service_base @@ -71,14 +78,16 @@ services: celery: <<: *service_base command: celery worker --events --app hepcrawl.testlib.tasks --loglevel=debug - links: - - rabbitmq + depends_on: + rabbitmq: + condition: service_healthy scrapyd: <<: *service_base command: bash -c "rm -f twistd.pid && exec scrapyd" - links: - - celery + depends_on: + celery: + condition: service_started healthcheck: timeout: 5s interval: 5s @@ -90,8 +99,9 @@ services: scrapyd-deploy: <<: *service_base command: bash -c "scrapyd-deploy" - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy ftp_server: image: stilliard/pure-ftpd:hardened @@ -103,7 +113,7 @@ services: - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd - server.local: + http-server.local: image: nginx:stable-alpine volumes: - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf @@ -111,6 +121,21 @@ services: - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/ ports: - 443:443 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl https://localhost:443/" rabbitmq: image: rabbitmq + healthcheck: + 
timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD" + - "rabbitmqctl" + - "status" From 38b60d209437b68f3932c4d9fd2bff1782601f7e Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 21:02:07 +0200 Subject: [PATCH 08/14] middleware: added some extra debug logs Signed-off-by: David Caro --- hepcrawl/middlewares.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 99551e93..ecd518f4 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -116,10 +116,23 @@ def _has_to_be_crawled(self, request, spider): request_db_key = self._get_key(request) if request_db_key not in self.db: + LOGGER.debug( + 'Crawl-Once: key %s for request %s not found in the db, ' + 'should be crawled.' % (request_db_key, request) + ) return True new_file_timestamp = self._get_timestamp(request, spider) old_file_timestamp = self.db.get(key=request_db_key) + LOGGER.debug( + 'Crawl-Once: key %s for request %s found in the db, ' + 'considering timestamps new(%s) and old(%s).' % ( + request_db_key, + request, + new_file_timestamp, + old_file_timestamp, + ) + ) return new_file_timestamp > old_file_timestamp def _get_key(self, request): From d9c64b1d64a28758e97c22c5ab46298479f87919 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 20 Sep 2017 21:02:52 +0200 Subject: [PATCH 09/14] pos: fix title and test cleanups Signed-off-by: David Caro --- hepcrawl/spiders/pos_spider.py | 4 +- hepcrawl/tohep.py | 1 + .../pos_conference_proceedings_records.json | 9 +-- tests/functional/pos/test_pos.py | 66 +++++++++++-------- 4 files changed, 47 insertions(+), 33 deletions(-) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index b3574a8e..49d1a830 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -273,7 +273,7 @@ def build_conference_proceedings_item( selector=selector ) - record.add_value('collections', ['proceeding']) + record.add_value('collections', ['proceedings']) record.add_value( 'title', self._get_proceedings_title(selector=selector), @@ -405,7 +405,7 @@ def _get_authors(selector): @staticmethod def _get_proceedings_title(selector): - return selector.xpath('//h1/text()').extract_first() + return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first() @staticmethod def _get_proceedings_date_place(selector): diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index b333b1b7..2bc3a783 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -249,6 +249,7 @@ def _filter_affiliation(affiliations): for title in crawler_record.get('titles', []): builder.add_title( title=title.get('title'), + subtitle=title.get('subtitle'), source=title.get('source') ) diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json index 317dd0c9..ae21d507 100644 --- a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -7,12 +7,13 @@ } ], "document_type": [ - "article" + "proceedings" ], "titles": [ { "source": "pos", - "title": "31st International Symposium on Lattice Field Theory LATTICE 2013" + "title": "Proceedings, 31st International Symposium on Lattice Field Theory LATTICE 2013", + "subtitle": "1-3 August 2002, Heidelberg, Germany" } ], "acquisition_source": { @@ -43,12 +44,12 @@ ], "_fft": [ { - "path": "https://server.local/187/001/pdf" + "path": "https://http-server.local/187/001/pdf" } ], 
"urls": [ { - "value": "https://server.local/PoS(LATTICE%202013)001.html" + "value": "https://http-server.local/PoS(LATTICE%202013)001.html" } ], "authors": [ diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 423db2b0..a915516b 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -11,34 +11,38 @@ from __future__ import absolute_import, division, print_function +import os import pytest -from time import sleep - from hepcrawl.testlib.celery_monitor import CeleryMonitor from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) from hepcrawl.testlib.tasks import app as celery_app from hepcrawl.testlib.utils import get_crawler_instance +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + yield + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) + + def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record -@pytest.fixture(scope="function") -def wait_until_services_are_up(seconds=10): - # The test must wait until the docker environment is up (takes about 10 seconds). - sleep(seconds) - - -@pytest.fixture(scope="function") -def configuration(): +def get_configuration(): package_location = get_test_suite_path( 'pos', 'fixtures', @@ -47,24 +51,31 @@ def configuration(): test_suite='functional', ) - yield { + return { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { 'source_file': 'file://' + package_location, - 'base_conference_paper_url': 'https://server.local/contribution?id=', - 'base_proceedings_url': 'https://server.local/cgi-bin/reader/conf.cgi?confid=', + 'base_conference_paper_url': ( + 'https://http-server.local/contribution?id=' + ), + 'base_proceedings_url': ( + 'https://http-server.local/cgi-bin/reader/conf.cgi?confid=' + ), } } @pytest.mark.parametrize( - 'expected_results', + 'expected_results, config', [ - expected_json_results_from_file( - 'pos', - 'fixtures', - 'pos_conference_proceedings_records.json', + ( + expected_json_results_from_file( + 'pos', + 'fixtures', + 'pos_conference_proceedings_records.json', + ), + get_configuration(), ), ], ids=[ @@ -72,11 +83,10 @@ def configuration(): ] ) def test_pos_conference_paper_record_and_proceedings_record( - configuration, - wait_until_services_are_up, expected_results, + config, ): - crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) results = CeleryMonitor.do_crawl( app=celery_app, @@ -84,20 +94,22 @@ def test_pos_conference_paper_record_and_proceedings_record( monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=configuration.get('CRAWLER_PROJECT'), + project=config['CRAWLER_PROJECT'], spider='pos', settings={}, - **configuration.get('CRAWLER_ARGUMENTS') + **config['CRAWLER_ARGUMENTS'] ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert sorted(gotten_results) == expected_results -# TODO create test 
that receives conference paper record AND proceedings record. -# 'Crawl-once' plug-in needed. +# TODO create test that receives conference paper record AND proceedings +# record. 'Crawl-once' plug-in needed. # TODO create test that receives proceedings record ONLY. From c98af1f013bf96c1ea26450efa2ea4919a852ed0 Mon Sep 17 00:00:00 2001 From: David Caro Date: Fri, 20 Oct 2017 20:52:04 +0200 Subject: [PATCH 10/14] pos: remove fft Signed-off-by: David Caro --- hepcrawl/items.py | 4 ---- hepcrawl/tohep.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/hepcrawl/items.py b/hepcrawl/items.py index 8bdb5478..dab67dda 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -318,7 +318,3 @@ class HEPRecord(scrapy.Item): thesis_supervisor = scrapy.Field() language = scrapy.Field() - - _fft = scrapy.Field() - """Used to communicate with legacy about files (to be) attached to the - record.""" diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 2bc3a783..d83d7e43 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -388,9 +388,6 @@ def _filter_affiliation(affiliations): for url in crawler_record.get('urls', []): builder.add_url(url=url.get('value')) - if crawler_record.get('_fft'): - builder.record['_fft'] = crawler_record.get('_fft') - builder.validate_record() return builder.record From 120369d75bd7bcc3c809217d4b2598558d71adf9 Mon Sep 17 00:00:00 2001 From: David Caro Date: Fri, 20 Oct 2017 20:52:13 +0200 Subject: [PATCH 11/14] functional.desy: remove fft namings Signed-off-by: David Caro --- tests/functional/desy/test_desy.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index 3c309e1f..b3ef14ae 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -77,24 +77,24 @@ def _generate_md5_hash(file_path): assert file_1_hash == file_2_hash -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) +def assert_documents_content_matches_expected(record): + for documents_field in record.get('documents', []): + assert_document_content_matches_expected(documents_field) -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - assert_files_equal(expected_file_name, fft_field['path']) +def assert_document_content_matches_expected(documents_field): + expected_file_name = get_file_name_from_documents(documents_field) + assert_files_equal(expected_file_name, documents_field['path']) -def get_file_name_from_fft(fft_field): +def get_file_name_from_documents(documents_field): file_path = get_test_suite_path( 'desy', 'fixtures', 'ftp_server', 'DESY', 'FFT', - fft_field['filename'] + fft_field['format'], + documents_field['filename'] + documents_field['format'], test_suite='functional', ) return file_path @@ -215,4 +215,4 @@ def test_desy( assert gotten_results == expected_results for record in gotten_results: - assert_ffts_content_matches_expected(record) + assert_documents_content_matches_expected(record) From cb5a69bf02ef06dd231cc39aa14b0b170160c44e Mon Sep 17 00:00:00 2001 From: David Caro Date: Sat, 28 Oct 2017 21:37:47 +0200 Subject: [PATCH 12/14] pos: adapt to documents Signed-off-by: David Caro --- hepcrawl/items.py | 9 ++++--- hepcrawl/spiders/pos_spider.py | 26 +++++++++++++------ hepcrawl/tohep.py | 11 ++++++++ .../pos_conference_proceedings_records.json | 18 +++++++++---- tests/functional/pos/test_pos.py | 13 
++++++++-- tests/unit/test_pos.py | 16 ++++++++---- 6 files changed, 69 insertions(+), 24 deletions(-) diff --git a/hepcrawl/items.py b/hepcrawl/items.py index dab67dda..f14f17df 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -44,15 +44,16 @@ class HEPRecord(scrapy.Item): file_urls = scrapy.Field() """List of files to be downloaded with FilesPipeline and added to files.""" - additional_files = scrapy.Field() + documents = scrapy.Field() """Files (fulltexts, package) belonging to this item. Example: :: - [{ - "type": "Fulltext", # Fulltext, Supplemental, Data, Figure - "uri": "file:///path/to/file", # can also be HTTP + "fulltext": true, + "url": "file:///path/to/file", + "description": "some fancy stuff", + "key": "usually_a_file_name.pdf", }] """ diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 49d1a830..b98c8508 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -12,7 +12,7 @@ from __future__ import absolute_import, division, print_function import re - +import os from urlparse import urljoin from scrapy import Request, Selector @@ -84,8 +84,8 @@ def __init__( ): super(POSSpider, self).__init__(**kwargs) self.source_file = source_file - self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url - self.BASE_PROCEEDINGS_URL = base_proceedings_url + self.base_conference_paper_url = base_conference_paper_url + self.base_proceedings_url = base_proceedings_url def start_requests(self): yield Request(self.source_file) @@ -124,6 +124,9 @@ def get_conference_paper_page_request(self, xml_selector, meta=None): ) def parse_conference_paper(self, response): + self.log( + 'Parsing conference paper from: {response.url}'.format(**vars()) + ) xml_record = response.meta.get('xml_record') conference_paper_url = response.url conference_paper_pdf_url = self._get_conference_paper_pdf_url( @@ -245,8 +248,8 @@ def build_conference_paper_item( record.add_value('collections', ['conferencepaper']) record.add_value('urls', [conference_paper_url]) record.add_value( - '_fft', - self._set_fft( + 'documents', + self.get_documents( path=conference_paper_pdf_url, ), ) @@ -322,13 +325,20 @@ def _get_proceedings_url(self, response): "//a[not(contains(text(),'pdf'))]/@href", ).extract_first() proceedings_identifier = internal_url.split('/')[1] - return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + return '{0}{1}'.format( + self.base_proceedings_url, + proceedings_identifier, + ) @staticmethod - def _set_fft(path): + def get_documents(path): return [ { - 'path': path, + 'key': os.path.basename(path), + 'url': path, + 'original_url': path, + 'hidden': True, + 'fulltext': True, }, ] diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index d83d7e43..a4529d7c 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -388,6 +388,17 @@ def _filter_affiliation(affiliations): for url in crawler_record.get('urls', []): builder.add_url(url=url.get('value')) + for document in crawler_record.get('documents', []): + builder.add_document( + description=document.get('description'), + fulltext=document.get('fulltext'), + hidden=document.get('hidden'), + key=document['key'], + material=document.get('material'), + original_url=document.get('original_url'), + url=document['url'], + ) + builder.validate_record() return builder.record diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json index ae21d507..94899d88 100644 --- 
a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -1,5 +1,7 @@ [ { + "_collections": [ "Literature" ], + "curated": false, "publication_info": [ { "journal_volume": "LATTICE 2013", @@ -13,7 +15,7 @@ { "source": "pos", "title": "Proceedings, 31st International Symposium on Lattice Field Theory LATTICE 2013", - "subtitle": "1-3 August 2002, Heidelberg, Germany" + "subtitle": "29 July \u2013 3 August, 2013 Mainz, Germany" } ], "acquisition_source": { @@ -24,6 +26,8 @@ } }, { + "_collections": [ "Literature" ], + "curated": false, "acquisition_source": { "source": "pos", "method": "hepcrawl", @@ -32,8 +36,7 @@ }, "license": [ { - "url": "https://creativecommons.org/licenses/by-nc-sa/3.0", - "license": "CC-BY-NC-SA-3.0" + "license": "Creative Commons Attribution-NonCommercial-ShareAlike" } ], "titles": [ @@ -42,9 +45,14 @@ "title": "Heavy Flavour Physics Review" } ], - "_fft": [ + "documents": [ { - "path": "https://http-server.local/187/001/pdf" + "fulltext": true, + "hidden": true, + "url": "https://http-server.local/187/001/pdf", + "original_url": "https://http-server.local/187/001/pdf", + "key": "pdf", + "source": "pos" } ], "urls": [ diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index a915516b..490ad058 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -92,7 +92,7 @@ def test_pos_conference_paper_record_and_proceedings_record( app=celery_app, monitor_timeout=5, monitor_iter_limit=100, - events_limit=1, + events_limit=2, crawler_instance=crawler, project=config['CRAWLER_PROJECT'], spider='pos', @@ -105,7 +105,16 @@ def test_pos_conference_paper_record_and_proceedings_record( override_generated_fields(expected) for expected in expected_results ] - assert sorted(gotten_results) == expected_results + gotten_results = sorted( + gotten_results, + key=lambda x: x['document_type'] + ) + expected_results = sorted( + expected_results, + key=lambda x: x['document_type'] + ) + + assert gotten_results == expected_results # TODO create test that receives conference paper record AND proceedings diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 689be197..b376c964 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -61,7 +61,7 @@ def generated_conference_paper(scrape_pos_conference_paper_page_body): response = HtmlResponse( url=request.url, request=request, - body=scrape_pos_page_body, + body=scrape_pos_conference_paper_page_body, **{'encoding': 'utf-8'} ) assert response @@ -154,6 +154,8 @@ def test_authors(generated_conference_paper): def test_pipeline_conference_paper(generated_conference_paper): expected = { + '_collections': ['Literature'], + 'curated': False, 'acquisition_source': { 'datetime': '2017-08-10T16:03:59.091110', 'method': 'hepcrawl', @@ -189,8 +191,7 @@ def test_pipeline_conference_paper(generated_conference_paper): ], 'license': [ { - 'license': 'CC-BY-NC-SA-3.0', - 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0' + 'license': 'Creative Commons Attribution-NonCommercial-ShareAlike', } ], 'publication_info': [ @@ -207,9 +208,14 @@ def test_pipeline_conference_paper(generated_conference_paper): 'title': u'Heavy Flavour Physics Review' } ], - '_fft': [ + 'documents': [ { - 'path': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf' + 'key': 'LATTICE 2013_001.pdf', + 'fulltext': True, + 'hidden': True, + 'url': 
u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf', + 'original_url': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf', + 'source': 'pos', } ], 'urls': [ From 5fb3a5cbc455e36c5d0502601a1f06bce851691d Mon Sep 17 00:00:00 2001 From: David Caro Date: Sat, 28 Oct 2017 21:39:17 +0200 Subject: [PATCH 13/14] functional.desy: adapt to documents structure Signed-off-by: David Caro --- tests/functional/desy/test_desy.py | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index b3ef14ae..42a51105 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -63,30 +63,6 @@ def _override(field_key, original_dict, backup_dict, new_value): return clean_record -def assert_files_equal(file_1, file_2): - """Compares two files calculating the md5 hash.""" - def _generate_md5_hash(file_path): - hasher = hashlib.md5() - with open(str(file_path), 'rb') as fd: - buf = fd.read() - hasher.update(buf) - return hasher.hexdigest() - - file_1_hash = _generate_md5_hash(file_1) - file_2_hash = _generate_md5_hash(file_2) - assert file_1_hash == file_2_hash - - -def assert_documents_content_matches_expected(record): - for documents_field in record.get('documents', []): - assert_document_content_matches_expected(documents_field) - - -def assert_document_content_matches_expected(documents_field): - expected_file_name = get_file_name_from_documents(documents_field) - assert_files_equal(expected_file_name, documents_field['path']) - - def get_file_name_from_documents(documents_field): file_path = get_test_suite_path( 'desy', @@ -94,7 +70,7 @@ def get_file_name_from_documents(documents_field): 'ftp_server', 'DESY', 'FFT', - documents_field['filename'] + documents_field['format'], + documents_field['key'], test_suite='functional', ) return file_path @@ -213,6 +189,3 @@ def test_desy( ) assert gotten_results == expected_results - - for record in gotten_results: - assert_documents_content_matches_expected(record) From 1dd708dfd51374905bc87e9b90f6275bca671862 Mon Sep 17 00:00:00 2001 From: David Caro Date: Sat, 28 Oct 2017 21:39:36 +0200 Subject: [PATCH 14/14] global: minimal adaptation to documents Signed-off-by: David Caro --- hepcrawl/items.py | 1 + hepcrawl/spiders/edp_spider.py | 4 ++-- hepcrawl/spiders/elsevier_spider.py | 5 ++++- hepcrawl/spiders/hindawi_spider.py | 12 ++++++------ hepcrawl/spiders/infn_spider.py | 5 ++++- hepcrawl/spiders/iop_spider.py | 26 +++++++++++++++----------- hepcrawl/spiders/magic_spider.py | 2 +- hepcrawl/spiders/mit_spider.py | 6 ++++-- hepcrawl/spiders/phenix_spider.py | 5 ++++- hepcrawl/spiders/pos_spider.py | 2 +- hepcrawl/spiders/t2k_spider.py | 10 +++++----- tests/unit/test_edp.py | 2 -- tests/unit/test_elsevier.py | 4 ++-- tests/unit/test_hindawi.py | 6 +++--- tests/unit/test_infn.py | 6 ++++-- tests/unit/test_iop.py | 21 ++++++++++----------- tests/unit/test_magic.py | 4 ++-- tests/unit/test_mit.py | 5 ++++- tests/unit/test_phenix.py | 9 ++++++--- tests/unit/test_t2k.py | 4 ++-- 20 files changed, 80 insertions(+), 59 deletions(-) diff --git a/hepcrawl/items.py b/hepcrawl/items.py index f14f17df..09d0d552 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -49,6 +49,7 @@ class HEPRecord(scrapy.Item): Example: :: + [{ "fulltext": true, "url": "file:///path/to/file", diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index ef075e4d..c051c8ee 100644 --- 
a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -312,7 +312,7 @@ def build_item_rich(self, response): # NOTE: maybe this should be removed as the 'rich' format records # are not open access. record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", @@ -384,7 +384,7 @@ def build_item_jats(self, response): if "pdf_links" in response.meta: record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index 3f1fe0c6..e2d4e919 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -995,7 +995,10 @@ def build_item(self, response): xml_file = response.meta.get("xml_url") if xml_file: - record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(xml_file, "HIDDEN", "Fulltext"), + ) sd_url = self._get_sd_url(xml_file) if requests.head(sd_url).status_code == 200: # Test if valid url record.add_value("urls", sd_url) diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 999b7183..5f81f5b4 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -154,13 +154,13 @@ def get_journal_pages(node): else: return journal_pages, '' - def create_file(self, file_path, file_access, file_type): - """Create a structured dictionary to add to 'files' item.""" + def create_document(self, file_path): + """Create a structured dictionary to add to 'documents' item.""" file_dict = { - "access": file_access, + "hidden": True, "description": self.name.upper(), "url": file_path, - "type": file_type, + "fulltext": True, } return file_dict @@ -219,9 +219,9 @@ def parse_node(self, response, node): record.add_value('file_urls', pdf_links) if xml_links: record.add_value( - 'additional_files', + 'documents', [ - self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext") + self.create_document(xml) for xml in xml_links ] ) diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 23e71708..2e093ab1 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -232,7 +232,10 @@ def build_item(self, response): pdf_files = response.meta.get("pdf_links") if pdf_files: - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', response.meta.get("authors")) record.add_value('date_published', response.meta.get("date_published")) record.add_value('thesis', response.meta.get("thesis_info")) diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index ee778e58..fbca3ae5 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage): if pattern in pdf_path: return os.path.join(self.pdf_files, pdf_path) - def add_file(self, file_path, file_access, file_type): + def add_document(self, file_path, hidden, fulltext): """Create a structured dictionary and add to 'files' item.""" file_dict = { - "access": file_access, + "hidden": hidden, + "fulltext": fulltext, "description": self.name.upper(), "url": file_path, - "type": file_type, } return file_dict @@ -206,21 +206,25 @@ def parse_node(self, response, node): 
record.add_value('collections', self.get_collections(doctype)) xml_file_path = response.url - record.add_value("additional_files", - self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext")) + record.add_value( + "documents", + self.add_document(xml_file_path, hidden=True, fulltext=True), + ) if self.pdf_files: pdf_file_path = self.get_pdf_path(volume, issue, fpage) if pdf_file_path: if doctype and "erratum" in doctype.lower(): - file_type = "Erratum" + fulltext = False else: - file_type = "Fulltext" + fulltext = True if journal_title in self.OPEN_ACCESS_JOURNALS: - file_access = "INSPIRE-PUBLIC" # FIXME: right? + hidden = False else: - file_access = "INSPIRE-HIDDEN" - record.add_value("additional_files", - self.add_file(pdf_file_path, file_access, file_type)) + hidden = True + record.add_value( + "documents", + self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext), + ) parsed_item = ParsedItem( record=record.load_item(), diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 41687674..8dfd5d51 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -177,7 +177,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("files")) + record.add_value("documents", response.meta.get("files")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 8ba61d89..21804873 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -207,8 +207,10 @@ def build_item(self, response): pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract() if pdf_files: - record.add_value('additional_files', self.add_file( - pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_xpath('date_published', "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()") diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index a5fcd025..aa54bd98 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -121,7 +121,10 @@ def parse_node(self, response, node): return None pdf_files = node.xpath(".//a/@href").extract() - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_value('date_published', year) record.add_value('thesis', {'degree_type': thesis_type}) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index b98c8508..19d4fee5 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -62,7 +62,7 @@ class POSSpider(StatefulSpider): To do that and because each needs the information of the previous, the spider must use the callbacks system provided by scrapy through the - :ref:`scrapy.html.response.Response` callback parameter, and chain the + :class:`scrapy.html.response.Response` callback parameter, and chain the parser functions. 
The deduplication of the conference proceedings papers is left for the diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index 4dd495a9..db18eb1e 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -101,16 +101,16 @@ def get_splash_links(self, node): return out_links - def add_file(self, pdf_files, file_access, file_type): + def add_document(self, pdf_files): """Create a structured dictionary and add to ``files`` item.""" # NOTE: should this be moved to utils? file_dicts = [] for link in pdf_files: file_dict = { - "access": file_access, + "hidden": True, + "fulltext": True, "description": self.name.title(), "url": urljoin(self.domain, link), - "type": file_type, } file_dicts.append(file_dict) return file_dicts @@ -149,7 +149,7 @@ def scrape_for_pdf(self, response): "//a[@class='contenttype-file state-internal url']/@href").extract() response.meta["abstract"] = abstract - response.meta["additional_files"] = self.add_file(file_paths, "HIDDEN", "Fulltext") + response.meta["documents"] = self.add_document(file_paths) return self.build_item(response) @@ -165,7 +165,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("additional_files")) + record.add_value("documents", response.meta.get("documents")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py index 219245bb..3ff75717 100644 --- a/tests/unit/test_edp.py +++ b/tests/unit/test_edp.py @@ -389,7 +389,6 @@ def test_no_dois_jats(): record = parsed_item.record assert "dois" not in record - assert "additional_files" not in record assert isinstance(record, HEPRecord) @@ -413,7 +412,6 @@ def test_no_dois_rich(): record = parsed_item.record assert "dois" not in record - assert "additional_files" not in record assert isinstance(record, HEPRecord) diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py index 126792a7..a3c436a5 100644 --- a/tests/unit/test_elsevier.py +++ b/tests/unit/test_elsevier.py @@ -326,8 +326,8 @@ def test_authors(record): def test_files(record): """Test file urls.""" - assert record["additional_files"] - assert record["additional_files"][0]['url'] == "elsevier/sample_consyn_record.xml" + assert record["documents"] + assert record["documents"][0]['url'] == "elsevier/sample_consyn_record.xml" def test_dois(record): diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py index 51f0fc77..a8f8f20b 100644 --- a/tests/unit/test_hindawi.py +++ b/tests/unit/test_hindawi.py @@ -96,9 +96,9 @@ def test_urls(record): def test_additional_files(record): """Test additional files.""" url = "http://downloads.hindawi.com/journals/aa/2010/194946.xml" - assert "additional_files" in record - assert record["additional_files"][0]["url"] == url - assert record["additional_files"][0]["access"] == "INSPIRE-HIDDEN" + assert "documents" in record + assert record["documents"][0]["url"] == url + assert record["documents"][0]["hidden"] def test_collections(record): diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py index c15ef727..526fdf40 100644 --- a/tests/unit/test_infn.py +++ b/tests/unit/test_infn.py @@ -83,8 +83,10 @@ def test_date_published(record): def test_files(record): """Test pdf files.""" - assert record["additional_files"][0][ - "url"] == 
"http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf" + assert record["documents"][0]["url"] == ( + "http://www.infn.it/thesis/PDF/getfile.php" + "?filename=10136-Fedon-dottorato.pdf" + ) def test_thesis(record): diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index bb01766c..1e48fb8a 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -154,10 +154,10 @@ def test_files(record): """Test files dictionary.""" pdf_filename = "test_143_3_336.pdf" - assert "additional_files" in record - assert record["additional_files"][1]["access"] == 'INSPIRE-HIDDEN' - assert record["additional_files"][1]["type"] == 'Fulltext' - assert record["additional_files"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) + assert "documents" in record + assert record["documents"][1]["hidden"] + assert record["documents"][1]["fulltext"] + assert record["documents"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) @pytest.fixture @@ -196,13 +196,12 @@ def erratum_open_access_record(): def test_files_erratum_open_access_record(erratum_open_access_record): """Test files dict with open access journal with erratum article.""" pdf_filename = "test_143_3_336.pdf" - assert "additional_files" in erratum_open_access_record - assert erratum_open_access_record["additional_files"][ - 1]["access"] == 'INSPIRE-PUBLIC' - assert erratum_open_access_record[ - "additional_files"][1]["type"] == 'Erratum' - assert erratum_open_access_record["additional_files"][ - 1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) + assert "documents" in erratum_open_access_record + assert not erratum_open_access_record["documents"][1]["hidden"] + assert not erratum_open_access_record["documents"][1]["fulltext"] + assert erratum_open_access_record["documents"][1]["url"] == ( + os.path.join(TEST_PDF_DIR, pdf_filename) + ) def test_not_published_record(): diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py index f3c0f355..16c52881 100644 --- a/tests/unit/test_magic.py +++ b/tests/unit/test_magic.py @@ -145,8 +145,8 @@ def test_url(record): def test_pdf_link(record): """Test pdf link(s)""" files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf" - assert 'additional_files' in record - assert record['additional_files'][1]['url'] == files + assert 'documents' in record + assert record['documents'][1]['url'] == files def test_no_author_no_date_no_url(): diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py index 8a185cef..895d2c2d 100644 --- a/tests/unit/test_mit.py +++ b/tests/unit/test_mit.py @@ -106,7 +106,10 @@ def test_date_published(record): def test_files(record): """Test pdf files.""" - assert record["additional_files"][0]["url"] == "http://dspace.mit.edu/bitstream/handle/1721.1/99287/922886248-MIT.pdf?sequence=1" + assert record["documents"][0]["url"] == ( + "http://dspace.mit.edu/bitstream/handle/1721.1/99287/" + "922886248-MIT.pdf?sequence=1" + ) def test_thesis(record): diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py index c272683f..2d9b2c58 100644 --- a/tests/unit/test_phenix.py +++ b/tests/unit/test_phenix.py @@ -91,6 +91,9 @@ def test_authors(record): def test_pdf_link(record): """Test pdf link(s)""" - files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf" - assert 'additional_files' in record - assert record['additional_files'][0]['url'] == files + files = ( + "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/" + "Guragain_Hari-DISSERTATION.pdf" + ) + assert 'documents' in record + 
assert record['documents'][0]['url'] == files diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py index d9395aa2..c3fe0e2c 100644 --- a/tests/unit/test_t2k.py +++ b/tests/unit/test_t2k.py @@ -113,8 +113,8 @@ def test_url(record): def test_pdf_link(record): """Test pdf link(s)""" files = "http://www.t2k.org/docs/thesis/001/IJT-THESIS" - assert 'additional_files' in record - assert record['additional_files'][0]['url'] == files + assert 'documents' in record + assert record['documents'][0]['url'] == files @pytest.fixture
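
Taken together, patches 10-14 drop the legacy _fft / additional_files fields in favour of the documents structure declared in items.py and consumed in tohep.py. A minimal illustrative sketch, not part of any patch, of the shape the spiders now emit (using the PDF path from the PoS unit-test fixture):

    import os

    def get_documents(path):
        # Mirrors POSSpider.get_documents from patch 12.
        return [
            {
                'key': os.path.basename(path),
                'url': path,
                'original_url': path,
                'hidden': True,
                'fulltext': True,
            },
        ]

    documents = get_documents(
        'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf'
    )
    assert documents[0]['key'] == 'LATTICE 2013_001.pdf'

hepcrawl.tohep then passes each entry to builder.add_document(key=..., url=..., fulltext=..., hidden=..., original_url=..., description=..., material=...), as shown in the tohep.py hunk of patch 12.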