From 5f8fe7f4c3b7edd2abb55008285d1b4994840345 Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 10 Aug 2017 16:20:06 +0200 Subject: [PATCH] WIP for PoS spider Signed-off-by: Spiros Delviniotis --- hepcrawl/crawler2hep.py | 4 +- hepcrawl/spiders/pos_spider.py | 191 ++++++++++-------- .../pos/sample_proceedings_page.html | 134 ++++++++++++ tests/unit/test_pos.py | 103 ++++++++-- 4 files changed, 323 insertions(+), 109 deletions(-) create mode 100644 tests/unit/responses/pos/sample_proceedings_page.html diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index cacc5590..c2ba80d8 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -69,7 +69,7 @@ def _normalize_hepcrawl_record(item, source): item['titles'] = [{ 'title': item.pop('title', ''), 'subtitle': item.pop('subtitle', ''), - 'source': source, + 'source': item.pop('source', source), }] item['abstracts'] = [{ @@ -178,7 +178,7 @@ def _filter_affiliation(affiliations): for author in crawler_record.get('authors', []): builder.add_author(builder.make_author( - author['full_name'], + full_name=author['full_name'], affiliations=_filter_affiliation(author['affiliations']), )) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 0b849a8e..91873e43 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -32,24 +32,17 @@ class POSSpider(Spider): """POS/Sissa crawler. Extracts from metadata: - * title - * article-id - * conf-acronym - * authors - * affiliations - * publication-date - * publisher - * license - * language - * link + todo:: be added... Example: :: - $ scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml + $ scrapy crawl PoS -a source_file=file://`pwd`/tests/unit/responses/pos/ + sample_pos_record.xml """ name = 'PoS' - pos_base_url = "https://pos.sissa.it/contribution?id=" + conference_paper_url = "https://pos.sissa.it/contribution?id=" + # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__(self, source_file=None, **kwargs): """Construct POS spider.""" @@ -61,118 +54,146 @@ def start_requests(self): def parse(self, response): """Get PDF information.""" + self.log('Got record from: {response.url}'.format(**vars())) + node = response.selector node.remove_namespaces() for record in node.xpath('.//record'): identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first() if identifier: # Probably all links lead to same place, so take first - pos_url = "{0}{1}".format(self.pos_base_url, identifier) - request = Request(pos_url, callback=self.scrape_pos_page) + conference_paper_url = "{0}{1}".format(self.conference_paper_url, identifier) + request = Request(conference_paper_url, callback=self.scrape_conference_paper) request.meta["url"] = response.url request.meta["record"] = record.extract() yield request - def scrape_pos_page(self, response): + def scrape_conference_paper(self, response): """Parse a page for PDF link.""" - response.meta["pos_pdf_url"] = response.selector.xpath( - "//a[contains(text(),'pdf')]/@href" - ).extract_first() - response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, response.meta["pos_pdf_url"]) response.meta["pos_url"] = response.url - return self.build_item(response) + response.meta["conference_paper_pdf_url"] = self._get_conference_paper_pdf_url( + response=response, + ) + + # # Yield request for Conference page + # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + # proceedings_identifier = proceedings_identifier.split('=')[1] + # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) + # yield Request(pos_url, callback=self.scrape_proceedings) + + return self.build_conference_paper_item(response) - def build_item(self, response): + # def scrape_proceedings(self, response): + # # create proceedings record + # import pytest + # pytest.set_trace() + + def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" - text = response.meta["record"] - node = Selector(text=text, type="xml") + meta = response.meta + xml_record = meta.get('record') + node = Selector( + text=xml_record, + type="xml" + ) node.remove_namespaces() - record = HEPLoader(item=HEPRecord(), selector=node) + record = HEPLoader( + item=HEPRecord(), + selector=node + ) + + license_text = node.xpath('.//metadata/pex-dc/rights/text()').extract_first() + record.add_value('license', get_licenses(license_text=license_text)) + + date, year = self._get_date(node=node) + record.add_value('date_published', date) + record.add_value('journal_year', year) + + identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() + record.add_value('journal_title', self._get_journal_title(identifier=identifier)) + record.add_value('journal_volume', self._get_journal_volume(identifier=identifier)) + record.add_value('journal_artid', self._get_journal_artid(identifier=identifier)) + record.add_xpath('title', '//metadata/pex-dc/title/text()') record.add_xpath('source', '//metadata/pex-dc/publisher/text()') + record.add_value('external_system_numbers', self._get_ext_systems_number(node=node)) + record.add_value('language', self._get_language(node=node)) + record.add_value('authors', self._get_authors(node=node)) + record.add_value('collections', ['conferencepaper']) + record.add_value('urls', meta.get('pos_url')) - record.add_value('external_system_numbers', self._get_ext_systems_number(node)) - - license = get_licenses( - license_text=node.xpath( - ".//metadata/pex-dc/rights/text()" - ).extract_first(), + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', ) - record.add_value('license', license) - date, year = self._get_date(node) - if date: - record.add_value('date_published', date) - if year: - record.add_value('journal_year', int(year)) + return parsed_item - identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() - record.add_value('urls', response.meta['pos_url']) - if response.meta['pos_pdf_url']: - record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']}) - if identifier: - pbn = re.split('[()]', identifier) - if len(pbn) == 3: - conf_acronym = pbn[1] - article_id = pbn[2] - record.add_value('journal_title', pbn[0]) - record.add_value('journal_volume', conf_acronym) - record.add_value('journal_artid', article_id) - else: - record.add_value('pubinfo_freetext', identifier) + def _get_conference_paper_pdf_url(self, response): + conference_paper_pdf_url = response.selector.xpath( + "//a[contains(text(),'pdf')]/@href", + ).extract_first() + + return urljoin( + self.conference_paper_url, + conference_paper_pdf_url, + ) + @staticmethod + def _get_language(node): language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() - if language: - record.add_value('language', language) + return language if language != 'en' else None - authors = self._get_authors(node) - if authors: - record.add_value('authors', authors) + @staticmethod + def _get_additional_files(meta): + conference_paper_pdf_url = meta.get('conference_paper_pdf_url') - extra_data = self._get_extra_data(node) - if extra_data: - record.add_value('extra_data', extra_data) + return { + 'type': 'Fulltext', + 'url': conference_paper_pdf_url or None, + } - record.add_value('collections', ['HEP', 'ConferencePaper']) + @staticmethod + def _get_journal_title(identifier): + return re.split('[()]', identifier)[0] - parsed_item = ParsedItem( - record=record.load_item(), - record_format='hepcrawl', - ) + @staticmethod + def _get_journal_volume(identifier): + return re.split('[()]', identifier)[1] - return parsed_item + @staticmethod + def _get_journal_artid(identifier): + return re.split('[()]', identifier)[2] - def _get_ext_systems_number(self, node): + @staticmethod + def _get_ext_systems_number(node): return [ - { - 'institute': 'PoS', - 'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first() - }, { 'institute': 'PoS', 'value': node.xpath('.//identifier/text()').extract_first() }, ] - def _get_date(self, node): - """Get article date.""" - date = '' - year = '' + @staticmethod + def _get_date(node): full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first() date = create_valid_date(full_date) - if date: - year = date[0:4] + year = int(date[0:4]) + return date, year - def _get_authors(self, node): + @staticmethod + def _get_authors(node): # To be refactored """Get article authors.""" - author_selectors = node.xpath('.//metadata/pex-dc/creator') authors = [] - for selector in author_selectors: + creators = node.xpath('.//metadata/pex-dc/creator') + for creator in creators: auth_dict = {} - author = Selector(text=selector.extract()) - auth_dict['raw_name'] = \ - get_first(author.xpath('.//name//text()').extract(), default='') + author = Selector(text=creator.extract()) + auth_dict['raw_name'] = get_first( + author.xpath('.//name//text()').extract(), + default='', + ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: auth_dict['affiliations'].append({'value': affiliation}) @@ -181,11 +202,3 @@ def _get_authors(self, node): if auth_dict: authors.append(auth_dict) return authors - - def _get_extra_data(self, node): - """Get info to help selection - not for INSPIRE record""" - extra_data = {} - - section = node.xpath(".//metadata/pex-dc/description/text()").extract_first() - extra_data['section'] = section.split(';', 1)[-1].strip() - return extra_data diff --git a/tests/unit/responses/pos/sample_proceedings_page.html b/tests/unit/responses/pos/sample_proceedings_page.html new file mode 100644 index 00000000..669e77b4 --- /dev/null +++ b/tests/unit/responses/pos/sample_proceedings_page.html @@ -0,0 +1,134 @@ + + + + 31st International Symposium on Lattice Field Theory LATTICE 2013 + + + + + Main Image + + + +

31st International Symposium on Lattice Field Theory LATTICE 2013

+ + + + +
LATTICE 2013 - (other lattice conferences)
+
29 July – 3 August, 2013
Mainz, Germany
+ +
+

+ The annual lattice symposium brings together a global community of researchers + from theoretical particle physics and beyond, who employ numerical and + computational methods to study the properties of strongly interacting physical + systems, above all Quantum Chromodynamics (QCD), the theory describing the + interactions of quarks and gluons. Topics include studies of the spectrum and + structure of hadrons, lattice studies of matter under extreme conditions, + hadronic contributions to weak decay amplitudes, as well as recent + developments in simulation algorithms and computer hardware. The 2013 + conference in Mainz was attended by over 500 participants from all over the + globe, making it the biggest in this series so far. +

+

+ This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth + G. Wilson (June 8, 1936 - June 15, 2013). +

+
+
conference main image
+
+ + + + + + + + + + + + + + + + + + +
Sessions
Preface
Plenary sessions
Algorithms and Machines
Applications beyond QCD
Physics beyond the Standard Model
Chiral Symmetry
Non-zero Temperature and Density
Hadron Spectroscopy and Interactions
Hadron Structure
Standard Model Parameters and Renormalization
Theoretical Developments
Vacuum Structure and Confinement
Weak Decays and Matrix Elements
Special Session: Coding Efforts
Posters
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Preface
Foreword
+ PoS(LATTICE 2013)503 + pdf + H. Wittig +
Ken Wilson Obituary
+ PoS(LATTICE 2013)504 + pdf + A. Kronfeld +
Plenary sessions
Heavy Flavour Physics Review
+ PoS(LATTICE 2013)001 + pdf + A. El-Khadra +
Charmonium, $D_s$ and $D_s^*$ from overlap fermion on domain wall fermion configurations
+ PoS(LATTICE 2013)500 + pdf + Y.b. Yang, Y. Chen, A. Alexandru, S.J. Dong, T. Draper, M. Gong, F. Lee, A. Li, K.F. Liu, Z. Liu, M. Lujan and N. Mathur +
+
+ + + + + + + + + + diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..300a717a 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -22,6 +22,12 @@ from hepcrawl.testlib.fixtures import fake_response_from_file +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-08-10T16:03:59.091110' + + return record + + @pytest.fixture def scrape_pos_page_body(): return pkg_resources.resource_string( @@ -35,7 +41,7 @@ def scrape_pos_page_body(): @pytest.fixture -def record(scrape_pos_page_body): +def generated_record(scrape_pos_page_body): """Return results generator from the PoS spider.""" crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) @@ -49,6 +55,7 @@ def record(scrape_pos_page_body): **{'encoding': 'utf-8'} ) assert response + pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) parsed_item = request.callback(response) @@ -58,42 +65,42 @@ def record(scrape_pos_page_body): return parsed_record -def test_titles(record): +def test_titles(generated_record): """Test extracting title.""" expected_titles = [ { - 'source': 'PoS', + 'source': 'Sissa Medialab', 'title': 'Heavy Flavour Physics Review', } ] - assert 'titles' in record - assert record['titles'] == expected_titles + assert 'titles' in generated_record + assert generated_record['titles'] == expected_titles -def test_license(record): +def test_license(generated_record): """Test extracting license information.""" expected_license = [{ 'license': 'CC-BY-NC-SA-3.0', 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0', }] - assert record['license'] == expected_license + assert generated_record['license'] == expected_license -def test_collections(record): +def test_collections(generated_record): """Test extracting collections.""" expected_document_type = ['conference paper'] - assert record.get('citeable') - assert record.get('document_type') == expected_document_type + assert generated_record.get('citeable') + assert generated_record.get('document_type') == expected_document_type -def test_language(record): +def test_language(generated_record): """Test extracting language.""" - assert 'language' not in record + assert 'language' not in generated_record -def test_publication_info(record): +def test_publication_info(generated_record): """Test extracting dois.""" expected_pub_info = [{ 'artid': '001', @@ -102,13 +109,13 @@ def test_publication_info(record): 'year': 2014, }] - assert 'publication_info' in record + assert 'publication_info' in generated_record - pub_info = record['publication_info'] + pub_info = generated_record['publication_info'] assert pub_info == expected_pub_info -def test_authors(record): +def test_authors(generated_record): """Test authors.""" expected_authors = [ { @@ -121,12 +128,72 @@ def test_authors(record): } ] - assert 'authors' in record + assert 'authors' in generated_record - result_authors = record['authors'] + result_authors = generated_record['authors'] assert len(result_authors) == len(expected_authors) # here we are making sure order is kept for author, expected_author in zip(result_authors, expected_authors): assert author == expected_author + + +def test_pipeline_record(generated_record): + expected = { + 'acquisition_source': { + 'datetime': '2017-08-10T16:03:59.091110', + 'method': 'hepcrawl', + 'source': 'PoS', + 'submission_number': 'None' + }, + 'authors': [ + { + 'affiliations': [ + { + 'value': u'INFN and Universit\xe0 di Firenze' + } + ], + 'full_name': u'El-Khadra, Aida' + }, + { + 'affiliations': [ + { + 'value': u'U of Pecs' + } + ], + 'full_name': u'MacDonald, M.T.' + } + ], + 'citeable': True, + 'document_type': [ + 'conference paper' + ], + 'imprints': [ + { + 'date': '2014-03-19' + } + ], + 'license': [ + { + 'license': 'CC-BY-NC-SA-3.0', + 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0' + } + ], + 'publication_info': [ + { + 'artid': u'001', + 'journal_title': u'PoS', + 'journal_volume': u'LATTICE 2013', + 'year': 2014 + } + ], + 'titles': [ + { + 'source': u'Sissa Medialab', + 'title': u'Heavy Flavour Physics Review' + } + ] + } + + assert override_generated_fields(generated_record) == expected