From 1d6e8e8da7dcfb3258365069cc1612440f92d29a Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Thu, 10 Aug 2017 16:20:06 +0200 Subject: [PATCH] WIP for PoS spider Signed-off-by: Spiros Delviniotis --- hepcrawl/crawler2hep.py | 4 +- hepcrawl/spiders/pos_spider.py | 185 +++++++++--------- .../pos/sample_proceedings_page.html | 134 +++++++++++++ tests/unit/test_pos.py | 112 +++++++++-- 4 files changed, 323 insertions(+), 112 deletions(-) create mode 100644 tests/unit/responses/pos/sample_proceedings_page.html diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index cacc5590..c2ba80d8 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -69,7 +69,7 @@ def _normalize_hepcrawl_record(item, source): item['titles'] = [{ 'title': item.pop('title', ''), 'subtitle': item.pop('subtitle', ''), - 'source': source, + 'source': item.pop('source', source), }] item['abstracts'] = [{ @@ -178,7 +178,7 @@ def _filter_affiliation(affiliations): for author in crawler_record.get('authors', []): builder.add_author(builder.make_author( - author['full_name'], + full_name=author['full_name'], affiliations=_filter_affiliation(author['affiliations']), )) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 0b849a8e..1a79dafb 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -32,24 +32,17 @@ class POSSpider(Spider): """POS/Sissa crawler. Extracts from metadata: - * title - * article-id - * conf-acronym - * authors - * affiliations - * publication-date - * publisher - * license - * language - * link + todo:: be added... Example: :: - $ scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml + $ scrapy crawl PoS \\ + -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'PoS' - pos_base_url = "https://pos.sissa.it/contribution?id=" + BASE_CONFERENCE_PAPER_URL = "https://pos.sissa.it/contribution?id=" + # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__(self, source_file=None, **kwargs): """Construct POS spider.""" @@ -61,79 +54,73 @@ def start_requests(self): def parse(self, response): """Get PDF information.""" + self.log('Got record from: {response.url}'.format(**vars())) + node = response.selector node.remove_namespaces() for record in node.xpath('.//record'): identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first() if identifier: # Probably all links lead to same place, so take first - pos_url = "{0}{1}".format(self.pos_base_url, identifier) - request = Request(pos_url, callback=self.scrape_pos_page) + conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) + request = Request(conference_paper_url, callback=self.scrape_conference_paper) request.meta["url"] = response.url request.meta["record"] = record.extract() yield request - def scrape_pos_page(self, response): + def scrape_conference_paper(self, response): """Parse a page for PDF link.""" - response.meta["pos_pdf_url"] = response.selector.xpath( - "//a[contains(text(),'pdf')]/@href" - ).extract_first() - response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, response.meta["pos_pdf_url"]) response.meta["pos_url"] = response.url - return self.build_item(response) - - def build_item(self, response): - """Parse an PoS XML exported file into a HEP record.""" - text = response.meta["record"] - node = Selector(text=text, type="xml") - node.remove_namespaces() - record = HEPLoader(item=HEPRecord(), selector=node) - record.add_xpath('title', '//metadata/pex-dc/title/text()') - record.add_xpath('source', '//metadata/pex-dc/publisher/text()') + response.meta["conference_paper_pdf_url"] = self._get_conference_paper_pdf_url( + response=response, + ) - record.add_value('external_system_numbers', self._get_ext_systems_number(node)) + # # Yield request for Conference page + # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + # proceedings_identifier = proceedings_identifier.split('=')[1] + # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) + # yield Request(pos_url, callback=self.scrape_proceedings) - license = get_licenses( - license_text=node.xpath( - ".//metadata/pex-dc/rights/text()" - ).extract_first(), - ) - record.add_value('license', license) + return self.build_conference_paper_item(response) - date, year = self._get_date(node) - if date: - record.add_value('date_published', date) - if year: - record.add_value('journal_year', int(year)) + # def scrape_proceedings(self, response): + # # create proceedings record + # import pytest + # pytest.set_trace() - identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() - record.add_value('urls', response.meta['pos_url']) - if response.meta['pos_pdf_url']: - record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']}) - if identifier: - pbn = re.split('[()]', identifier) - if len(pbn) == 3: - conf_acronym = pbn[1] - article_id = pbn[2] - record.add_value('journal_title', pbn[0]) - record.add_value('journal_volume', conf_acronym) - record.add_value('journal_artid', article_id) - else: - record.add_value('pubinfo_freetext', identifier) + def build_conference_paper_item(self, response): + """Parse an PoS XML exported file into a HEP record.""" + meta = response.meta + xml_record = meta.get('record') + node = Selector( + text=xml_record, + type="xml" + ) + node.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=node + ) - language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() - if language: - record.add_value('language', language) + license_text = node.xpath('.//metadata/pex-dc/rights/text()').extract_first() + record.add_value('license', get_licenses(license_text=license_text)) - authors = self._get_authors(node) - if authors: - record.add_value('authors', authors) + date, year = self._get_date(node=node) + record.add_value('date_published', date) + record.add_value('journal_year', year) - extra_data = self._get_extra_data(node) - if extra_data: - record.add_value('extra_data', extra_data) + identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() + record.add_value('journal_title', self._get_journal_title(identifier=identifier)) + record.add_value('journal_volume', self._get_journal_volume(identifier=identifier)) + record.add_value('journal_artid', self._get_journal_artid(identifier=identifier)) - record.add_value('collections', ['HEP', 'ConferencePaper']) + record.add_xpath('title', '//metadata/pex-dc/title/text()') + record.add_xpath('source', '//metadata/pex-dc/publisher/text()') + record.add_value('external_system_numbers', self._get_ext_systems_number(node=node)) + record.add_value('language', self._get_language(node=node)) + record.add_value('authors', self._get_authors(node=node)) + record.add_value('collections', ['conferencepaper']) + record.add_value('urls', meta.get('pos_url')) parsed_item = ParsedItem( record=record.load_item(), @@ -142,50 +129,68 @@ def build_item(self, response): return parsed_item - def _get_ext_systems_number(self, node): + def _get_conference_paper_pdf_url(self, response): + conference_paper_pdf_url = response.selector.xpath( + "//a[contains(text(),'pdf')]/@href", + ).extract_first() + + return urljoin( + self.BASE_CONFERENCE_PAPER_URL, + conference_paper_pdf_url, + ) + + @staticmethod + def _get_language(node): + language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() + return language if language != 'en' else None + + @staticmethod + def _get_journal_title(identifier): + return re.split('[()]', identifier)[0] + + @staticmethod + def _get_journal_volume(identifier): + return re.split('[()]', identifier)[1] + + @staticmethod + def _get_journal_artid(identifier): + return re.split('[()]', identifier)[2] + + @staticmethod + def _get_ext_systems_number(node): return [ - { - 'institute': 'PoS', - 'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first() - }, { 'institute': 'PoS', 'value': node.xpath('.//identifier/text()').extract_first() }, ] - def _get_date(self, node): - """Get article date.""" - date = '' - year = '' + @staticmethod + def _get_date(node): full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first() date = create_valid_date(full_date) - if date: - year = date[0:4] + year = int(date[0:4]) + return date, year - def _get_authors(self, node): + @staticmethod + def _get_authors(node): # To be refactored """Get article authors.""" - author_selectors = node.xpath('.//metadata/pex-dc/creator') authors = [] - for selector in author_selectors: + creators = node.xpath('.//metadata/pex-dc/creator') + for creator in creators: auth_dict = {} - author = Selector(text=selector.extract()) - auth_dict['raw_name'] = \ - get_first(author.xpath('.//name//text()').extract(), default='') + author = Selector(text=creator.extract()) + auth_dict['raw_name'] = get_first( + author.xpath('.//name//text()').extract(), + default='', + ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: auth_dict['affiliations'].append({'value': affiliation}) + # Todo probably to remove else: auth_dict['affiliations'] = [{'value': affiliation}, ] if auth_dict: authors.append(auth_dict) return authors - - def _get_extra_data(self, node): - """Get info to help selection - not for INSPIRE record""" - extra_data = {} - - section = node.xpath(".//metadata/pex-dc/description/text()").extract_first() - extra_data['section'] = section.split(';', 1)[-1].strip() - return extra_data diff --git a/tests/unit/responses/pos/sample_proceedings_page.html b/tests/unit/responses/pos/sample_proceedings_page.html new file mode 100644 index 00000000..669e77b4 --- /dev/null +++ b/tests/unit/responses/pos/sample_proceedings_page.html @@ -0,0 +1,134 @@ + + + + 31st International Symposium on Lattice Field Theory LATTICE 2013 + + + + + Main Image + + + +

31st International Symposium on Lattice Field Theory LATTICE 2013

+ + + + +
LATTICE 2013 - (other lattice conferences)
+
29 July – 3 August, 2013
Mainz, Germany
+ +
+

+ The annual lattice symposium brings together a global community of researchers + from theoretical particle physics and beyond, who employ numerical and + computational methods to study the properties of strongly interacting physical + systems, above all Quantum Chromodynamics (QCD), the theory describing the + interactions of quarks and gluons. Topics include studies of the spectrum and + structure of hadrons, lattice studies of matter under extreme conditions, + hadronic contributions to weak decay amplitudes, as well as recent + developments in simulation algorithms and computer hardware. The 2013 + conference in Mainz was attended by over 500 participants from all over the + globe, making it the biggest in this series so far. +

+

+ This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth + G. Wilson (June 8, 1936 - June 15, 2013). +

+
+
conference main image
+
+ + + + + + + + + + + + + + + + + + +
Sessions
Preface
Plenary sessions
Algorithms and Machines
Applications beyond QCD
Physics beyond the Standard Model
Chiral Symmetry
Non-zero Temperature and Density
Hadron Spectroscopy and Interactions
Hadron Structure
Standard Model Parameters and Renormalization
Theoretical Developments
Vacuum Structure and Confinement
Weak Decays and Matrix Elements
Special Session: Coding Efforts
Posters
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Preface
Foreword
+ PoS(LATTICE 2013)503 + pdf + H. Wittig +
Ken Wilson Obituary
+ PoS(LATTICE 2013)504 + pdf + A. Kronfeld +
Plenary sessions
Heavy Flavour Physics Review
+ PoS(LATTICE 2013)001 + pdf + A. El-Khadra +
Charmonium, $D_s$ and $D_s^*$ from overlap fermion on domain wall fermion configurations
+ PoS(LATTICE 2013)500 + pdf + Y.b. Yang, Y. Chen, A. Alexandru, S.J. Dong, T. Draper, M. Gong, F. Lee, A. Li, K.F. Liu, Z. Liu, M. Lujan and N. Mathur +
+
+ + + + + + + + + + diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..1e4ec1c6 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -22,7 +22,13 @@ from hepcrawl.testlib.fixtures import fake_response_from_file -@pytest.fixture +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-08-10T16:03:59.091110' + + return record + + +@pytest.fixture(scope='session') def scrape_pos_page_body(): return pkg_resources.resource_string( __name__, @@ -34,9 +40,14 @@ def scrape_pos_page_body(): ) -@pytest.fixture -def record(scrape_pos_page_body): +@pytest.fixture(scope='session') +def generated_record(scrape_pos_page_body): """Return results generator from the PoS spider.""" + # environmental variables needed for the pipelines payload + os.environ['SCRAPY_JOB'] = 'scrapy_job' + os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri' + os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file' + crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) request = spider.parse( @@ -49,6 +60,7 @@ def record(scrape_pos_page_body): **{'encoding': 'utf-8'} ) assert response + pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) parsed_item = request.callback(response) @@ -58,42 +70,42 @@ def record(scrape_pos_page_body): return parsed_record -def test_titles(record): +def test_titles(generated_record): """Test extracting title.""" expected_titles = [ { - 'source': 'PoS', + 'source': 'Sissa Medialab', 'title': 'Heavy Flavour Physics Review', } ] - assert 'titles' in record - assert record['titles'] == expected_titles + assert 'titles' in generated_record + assert generated_record['titles'] == expected_titles -def test_license(record): +def test_license(generated_record): """Test extracting license information.""" expected_license = [{ 'license': 'CC-BY-NC-SA-3.0', 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0', }] - assert record['license'] == expected_license + assert generated_record['license'] == expected_license -def test_collections(record): +def test_collections(generated_record): """Test extracting collections.""" expected_document_type = ['conference paper'] - assert record.get('citeable') - assert record.get('document_type') == expected_document_type + assert generated_record.get('citeable') + assert generated_record.get('document_type') == expected_document_type -def test_language(record): +def test_language(generated_record): """Test extracting language.""" - assert 'language' not in record + assert 'language' not in generated_record -def test_publication_info(record): +def test_publication_info(generated_record): """Test extracting dois.""" expected_pub_info = [{ 'artid': '001', @@ -102,13 +114,13 @@ def test_publication_info(record): 'year': 2014, }] - assert 'publication_info' in record + assert 'publication_info' in generated_record - pub_info = record['publication_info'] + pub_info = generated_record['publication_info'] assert pub_info == expected_pub_info -def test_authors(record): +def test_authors(generated_record): """Test authors.""" expected_authors = [ { @@ -121,12 +133,72 @@ def test_authors(record): } ] - assert 'authors' in record + assert 'authors' in generated_record - result_authors = record['authors'] + result_authors = generated_record['authors'] assert len(result_authors) == len(expected_authors) # here we are making sure order is kept for author, expected_author in zip(result_authors, expected_authors): assert author == expected_author + + +def test_pipeline_record(generated_record): + expected = { + 'acquisition_source': { + 'datetime': '2017-08-10T16:03:59.091110', + 'method': 'hepcrawl', + 'source': 'PoS', + 'submission_number': 'scrapy_job' + }, + 'authors': [ + { + 'affiliations': [ + { + 'value': u'INFN and Universit\xe0 di Firenze' + } + ], + 'full_name': u'El-Khadra, Aida' + }, + { + 'affiliations': [ + { + 'value': u'U of Pecs' + } + ], + 'full_name': u'MacDonald, M.T.' + } + ], + 'citeable': True, + 'document_type': [ + 'conference paper' + ], + 'imprints': [ + { + 'date': '2014-03-19' + } + ], + 'license': [ + { + 'license': 'CC-BY-NC-SA-3.0', + 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0' + } + ], + 'publication_info': [ + { + 'artid': u'001', + 'journal_title': u'PoS', + 'journal_volume': u'LATTICE 2013', + 'year': 2014 + } + ], + 'titles': [ + { + 'source': u'Sissa Medialab', + 'title': u'Heavy Flavour Physics Review' + } + ] + } + + assert override_generated_fields(generated_record) == expected