From f1a427553ef276f431b6b1b8d627562b3935a3fe Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Mon, 21 Aug 2017 13:44:30 +0200 Subject: [PATCH] pos: add proceedings support Addresses #159 Signed-off-by: Spiros Delviniotis --- hepcrawl/spiders/pos_spider.py | 69 ++++++++-- .../pos/fixtures/https_server/conf/proxy.conf | 8 ++ .../fixtures/https_server/records/187.html | 125 ++++++++++++++++++ .../records/PoS(LATTICE 2013)001.html | 88 ++++++------ ...> pos_conference_proceedings_records.json} | 23 ++++ tests/functional/pos/test_pos.py | 7 +- 6 files changed, 264 insertions(+), 56 deletions(-) create mode 100644 tests/functional/pos/fixtures/https_server/records/187.html rename tests/functional/pos/fixtures/{pos_records.json => pos_conference_proceedings_records.json} (68%) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 5b20b588..4190b4b5 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -72,8 +72,9 @@ def parse(self, response): # Probably all links lead to same place, so take first conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) request = Request(conference_paper_url, callback=self.scrape_conference_paper) - request.meta["url"] = response.url - request.meta["record"] = record.extract() + request.meta['url'] = response.url + request.meta['record'] = record.extract() + request.meta['identifier'] = identifier yield request def scrape_conference_paper(self, response): @@ -83,24 +84,48 @@ def scrape_conference_paper(self, response): response=response, ) - # TODO Yield request for Conference page - proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - proceedings_identifier = proceedings_identifier.split('=')[1] - pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + # Scrape proceedings record + pos_url = self._get_proceedings_url(response) self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) - # yield Request(pos_url, callback=self.scrape_proceedings) + meta = { + 'identifier': response.meta.get('identifier'), + } + yield Request( + pos_url, + callback=self.scrape_proceedings, + meta=meta, + ) yield self.build_conference_paper_item(response) def scrape_proceedings(self, response): - # TODO create proceedings record - # TODO document_type = proceeding - # TODO title = template(“Proceedings, ”) - # TODO subtitle = template(“<place>, <date>”) - # TODO publication_info.journal_title = “PoS” - # TODO publication_info.journal_volume = identifier + node = Selector( + text=response.body, + type='html', + ) + node.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=node + ) - pass + record.add_value('collections', ['proceeding']) + record.add_value('title', self._get_proceedings_title(node=node)) + record.add_value('subtitle', self._get_proceedings_date_place(node=node)) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + self._get_journal_volume( + identifier=response.meta.get('identifier'), + ) + ) + + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" @@ -153,6 +178,13 @@ def _get_conference_paper_pdf_url(self, response): conference_paper_pdf_url, ) + def _get_proceedings_url(self, response): + internal_url = response.selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_identifier = internal_url.split('/')[1] + return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + @staticmethod def _get_language(node): language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() @@ -216,3 +248,12 @@ def _get_authors(node): # To be refactored if auth_dict: authors.append(auth_dict) return authors + + @staticmethod + def _get_proceedings_title(node): + return node.xpath('//h1/text()').extract_first() + + @staticmethod + def _get_proceedings_date_place(node): + date_place = node.xpath("//div[@class='conference_date']/text()").extract() + return ''.join(date_place) diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf index f4235640..1591cbcd 100644 --- a/tests/functional/pos/fixtures/https_server/conf/proxy.conf +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -14,4 +14,12 @@ server { rewrite ^.*$ /$mid.html permanent; } } + + location ~ /cgi-bin/reader/conf.cgi { + if ($args ~* "^confid=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } } diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/https_server/records/187.html new file mode 100644 index 00000000..0d86221a --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/187.html @@ -0,0 +1,125 @@ +<!DOCTYPE html> +<html> + <head> + <title>31st International Symposium on Lattice Field Theory LATTICE 2013 + + + + + Main Image + + + +

31st International Symposium on Lattice Field Theory LATTICE 2013

+ + + +
LATTICE 2013 - (other lattice conferences)
+
29 July – 3 August, 2013
Mainz, Germany
+
+

+ The annual lattice symposium brings together a global community of researchers + from theoretical particle physics and beyond, who employ numerical and + computational methods to study the properties of strongly interacting physical + systems, above all Quantum Chromodynamics (QCD), the theory describing the + interactions of quarks and gluons. Topics include studies of the spectrum and + structure of hadrons, lattice studies of matter under extreme conditions, + hadronic contributions to weak decay amplitudes, as well as recent + developments in simulation algorithms and computer hardware. The 2013 + conference in Mainz was attended by over 500 participants from all over the + globe, making it the biggest in this series so far. +

+

+ This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth + G. Wilson (June 8, 1936 - June 15, 2013). +

+
+
conference main image
+
+ + + + + + + + + + + + + + + + + + +
Sessions
Preface
Plenary sessions
Algorithms and Machines
Applications beyond QCD
Physics beyond the Standard Model
Chiral Symmetry
Non-zero Temperature and Density
Hadron Spectroscopy and Interactions
Hadron Structure
Standard Model Parameters and Renormalization
Theoretical Developments
Vacuum Structure and Confinement
Weak Decays and Matrix Elements
Special Session: Coding Efforts
Posters
+ + + + + + + + + + + + + + + + + + + + +
Preface
Foreword
+ PoS(LATTICE 2013)503 + pdf + H. Wittig +
Ken Wilson Obituary
+ PoS(LATTICE 2013)504 + pdf + A. Kronfeld +
Plenary sessions
Heavy Flavour Physics Review
+ PoS(LATTICE 2013)001 + pdf + A. El-Khadra +
New Developments for Lattice Field Theory at Non-Zero Density
+ PoS(LATTICE 2013)002 + pdf + C. Gattringer +
+
+ + + + + + + + + diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html index e080cb39..5ed0c148 100644 --- a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html +++ b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html @@ -1,55 +1,65 @@ - - + + PoS(LATTICE 2013)001 + + + + + + + + - +Main Image - - +

PoS(LATTICE 2013)001

+ + + -
- -
- Title - Heavy Flavour Physics Review -
- -
- Conference - 31st International Symposium on Lattice Field Theory LATTICE 2013 -
- -
- Authors -
-A. El-Khadra
-
- - -
- Contribution - pdf -
+
+

Heavy Flavour Physics Review

+

A. El-Khadra

+

in 31st International Symposium on Lattice Field Theory LATTICE 2013

+

Contribution: pdf

- -
- - + + + + + + + - + \ No newline at end of file diff --git a/tests/functional/pos/fixtures/pos_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json similarity index 68% rename from tests/functional/pos/fixtures/pos_records.json rename to tests/functional/pos/fixtures/pos_conference_proceedings_records.json index ee8b88af..3605ad13 100644 --- a/tests/functional/pos/fixtures/pos_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -1,5 +1,28 @@ [ { + "publication_info": [ + { + "journal_volume": "LATTICE 2013", + "journal_title": "PoS" + } + ], + "document_type": [ + "article" + ], + "titles": [ + { + "source": "pos", + "title": "31st International Symposium on Lattice Field Theory LATTICE 2013" + } + ], + "acquisition_source": { + "source": "pos", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + } + }, + { "acquisition_source": { "source": "pos", "method": "hepcrawl", diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 582575bb..c122b68e 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -50,6 +50,7 @@ def set_up_oai_environment(): 'CRAWLER_ARGUMENTS': { 'source_file': 'file://' + package_location, 'base_conference_paper_url': 'https://server.local/contribution?id=', + 'base_proceedings_url': 'https://server.local/cgi-bin/reader/conf.cgi?confid=', } } @@ -60,14 +61,14 @@ def set_up_oai_environment(): expected_json_results_from_file( 'pos', 'fixtures', - 'pos_records.json', + 'pos_conference_proceedings_records.json', ), ], ids=[ - 'conference_paper_record_only', + 'smoke', ] ) -def test_pos_conference_paper_record( +def test_pos_conference_paper_record_and_proceedings_record( set_up_oai_environment, expected_results, ):