diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index ab5083ed..1f2404e4 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,7 +41,6 @@ class POSSpider(StatefulSpider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' - # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, @@ -73,8 +72,9 @@ def parse(self, response): # Probably all links lead to same place, so take first conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier) request = Request(conference_paper_url, callback=self.scrape_conference_paper) - request.meta["url"] = response.url - request.meta["record"] = record.extract() + request.meta['url'] = response.url + request.meta['record'] = record.extract() + request.meta['identifier'] = identifier yield request def scrape_conference_paper(self, response): @@ -84,18 +84,48 @@ def scrape_conference_paper(self, response): response=response, ) - # # Yield request for Conference page - # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - # proceedings_identifier = proceedings_identifier.split('=')[1] - # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) - # yield Request(pos_url, callback=self.scrape_proceedings) + # Scrape proceedings record + pos_url = self._get_proceedings_url(response) + self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) + meta = { + 'identifier': response.meta.get('identifier'), + } + yield Request( + pos_url, + callback=self.scrape_proceedings, + meta=meta, + ) + + yield self.build_conference_paper_item(response) + + def scrape_proceedings(self, response): + node = Selector( + text=response.body, + type='html', + ) + node.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=node + ) + + record.add_value('collections', ['proceeding']) + record.add_value('title', self._get_proceedings_title(node=node)) + record.add_value('subtitle', self._get_proceedings_date_place(node=node)) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + self._get_journal_volume( + identifier=response.meta.get('identifier'), + ) + ) - return self.build_conference_paper_item(response) + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) - # def scrape_proceedings(self, response): - # # create proceedings record - # import pytest - # pytest.set_trace() + return parsed_item def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" @@ -148,6 +178,13 @@ def _get_conference_paper_pdf_url(self, response): conference_paper_pdf_url, ) + def _get_proceedings_url(self, response): + internal_url = response.selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_identifier = internal_url.split('/')[1] + return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + @staticmethod def _get_language(node): language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() @@ -169,7 +206,7 @@ def _get_journal_artid(identifier): def _get_ext_systems_number(node): return [ { - 'institute': 'PoS', + 'institute': 'pos', 'value': node.xpath('.//identifier/text()').extract_first() }, ] @@ -196,10 +233,27 @@ def _get_authors(node): # To be refactored ) for affiliation in author.xpath('.//affiliation//text()').extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append({'value': affiliation}) + auth_dict['affiliations'].append( + { + 'value': affiliation + } + ) # Todo probably to remove else: - auth_dict['affiliations'] = [{'value': affiliation}, ] + auth_dict['affiliations'] = [ + { + 'value': affiliation + }, + ] if auth_dict: authors.append(auth_dict) return authors + + @staticmethod + def _get_proceedings_title(node): + return node.xpath('//h1/text()').extract_first() + + @staticmethod + def _get_proceedings_date_place(node): + date_place = node.xpath("//div[@class='conference_date']/text()").extract() + return ''.join(date_place) diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf index f4235640..1591cbcd 100644 --- a/tests/functional/pos/fixtures/https_server/conf/proxy.conf +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -14,4 +14,12 @@ server { rewrite ^.*$ /$mid.html permanent; } } + + location ~ /cgi-bin/reader/conf.cgi { + if ($args ~* "^confid=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } } diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/https_server/records/187.html new file mode 100644 index 00000000..0d86221a --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/187.html @@ -0,0 +1,125 @@ + + +
++ The annual lattice symposium brings together a global community of researchers + from theoretical particle physics and beyond, who employ numerical and + computational methods to study the properties of strongly interacting physical + systems, above all Quantum Chromodynamics (QCD), the theory describing the + interactions of quarks and gluons. Topics include studies of the spectrum and + structure of hadrons, lattice studies of matter under extreme conditions, + hadronic contributions to weak decay amplitudes, as well as recent + developments in simulation algorithms and computer hardware. The 2013 + conference in Mainz was attended by over 500 participants from all over the + globe, making it the biggest in this series so far. +
++ This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth + G. Wilson (June 8, 1936 - June 15, 2013). +
+Preface |
Foreword + PoS(LATTICE 2013)503 + pdf + + |
+
Ken Wilson Obituary + PoS(LATTICE 2013)504 + pdf + + |
+
Plenary sessions | +
Heavy Flavour Physics Review + PoS(LATTICE 2013)001 + pdf + + |
+
New Developments for Lattice Field Theory at Non-Zero Density + PoS(LATTICE 2013)002 + pdf + + |
+
Heavy Flavour Physics Review
+A. El-Khadra
+in 31st International Symposium on Lattice Field Theory LATTICE 2013
+Contribution: pdf
- -