diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index cbdc82d4..5b20b588 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,19 +41,21 @@ class POSSpider(Spider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' - # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, source_file=None, base_conference_paper_url='https://pos.sissa.it/contribution?id=', + base_proceedings_url='https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=', # TODO to be changed without question in the url + # TODO make valid CA certificate **kwargs ): """Construct POS spider.""" super(POSSpider, self).__init__(**kwargs) self.source_file = source_file self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url + self.BASE_PROCEEDINGS_URL = base_proceedings_url def start_requests(self): yield Request(self.source_file) @@ -81,18 +83,24 @@ def scrape_conference_paper(self, response): response=response, ) - # # Yield request for Conference page - # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - # proceedings_identifier = proceedings_identifier.split('=')[1] - # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) + # TODO Yield request for Conference page + proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + proceedings_identifier = proceedings_identifier.split('=')[1] + pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) # yield Request(pos_url, callback=self.scrape_proceedings) - return self.build_conference_paper_item(response) + yield self.build_conference_paper_item(response) - # def scrape_proceedings(self, response): - # # create proceedings record - # import pytest - # pytest.set_trace() + def scrape_proceedings(self, response): + # TODO create proceedings record + # TODO document_type = proceeding + # TODO title = template(“Proceedings, ”) + # TODO subtitle = template(“<place>, <date>”) + # TODO publication_info.journal_title = “PoS” + # TODO publication_info.journal_volume = identifier + + pass def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index b11cb7fa..582575bb 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -64,10 +64,10 @@ def set_up_oai_environment(): ), ], ids=[ - 'smoke', + 'conference_paper_record_only', ] ) -def test_pos( +def test_pos_conference_paper_record( set_up_oai_environment, expected_results, ): @@ -89,3 +89,9 @@ def test_pos( expected_results = [override_generated_fields(expected) for expected in expected_results] assert sorted(gotten_results) == expected_results + + +# TODO create test that receives conference paper record AND proceedings record. + + +# TODO create test that receives proceedings record ONLY. diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index a2745274..918ea592 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -66,7 +66,7 @@ def generated_record(scrape_pos_page_body): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - parsed_item = request.callback(response) + parsed_item = request.callback(response).next() parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record