From 908c94376ac3a5dcf9b7f1af18f871c41cb51de3 Mon Sep 17 00:00:00 2001 From: Spyridon Delviniotis Date: Mon, 21 Aug 2017 07:57:29 +0200 Subject: [PATCH] pos: add TODOs for implementation Signed-off-by: Spyridon Delviniotis --- hepcrawl/spiders/pos_spider.py | 28 ++++++++++++++++++---------- tests/functional/pos/test_pos.py | 10 ++++++++-- tests/unit/test_pos.py | 2 +- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 18fb18ae..3fcdab5a 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -41,19 +41,21 @@ class POSSpider(Spider): -a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml """ name = 'pos' - # pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=" def __init__( self, source_file=None, base_conference_paper_url='https://pos.sissa.it/contribution?id=', + base_proceedings_url='https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=', # TODO to be changed without question in the url + # TODO make valid CA certificate **kwargs ): """Construct POS spider.""" super(POSSpider, self).__init__(**kwargs) self.source_file = source_file self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url + self.BASE_PROCEEDINGS_URL = base_proceedings_url def start_requests(self): yield Request(self.source_file) @@ -81,18 +83,24 @@ def scrape_conference_paper(self, response): response=response, ) - # # Yield request for Conference page - # proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() - # proceedings_identifier = proceedings_identifier.split('=')[1] - # pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier) + # TODO Yield request for Conference page + proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first() + proceedings_identifier = proceedings_identifier.split('=')[1] + pos_url = 
"{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier) + self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars())) # yield Request(pos_url, callback=self.scrape_proceedings) - return self.build_conference_paper_item(response) + yield self.build_conference_paper_item(response) - # def scrape_proceedings(self, response): - # # create proceedings record - # import pytest - # pytest.set_trace() + def scrape_proceedings(self, response): + # TODO create proceedings record + # TODO document_type = proceeding + # TODO title = template("Proceedings, ") + # TODO subtitle = template("<place>, <date>") + # TODO publication_info.journal_title = "PoS" + # TODO publication_info.journal_volume = identifier + + pass def build_conference_paper_item(self, response): """Parse an PoS XML exported file into a HEP record.""" diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index b11cb7fa..582575bb 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -64,10 +64,10 @@ def set_up_oai_environment(): ), ], ids=[ - 'smoke', + 'conference_paper_record_only', ] ) -def test_pos( +def test_pos_conference_paper_record( set_up_oai_environment, expected_results, ): @@ -89,3 +89,9 @@ def test_pos( expected_results = [override_generated_fields(expected) for expected in expected_results] assert sorted(gotten_results) == expected_results + + +# TODO create test that receives conference paper record AND proceedings record. + + +# TODO create test that receives proceedings record ONLY. 
diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 22fd4015..4de5fa8b 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -63,7 +63,7 @@ def generated_record(scrape_pos_page_body): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - parsed_item = request.callback(response) + parsed_item = request.callback(response).next() parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record