Skip to content

Commit

Permalink
pos: add TODOs for implementation
Browse files Browse the repository at this point in the history
Signed-off-by: Spyridon Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis committed Aug 21, 2017
1 parent 2716d53 commit 908c943
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 13 deletions.
28 changes: 18 additions & 10 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,21 @@ class POSSpider(Spider):
-a source_file=file://`pwd`/tests/unit/responses/pos/sample_pos_record.xml
"""
name = 'pos'
# pos_proceedings_url = "https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid="

def __init__(
    self,
    source_file=None,
    base_conference_paper_url='https://pos.sissa.it/contribution?id=',
    base_proceedings_url='https://pos.sissa.it/cgi-bin/reader/conf.cgi?confid=',
    # TODO to be changed without question in the url
    # TODO make valid CA certificate
    **kwargs
):
    """Construct POS spider.

    Args:
        source_file: location of the PoS export to crawl; it is the
            target of the ``Request`` issued by ``start_requests``.
        base_conference_paper_url: URL prefix kept on the spider as
            ``BASE_CONFERENCE_PAPER_URL``.
        base_proceedings_url: URL prefix kept on the spider as
            ``BASE_PROCEEDINGS_URL``; the proceedings identifier is
            appended to it when the proceedings page URL is built.
        **kwargs: forwarded untouched to the parent spider constructor.
    """
    # Let the parent spider consume the remaining keyword arguments first.
    super(POSSpider, self).__init__(**kwargs)

    # URL prefixes are stored per instance so they can be overridden
    # through the constructor arguments.
    self.BASE_PROCEEDINGS_URL = base_proceedings_url
    self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url

    self.source_file = source_file

def start_requests(self):
yield Request(self.source_file)
Expand Down Expand Up @@ -81,18 +83,24 @@ def scrape_conference_paper(self, response):
response=response,
)

# # Yield request for Conference page
# proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first()
# proceedings_identifier = proceedings_identifier.split('=')[1]
# pos_url = "{0}{1}".format(self.pos_proceedings_url, proceedings_identifier)
# TODO Yield request for Conference page
proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first()
proceedings_identifier = proceedings_identifier.split('=')[1]
pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier)
self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars()))
# yield Request(pos_url, callback=self.scrape_proceedings)

return self.build_conference_paper_item(response)
yield self.build_conference_paper_item(response)

# def scrape_proceedings(self, response):
# # create proceedings record
# import pytest
# pytest.set_trace()
def scrape_proceedings(self, response):
    """Stub for creating a proceedings record from ``response``.

    Not implemented yet: the method performs no work and returns ``None``.
    The notes below list what the implementation still has to produce.
    """
    # TODO create proceedings record
    # TODO document_type = proceeding
    # TODO title = template("Proceedings, <title>")
    # TODO subtitle = template("<place>, <date>")
    # TODO publication_info.journal_title = "PoS"
    # TODO publication_info.journal_volume = identifier
    return None

def build_conference_paper_item(self, response):
"""Parse a PoS XML exported file into a HEP record."""
Expand Down
10 changes: 8 additions & 2 deletions tests/functional/pos/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ def set_up_oai_environment():
),
],
ids=[
'smoke',
'conference_paper_record_only',
]
)
def test_pos(
def test_pos_conference_paper_record(
set_up_oai_environment,
expected_results,
):
Expand All @@ -89,3 +89,9 @@ def test_pos(
expected_results = [override_generated_fields(expected) for expected in expected_results]

assert sorted(gotten_results) == expected_results


# TODO create test that receives conference paper record AND proceedings record.


# TODO create test that receives proceedings record ONLY.
2 changes: 1 addition & 1 deletion tests/unit/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def generated_record(scrape_pos_page_body):

pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
parsed_item = request.callback(response)
parsed_item = request.callback(response).next()
parsed_record = pipeline.process_item(parsed_item, spider)
assert parsed_record

Expand Down

0 comments on commit 908c943

Please sign in to comment.