Skip to content

Commit

Permalink
pos: fix spider
Browse files Browse the repository at this point in the history
Signed-off-by: Spiros Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis committed Aug 22, 2017
1 parent 4978cf2 commit 0a33a49
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 53 deletions.
6 changes: 6 additions & 0 deletions hepcrawl/crawler2hep.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,12 @@ def _filter_affiliation(affiliations):
source=report_number.get('source')
)

for url in crawler_record.get('urls', []):
builder.add_url(url=url.get('value'))

if crawler_record.get('_fft'):
builder.record['_fft'] = crawler_record.get('_fft')

builder.validate_record()

return builder.record
4 changes: 4 additions & 0 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,7 @@ class HEPRecord(scrapy.Item):

thesis_supervisor = scrapy.Field()
language = scrapy.Field()

_fft = scrapy.Field()
"""Used to communicate with legacy about files (to be) attached to the
record."""
60 changes: 38 additions & 22 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,24 +94,26 @@ def parse(self, response):
self.log('Got record from: {response.url}'.format(**vars()))

response.selector.remove_namespaces()
records = response.selector.xpath('.//record')
for record in records:
yield self.get_conference_paper_page_request(raw_xml=record)
record_xml_selectors = response.selector.xpath('.//record')
for record_xml_selector in record_xml_selectors:
yield self.get_conference_paper_page_request(
xml_selector=record_xml_selector,
)

def get_conference_paper_page_request(self, raw_xml, meta=None):
def get_conference_paper_page_request(self, xml_selector, meta=None):
"""Gets the conference paper html page, to extract the pdf link for the
conference paper and, later, the internal conference id.
"""
meta = meta or {}

identifier = raw_xml.xpath(
identifier = xml_selector.xpath(
'.//metadata/pex-dc/identifier/text()'
).extract_first()
conference_paper_url = "{0}{1}".format(
self.base_conference_paper_url,
identifier,
)
meta['xml_record'] = raw_xml
meta['xml_record'] = xml_selector.extract()

# the meta parameter will be passed over to the callback as a property
# in the response parameter
Expand All @@ -137,11 +139,11 @@ def parse_conference_paper(self, response):

# prepare next callback step
response.meta['html_record'] = response.body
yield self.get_conference_proceendings_page_request(
yield self.get_conference_proceedings_page_request(
meta=response.meta,
)

def get_conference_proceendings_page_request(self, meta):
def get_conference_proceedings_page_request(self, meta):
"""Gets the conference proceedings page, using the internal conference
id from the record html page retrieved before.
"""
Expand All @@ -155,9 +157,10 @@ def get_conference_proceendings_page_request(self, meta):
)

page_selector = Selector(
text=meta.get('html_record'),
type='html',
text=meta.get('xml_record'),
type='xml',
)
page_selector.remove_namespaces()
pos_id = page_selector.xpath(
".//metadata/pex-dc/identifier/text()"
).extract_first()
Expand Down Expand Up @@ -220,15 +223,15 @@ def build_conference_paper_item(
).extract_first()
record.add_value(
'journal_title',
self._get_journal_title(identifier=identifier),
self._get_journal_title(pos_ext_identifier=identifier),
)
record.add_value(
'journal_volume',
self._get_journal_volume(identifier=identifier),
self._get_journal_volume(pos_ext_identifier=identifier),
)
record.add_value(
'journal_artid',
self._get_journal_artid(identifier=identifier),
self._get_journal_artid(pos_ext_identifier=identifier),
)

record.add_xpath('title', '//metadata/pex-dc/title/text()')
Expand All @@ -240,8 +243,13 @@ def build_conference_paper_item(
record.add_value('language', self._get_language(selector=selector))
record.add_value('authors', self._get_authors(selector=selector))
record.add_value('collections', ['conferencepaper'])
record.add_value('urls', conference_paper_pdf_url)
record.add_value('_fulltext_url', self._get_conference_paper_pdf_url())
record.add_value('urls', [conference_paper_url])
record.add_value(
'_fft',
self._set_fft(
path=conference_paper_pdf_url,
),
)

parsed_item = ParsedItem(
record=record.load_item(),
Expand Down Expand Up @@ -277,7 +285,7 @@ def build_conference_proceedings_item(
record.add_value('journal_title', 'PoS')
record.add_value(
'journal_volume',
self._get_journal_volume(pos_id=pos_id),
self._get_journal_volume(pos_ext_identifier=pos_id),
)

parsed_proceeding = ParsedItem(
Expand Down Expand Up @@ -309,6 +317,14 @@ def _get_conference_paper_pdf_url(self, conference_paper_page_html):
conference_paper_pdf_relative_url,
)

@staticmethod
def _set_fft(path):
return [
{
'path': path,
},
]

@staticmethod
def _get_language(selector):
language = selector.xpath(
Expand All @@ -317,16 +333,16 @@ def _get_language(selector):
return language if language != 'en' else None

@staticmethod
def _get_journal_title(pos_id):
return re.split('[()]', pos_id)[0]
def _get_journal_title(pos_ext_identifier):
return re.split('[()]', pos_ext_identifier)[0]

@staticmethod
def _get_journal_volume(pos_id):
return re.split('[()]', pos_id)[1]
def _get_journal_volume(pos_ext_identifier):
return re.split('[()]', pos_ext_identifier)[1]

@staticmethod
def _get_journal_artid(pos_id):
return re.split('[()]', pos_id)[2]
def _get_journal_artid(pos_ext_identifier):
return re.split('[()]', pos_ext_identifier)[2]

@staticmethod
def _get_ext_systems_number(selector):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@
"title": "Heavy Flavour Physics Review"
}
],
"_fft": [
{
"path": "https://server.local/187/001/pdf"
}
],
"urls": [
{
"value": "https://server.local/PoS(LATTICE%202013)001.html"
}
],
"authors": [
{
"affiliations": [
Expand Down
24 changes: 15 additions & 9 deletions tests/functional/pos/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,13 @@ def override_generated_fields(record):


@pytest.fixture(scope="function")
def set_up_environment():
def wait_until_services_are_up(seconds=10):
# The test must wait until the Docker environment is up (takes about 10 seconds).
sleep(seconds)


@pytest.fixture(scope="function")
def configuration():
package_location = get_test_suite_path(
'pos',
'fixtures',
Expand All @@ -41,9 +47,6 @@ def set_up_environment():
test_suite='functional',
)

# The test must wait until the docker environment is up (takes about 10 seconds).
sleep(10)

yield {
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
Expand All @@ -69,21 +72,22 @@ def set_up_environment():
]
)
def test_pos_conference_paper_record_and_proceedings_record(
set_up_environment,
expected_results,
configuration,
wait_until_services_are_up,
expected_results,
):
crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL'))

results = CeleryMonitor.do_crawl(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=100,
events_limit=1,
crawler_instance=crawler,
project=set_up_environment.get('CRAWLER_PROJECT'),
project=configuration.get('CRAWLER_PROJECT'),
spider='pos',
settings={},
**set_up_environment.get('CRAWLER_ARGUMENTS')
**configuration.get('CRAWLER_ARGUMENTS')
)

gotten_results = [override_generated_fields(result) for result in results]
Expand All @@ -93,6 +97,8 @@ def test_pos_conference_paper_record_and_proceedings_record(


# TODO create test that receives conference paper record AND proceedings record.
# 'Crawl-once' plug-in needed.


# TODO create test that receives proceedings record ONLY.
# 'Crawl-once' plug-in needed.
56 changes: 34 additions & 22 deletions tests/unit/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def override_generated_fields(record):


@pytest.fixture(scope='session')
def scrape_pos_page_body():
def scrape_pos_conference_paper_page_body():
return pkg_resources.resource_string(
__name__,
os.path.join(
Expand All @@ -41,7 +41,7 @@ def scrape_pos_page_body():


@pytest.fixture(scope='session')
def generated_record(scrape_pos_page_body):
def generated_conference_paper(scrape_pos_conference_paper_page_body):
"""Return results generator from the PoS spider."""
# environmental variables needed for the pipelines payload
os.environ['SCRAPY_JOB'] = 'scrapy_job'
Expand All @@ -51,12 +51,14 @@ def generated_record(scrape_pos_page_body):
crawler = Crawler(spidercls=pos_spider.POSSpider)
spider = pos_spider.POSSpider.from_crawler(crawler)
request = spider.parse(
fake_response_from_file('pos/sample_pos_record.xml')
fake_response_from_file(
file_name=str('pos/sample_pos_record.xml'),
)
).next()
response = HtmlResponse(
url=request.url,
request=request,
body=scrape_pos_page_body,
body=scrape_pos_conference_paper_page_body,
**{'encoding': 'utf-8'}
)
assert response
Expand All @@ -70,7 +72,7 @@ def generated_record(scrape_pos_page_body):
return parsed_record


def test_titles(generated_record):
def test_titles(generated_conference_paper):
"""Test extracting title."""
expected_titles = [
{
Expand All @@ -79,33 +81,33 @@ def test_titles(generated_record):
}
]

assert 'titles' in generated_record
assert generated_record['titles'] == expected_titles
assert 'titles' in generated_conference_paper
assert generated_conference_paper['titles'] == expected_titles


def test_license(generated_record):
def test_license(generated_conference_paper):
"""Test extracting license information."""
expected_license = [{
'license': 'CC-BY-NC-SA-3.0',
'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0',
}]
assert generated_record['license'] == expected_license
assert generated_conference_paper['license'] == expected_license


def test_collections(generated_record):
def test_collections(generated_conference_paper):
"""Test extracting collections."""
expected_document_type = ['conference paper']

assert generated_record.get('citeable')
assert generated_record.get('document_type') == expected_document_type
assert generated_conference_paper.get('citeable')
assert generated_conference_paper.get('document_type') == expected_document_type


def test_language(generated_record):
def test_language(generated_conference_paper):
"""Test extracting language."""
assert 'language' not in generated_record
assert 'language' not in generated_conference_paper


def test_publication_info(generated_record):
def test_publication_info(generated_conference_paper):
"""Test extracting dois."""
expected_pub_info = [{
'artid': '001',
Expand All @@ -114,13 +116,13 @@ def test_publication_info(generated_record):
'year': 2014,
}]

assert 'publication_info' in generated_record
assert 'publication_info' in generated_conference_paper

pub_info = generated_record['publication_info']
pub_info = generated_conference_paper['publication_info']
assert pub_info == expected_pub_info


def test_authors(generated_record):
def test_authors(generated_conference_paper):
"""Test authors."""
expected_authors = [
{
Expand All @@ -133,9 +135,9 @@ def test_authors(generated_record):
}
]

assert 'authors' in generated_record
assert 'authors' in generated_conference_paper

result_authors = generated_record['authors']
result_authors = generated_conference_paper['authors']

assert len(result_authors) == len(expected_authors)

Expand All @@ -144,7 +146,7 @@ def test_authors(generated_record):
assert author == expected_author


def test_pipeline_record(generated_record):
def test_pipeline_conference_paper(generated_conference_paper):
expected = {
'acquisition_source': {
'datetime': '2017-08-10T16:03:59.091110',
Expand Down Expand Up @@ -198,7 +200,17 @@ def test_pipeline_record(generated_record):
'source': u'Sissa Medialab',
'title': u'Heavy Flavour Physics Review'
}
],
'_fft': [
{
'path': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf'
}
],
'urls': [
{
'value': 'https://pos.sissa.it/contribution?id=PoS%28LATTICE+2013%29001'
}
]
}

assert override_generated_fields(generated_record) == expected
assert override_generated_fields(generated_conference_paper) == expected

0 comments on commit 0a33a49

Please sign in to comment.