pos: fix spider
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 22, 2017
1 parent 4978cf2 commit 8320ade
Showing 6 changed files with 108 additions and 55 deletions.
6 changes: 6 additions & 0 deletions hepcrawl/crawler2hep.py
@@ -339,6 +339,12 @@ def _filter_affiliation(affiliations):
             source=report_number.get('source')
         )
 
+    for url in crawler_record.get('urls', []):
+        builder.add_url(url=url.get('value'))
+
+    if crawler_record.get('_fft'):
+        builder.record['_fft'] = crawler_record.get('_fft')
+
     builder.validate_record()
 
     return builder.record
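
Note: for readers unfamiliar with hepcrawl's record builder, here is a standalone sketch of what the two new blocks do, with plain dicts standing in for the builder (the function name and sample values below are invented for illustration):

    # Hypothetical sketch of the pass-through above, using plain dicts
    # instead of hepcrawl's real record builder.
    def to_hep_sketch(crawler_record):
        record = {}

        # Each crawled URL value becomes a record URL.
        record['urls'] = [
            {'value': url.get('value')}
            for url in crawler_record.get('urls', [])
        ]

        # '_fft' entries are copied through untouched, so legacy learns
        # which files should be attached to the record.
        if crawler_record.get('_fft'):
            record['_fft'] = crawler_record.get('_fft')

        return record

    print(to_hep_sketch({
        'urls': [{'value': 'https://server.local/PoS(LATTICE%202013)001.html'}],
        '_fft': [{'path': 'https://server.local/187/001/pdf'}],
    }))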
4 changes: 4 additions & 0 deletions hepcrawl/items.py
@@ -318,3 +318,7 @@ class HEPRecord(scrapy.Item):
 
     thesis_supervisor = scrapy.Field()
     language = scrapy.Field()
+
+    _fft = scrapy.Field()
+    """Used to communicate with legacy about files (to be) attached to the
+    record."""
63 changes: 39 additions & 24 deletions hepcrawl/spiders/pos_spider.py
@@ -62,14 +62,13 @@ class POSSpider(Spider):
     To do that and because each needs the information of the previous, the
     spider must use the callbacks system provided by scrapy through the
-    :ref:`scrapy.html.response.Response` callback parameter, and chain the
+    :class:`scrapy.html.response.Response` callback parameter, and chain the
     parser functions.
     The deduplication of the conference proceedings papers is left for the
     `HepcrawlCrawlOnceMiddleware` middleware.
     Example:
         ::
            $ scrapy crawl PoS \\
            -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml"
     """
@@ -94,24 +93,26 @@ def parse(self, response):
         self.log('Got record from: {response.url}'.format(**vars()))
 
         response.selector.remove_namespaces()
-        records = response.selector.xpath('.//record')
-        for record in records:
-            yield self.get_conference_paper_page_request(raw_xml=record)
+        record_xml_selectors = response.selector.xpath('.//record')
+        for record_xml_selector in record_xml_selectors:
+            yield self.get_conference_paper_page_request(
+                xml_selector=record_xml_selector,
+            )
 
-    def get_conference_paper_page_request(self, raw_xml, meta=None):
+    def get_conference_paper_page_request(self, xml_selector, meta=None):
         """Gets the conference paper html page, for the pdf link for the
         conference paper, and later the internal conference id.
         """
         meta = meta or {}
 
-        identifier = raw_xml.xpath(
+        identifier = xml_selector.xpath(
             './/metadata/pex-dc/identifier/text()'
         ).extract_first()
         conference_paper_url = "{0}{1}".format(
             self.base_conference_paper_url,
             identifier,
         )
-        meta['xml_record'] = raw_xml
+        meta['xml_record'] = xml_selector.extract()
 
         # the meta parameter will be passed over to the callback as a property
         # in the response parameter
@@ -137,11 +138,11 @@ def parse_conference_paper(self, response):
 
         # prepare next callback step
         response.meta['html_record'] = response.body
-        yield self.get_conference_proceendings_page_request(
+        yield self.get_conference_proceedings_page_request(
             meta=response.meta,
         )
 
-    def get_conference_proceendings_page_request(self, meta):
+    def get_conference_proceedings_page_request(self, meta):
         """Gets the conference proceedings page, using the internal conference
         id from the record html page retrieved before.
         """
@@ -155,9 +156,10 @@ def get_conference_proceendings_page_request(self, meta):
         )
 
         page_selector = Selector(
-            text=meta.get('html_record'),
-            type='html',
+            text=meta.get('xml_record'),
+            type='xml',
         )
+        page_selector.remove_namespaces()
         pos_id = page_selector.xpath(
             ".//metadata/pex-dc/identifier/text()"
         ).extract_first()
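
Note: the switch from type='html' to type='xml' matters because `meta['xml_record']` now holds the extracted OAI record rather than the paper's HTML page, and namespaces must be stripped again on the fresh selector. A quick standalone check (the pex-dc fragment below is invented, shaped after the sample fixture):

    from scrapy.selector import Selector

    # Invented pex-dc fragment, shaped after the sample PoS record.
    xml_record = (
        '<record xmlns:pex-dc="http://pos.sissa.it/pex-dc/">'
        '<metadata><pex-dc:pex-dc>'
        '<pex-dc:identifier>PoS(LATTICE 2013)001</pex-dc:identifier>'
        '</pex-dc:pex-dc></metadata></record>'
    )

    page_selector = Selector(text=xml_record, type='xml')
    page_selector.remove_namespaces()
    print(page_selector.xpath(
        './/metadata/pex-dc/identifier/text()'
    ).extract_first())  # PoS(LATTICE 2013)001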
@@ -220,15 +222,15 @@ def build_conference_paper_item(
         ).extract_first()
         record.add_value(
             'journal_title',
-            self._get_journal_title(identifier=identifier),
+            self._get_journal_title(pos_ext_identifier=identifier),
         )
         record.add_value(
             'journal_volume',
-            self._get_journal_volume(identifier=identifier),
+            self._get_journal_volume(pos_ext_identifier=identifier),
         )
         record.add_value(
             'journal_artid',
-            self._get_journal_artid(identifier=identifier),
+            self._get_journal_artid(pos_ext_identifier=identifier),
         )
 
         record.add_xpath('title', '//metadata/pex-dc/title/text()')
@@ -240,8 +242,13 @@ def build_conference_paper_item(
         record.add_value('language', self._get_language(selector=selector))
         record.add_value('authors', self._get_authors(selector=selector))
         record.add_value('collections', ['conferencepaper'])
-        record.add_value('urls', conference_paper_pdf_url)
-        record.add_value('_fulltext_url', self._get_conference_paper_pdf_url())
+        record.add_value('urls', [conference_paper_url])
+        record.add_value(
+            '_fft',
+            self._set_fft(
+                path=conference_paper_pdf_url,
+            ),
+        )
 
         parsed_item = ParsedItem(
             record=record.load_item(),
@@ -277,7 +284,7 @@ def build_conference_proceedings_item(
         record.add_value('journal_title', 'PoS')
         record.add_value(
             'journal_volume',
-            self._get_journal_volume(pos_id=pos_id),
+            self._get_journal_volume(pos_ext_identifier=pos_id),
         )
 
         parsed_proceeding = ParsedItem(
@@ -309,6 +316,14 @@ def _get_conference_paper_pdf_url(self, conference_paper_page_html):
             conference_paper_pdf_relative_url,
         )
 
+    @staticmethod
+    def _set_fft(path):
+        return [
+            {
+                'path': path,
+            },
+        ]
+
     @staticmethod
     def _get_language(selector):
         language = selector.xpath(
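
Note: the new `_set_fft` helper just wraps the PDF link in the single-key list structure that legacy expects — the same shape as the `_fft` entry added to the JSON fixture below. Checked in isolation:

    def _set_fft(path):
        # Copy of the static helper above, runnable standalone.
        return [{'path': path}]

    print(_set_fft('https://server.local/187/001/pdf'))
    # -> [{'path': 'https://server.local/187/001/pdf'}]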
@@ -317,16 +332,16 @@ def _get_language(selector):
         return language if language != 'en' else None
 
     @staticmethod
-    def _get_journal_title(pos_id):
-        return re.split('[()]', pos_id)[0]
+    def _get_journal_title(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[0]
 
     @staticmethod
-    def _get_journal_volume(pos_id):
-        return re.split('[()]', pos_id)[1]
+    def _get_journal_volume(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[1]
 
     @staticmethod
-    def _get_journal_artid(pos_id):
-        return re.split('[()]', pos_id)[2]
+    def _get_journal_artid(pos_ext_identifier):
+        return re.split('[()]', pos_ext_identifier)[2]
 
     @staticmethod
     def _get_ext_systems_number(selector):
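
Note: the rename to `pos_ext_identifier` makes the contract of these three getters clearer — they split a PoS identifier such as PoS(LATTICE 2013)001 (the form seen in the fixture below) on parentheses. A worked example:

    import re

    # 'PoS(LATTICE 2013)001' splits on '(' and ')' into three parts.
    parts = re.split('[()]', 'PoS(LATTICE 2013)001')
    print(parts)     # ['PoS', 'LATTICE 2013', '001']
    print(parts[0])  # journal_title  -> 'PoS'
    print(parts[1])  # journal_volume -> 'LATTICE 2013'
    print(parts[2])  # journal_artid  -> '001'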
(expected-record JSON fixture; file name not captured in this view)
@@ -41,6 +41,16 @@
             "title": "Heavy Flavour Physics Review"
         }
     ],
+    "_fft": [
+        {
+            "path": "https://server.local/187/001/pdf"
+        }
+    ],
+    "urls": [
+        {
+            "value": "https://server.local/PoS(LATTICE%202013)001.html"
+        }
+    ],
     "authors": [
         {
             "affiliations": [
24 changes: 15 additions & 9 deletions tests/functional/pos/test_pos.py
@@ -32,7 +32,13 @@ def override_generated_fields(record):
 
 
 @pytest.fixture(scope="function")
-def set_up_environment():
+def wait_until_services_are_up(seconds=10):
+    # The test must wait until the docker environment is up (takes about 10 seconds).
+    sleep(seconds)
+
+
+@pytest.fixture(scope="function")
+def configuration():
     package_location = get_test_suite_path(
         'pos',
         'fixtures',
@@ -41,9 +47,6 @@ def set_up_environment():
         test_suite='functional',
     )
 
-    # The test must wait until the docker environment is up (takes about 10 seconds).
-    sleep(10)
-
     yield {
         'CRAWLER_HOST_URL': 'http://scrapyd:6800',
         'CRAWLER_PROJECT': 'hepcrawl',
@@ -69,21 +72,22 @@
     ]
 )
 def test_pos_conference_paper_record_and_proceedings_record(
-    set_up_environment,
-    expected_results,
+    configuration,
+    wait_until_services_are_up,
+    expected_results,
 ):
-    crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL'))
+    crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL'))
 
     results = CeleryMonitor.do_crawl(
         app=celery_app,
         monitor_timeout=5,
         monitor_iter_limit=100,
         events_limit=1,
         crawler_instance=crawler,
-        project=set_up_environment.get('CRAWLER_PROJECT'),
+        project=configuration.get('CRAWLER_PROJECT'),
         spider='pos',
         settings={},
-        **set_up_environment.get('CRAWLER_ARGUMENTS')
+        **configuration.get('CRAWLER_ARGUMENTS')
     )
 
     gotten_results = [override_generated_fields(result) for result in results]
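
Note: splitting the old `set_up_environment` into `configuration` and `wait_until_services_are_up` works because pytest resolves each test argument by fixture name. A minimal self-contained sketch of that wiring (the values are placeholders, not the real fixtures):

    import pytest

    @pytest.fixture(scope="function")
    def configuration():
        # Stand-in for the real fixture, which also resolves fixture paths.
        yield {'CRAWLER_HOST_URL': 'http://scrapyd:6800'}

    @pytest.fixture(scope="function")
    def wait_until_services_are_up():
        pass  # the real fixture sleeps while the docker services boot

    def test_example(configuration, wait_until_services_are_up):
        # pytest injects both fixtures before the test body runs.
        assert configuration['CRAWLER_HOST_URL'] == 'http://scrapyd:6800'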
@@ -93,6 +97,8 @@ def test_pos_conference_paper_record_and_proceedings_record(
 
 
 # TODO create test that receives conference paper record AND proceedings record.
+# 'Crawl-once' plug-in needed.
 
 
 # TODO create test that receives proceedings record ONLY.
+# 'Crawl-once' plug-in needed.