Skip to content

Commit

Permalink
pos: adapt to documents
Browse files (browse the repository at this point in the history)
Signed-off-by: David Caro <[email protected]>
  • Loading branch information
david-caro committed Oct 28, 2017
1 parent d72815a commit 7ebcec4
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 24 deletions.
9 changes: 5 additions & 4 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,16 @@ class HEPRecord(scrapy.Item):
file_urls = scrapy.Field()
"""List of files to be downloaded with FilesPipeline and added to files."""

additional_files = scrapy.Field()
documents = scrapy.Field()
"""Files (fulltexts, package) belonging to this item.
Example:
::
[{
"type": "Fulltext", # Fulltext, Supplemental, Data, Figure
"uri": "file:///path/to/file", # can also be HTTP
"fulltext": true,
"url": "file:///path/to/file",
"description": "some fancy stuff",
"key": "usually_a_file_name.pdf",
}]
"""

Expand Down
26 changes: 18 additions & 8 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from __future__ import absolute_import, division, print_function

import re

import os
from urlparse import urljoin

from scrapy import Request, Selector
Expand Down Expand Up @@ -84,8 +84,8 @@ def __init__(
):
super(POSSpider, self).__init__(**kwargs)
self.source_file = source_file
self.BASE_CONFERENCE_PAPER_URL = base_conference_paper_url
self.BASE_PROCEEDINGS_URL = base_proceedings_url
self.base_conference_paper_url = base_conference_paper_url
self.base_proceedings_url = base_proceedings_url

def start_requests(self):
yield Request(self.source_file)
Expand Down Expand Up @@ -124,6 +124,9 @@ def get_conference_paper_page_request(self, xml_selector, meta=None):
)

def parse_conference_paper(self, response):
self.log(
'Parsing conference paper from: {response.url}'.format(**vars())
)
xml_record = response.meta.get('xml_record')
conference_paper_url = response.url
conference_paper_pdf_url = self._get_conference_paper_pdf_url(
Expand Down Expand Up @@ -245,8 +248,8 @@ def build_conference_paper_item(
record.add_value('collections', ['conferencepaper'])
record.add_value('urls', [conference_paper_url])
record.add_value(
'_fft',
self._set_fft(
'documents',
self.get_documents(
path=conference_paper_pdf_url,
),
)
Expand Down Expand Up @@ -322,13 +325,20 @@ def _get_proceedings_url(self, response):
"//a[not(contains(text(),'pdf'))]/@href",
).extract_first()
proceedings_identifier = internal_url.split('/')[1]
return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier)
return '{0}{1}'.format(
self.base_proceedings_url,
proceedings_identifier,
)

@staticmethod
def _set_fft(path):
def get_documents(path):
return [
{
'path': path,
'key': os.path.basename(path),
'url': path,
'original_url': path,
'hidden': True,
'fulltext': True,
},
]

Expand Down
11 changes: 11 additions & 0 deletions hepcrawl/tohep.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,17 @@ def _filter_affiliation(affiliations):
for url in crawler_record.get('urls', []):
builder.add_url(url=url.get('value'))

for document in crawler_record.get('documents', []):
builder.add_document(
description=document.get('description'),
fulltext=document.get('fulltext'),
hidden=document.get('hidden'),
key=document['key'],
material=document.get('material'),
original_url=document.get('original_url'),
url=document['url'],
)

builder.validate_record()

return builder.record
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
[
{
"_collections": [ "Literature" ],
"curated": false,
"publication_info": [
{
"journal_volume": "LATTICE 2013",
Expand All @@ -13,7 +15,7 @@
{
"source": "pos",
"title": "Proceedings, 31st International Symposium on Lattice Field Theory LATTICE 2013",
"subtitle": "1-3 August 2002, Heidelberg, Germany"
"subtitle": "29 July \u2013 3 August, 2013 Mainz, Germany"
}
],
"acquisition_source": {
Expand All @@ -24,6 +26,8 @@
}
},
{
"_collections": [ "Literature" ],
"curated": false,
"acquisition_source": {
"source": "pos",
"method": "hepcrawl",
Expand All @@ -32,8 +36,7 @@
},
"license": [
{
"url": "https://creativecommons.org/licenses/by-nc-sa/3.0",
"license": "CC-BY-NC-SA-3.0"
"license": "Creative Commons Attribution-NonCommercial-ShareAlike"
}
],
"titles": [
Expand All @@ -42,9 +45,14 @@
"title": "Heavy Flavour Physics Review"
}
],
"_fft": [
"documents": [
{
"path": "https://http-server.local/187/001/pdf"
"fulltext": true,
"hidden": true,
"url": "https://http-server.local/187/001/pdf",
"original_url": "https://http-server.local/187/001/pdf",
"key": "pdf",
"source": "pos"
}
],
"urls": [
Expand Down
13 changes: 11 additions & 2 deletions tests/functional/pos/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_pos_conference_paper_record_and_proceedings_record(
app=celery_app,
monitor_timeout=5,
monitor_iter_limit=100,
events_limit=1,
events_limit=2,
crawler_instance=crawler,
project=config['CRAWLER_PROJECT'],
spider='pos',
Expand All @@ -105,7 +105,16 @@ def test_pos_conference_paper_record_and_proceedings_record(
override_generated_fields(expected) for expected in expected_results
]

assert sorted(gotten_results) == expected_results
gotten_results = sorted(
gotten_results,
key=lambda x: x['document_type']
)
expected_results = sorted(
expected_results,
key=lambda x: x['document_type']
)

assert gotten_results == expected_results


# TODO create test that receives conference paper record AND proceedings
Expand Down
16 changes: 11 additions & 5 deletions tests/unit/test_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def generated_conference_paper(scrape_pos_conference_paper_page_body):
response = HtmlResponse(
url=request.url,
request=request,
body=scrape_pos_page_body,
body=scrape_pos_conference_paper_page_body,
**{'encoding': 'utf-8'}
)
assert response
Expand Down Expand Up @@ -154,6 +154,8 @@ def test_authors(generated_conference_paper):

def test_pipeline_conference_paper(generated_conference_paper):
expected = {
'_collections': ['Literature'],
'curated': False,
'acquisition_source': {
'datetime': '2017-08-10T16:03:59.091110',
'method': 'hepcrawl',
Expand Down Expand Up @@ -189,8 +191,7 @@ def test_pipeline_conference_paper(generated_conference_paper):
],
'license': [
{
'license': 'CC-BY-NC-SA-3.0',
'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0'
'license': 'Creative Commons Attribution-NonCommercial-ShareAlike',
}
],
'publication_info': [
Expand All @@ -207,9 +208,14 @@ def test_pipeline_conference_paper(generated_conference_paper):
'title': u'Heavy Flavour Physics Review'
}
],
'_fft': [
'documents': [
{
'path': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf'
'key': 'LATTICE 2013_001.pdf',
'fulltext': True,
'hidden': True,
'url': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf',
'original_url': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf',
'source': 'pos',
}
],
'urls': [
Expand Down

0 comments on commit 7ebcec4

Please sign in to comment.