Skip to content

Commit

Permalink
Merge pull request #145 from spirosdelviniotis/hepcrawl_use_material_pubinfo
Browse files Browse the repository at this point in the history

global: support `material` field for `publication_info`
  • Loading branch information
david-caro authored Jul 4, 2017
2 parents f33cee1 + 96d254c commit ad88862
Show file tree
Hide file tree
Showing 8 changed files with 17 additions and 3 deletions.
2 changes: 1 addition & 1 deletion docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ services:

unit:
<<: *service_base
command: bash -c "py.test tests/unit -vv && make -C docs html && python setup.py sdist && ls dist/*"
command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*"
links: []

celery:
Expand Down
1 change: 1 addition & 0 deletions hepcrawl/crawler2hep.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def _filter_affiliation(affiliations):
journal_title=_pub_info.get('journal_title'),
journal_volume=_pub_info.get('journal_volume'),
pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
material=_pub_info.get('pubinfo_material'),
)

for report_number in crawler_record.get('report_numbers', []):
Expand Down
4 changes: 4 additions & 0 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ class HEPRecord(scrapy.Item):
Example:
::
[
{
'license': license_str,
Expand Down Expand Up @@ -209,6 +210,9 @@ class HEPRecord(scrapy.Item):
pubinfo_freetext = scrapy.Field()
"""Raw journal reference string."""

pubinfo_material = scrapy.Field()
"""Material of publication information."""

publication_info = scrapy.Field()
"""Structured publication information."""

Expand Down
1 change: 1 addition & 0 deletions hepcrawl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class HEPLoader(ItemLoader):
journal_issue_out = TakeFirst()
journal_doctype_out = TakeFirst()
pubinfo_freetext_out = TakeFirst()
pubinfo_material_out = TakeFirst()

preprint_date_in = MapCompose(
parse_date,
Expand Down
2 changes: 2 additions & 0 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def process_item(self, item, spider):
'page_end': item.pop('journal_lpage', ''),
'note': item.pop('journal_doctype', ''),
'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
'pubinfo_material': item.pop('pubinfo_material', ''),
}]
if item.get('journal_year'):
item['publication_info'][0]['year'] = int(
Expand All @@ -110,6 +111,7 @@ def process_item(self, item, spider):
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])

item = crawler2hep(dict(item))
Expand Down
7 changes: 6 additions & 1 deletion hepcrawl/spiders/arxiv_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def parse_node(self, response, node):
dois_values=self._get_dois(node=node),
material='publication',
)
record.add_xpath('pubinfo_freetext', './/journal-ref//text()')

pubinfo_freetext = node.xpath('.//journal-ref//text()').extract()
if pubinfo_freetext:
record.add_value('pubinfo_freetext', pubinfo_freetext)
record.add_value('pubinfo_material', 'publication')

record.add_value('source', 'arXiv')

authors, collabs = self._get_authors_or_collaboration(node)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"results_data": [
{
"preprint_date": "2016-06-14",
"citeable": true,
"citeable": true,
"license": [
{
"url": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_arxiv_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def test_publication_info(results):
#TODO: check a more complete example
expected_pub_info = [
{
'material': 'publication',
'pubinfo_freetext': 'Phys.Rev. D93 (2015) 016005',
}
]
Expand Down

0 comments on commit ad88862

Please sign in to comment.