diff --git a/docker-compose.test.yml b/docker-compose.test.yml
index 9c80c88d..7ffe0122 100644
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@@ -42,7 +42,7 @@ services:
 
   unit:
     <<: *service_base
-    command: bash -c "py.test tests/unit -vv && make -C docs html && python setup.py sdist && ls dist/*"
+    command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*"
     links: []
 
   celery:
diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index 72573847..d6898022 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -185,6 +185,7 @@ def _filter_affiliation(affiliations):
         journal_title=_pub_info.get('journal_title'),
         journal_volume=_pub_info.get('journal_volume'),
         pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
+        material=_pub_info.get('pubinfo_material'),
     )
 
     for report_number in crawler_record.get('report_numbers', []):
diff --git a/hepcrawl/items.py b/hepcrawl/items.py
index a175ec0b..dab67dda 100644
--- a/hepcrawl/items.py
+++ b/hepcrawl/items.py
@@ -178,6 +178,7 @@ class HEPRecord(scrapy.Item):
 
     Example:
     ::
+
         [
             {
                 'license': license_str,
@@ -209,6 +210,9 @@
     pubinfo_freetext = scrapy.Field()
     """Raw journal reference string."""
 
+    pubinfo_material = scrapy.Field()
+    """Material of publication information."""
+
     publication_info = scrapy.Field()
     """Structured publication information."""
 
diff --git a/hepcrawl/loaders.py b/hepcrawl/loaders.py
index 9ab22450..96fa08ad 100644
--- a/hepcrawl/loaders.py
+++ b/hepcrawl/loaders.py
@@ -122,6 +122,7 @@ class HEPLoader(ItemLoader):
     journal_issue_out = TakeFirst()
     journal_doctype_out = TakeFirst()
     pubinfo_freetext_out = TakeFirst()
+    pubinfo_material_out = TakeFirst()
 
     preprint_date_in = MapCompose(
         parse_date,
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index fb19a745..62ba867c 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -93,6 +93,7 @@ def process_item(self, item, spider):
                 'page_end': item.pop('journal_lpage', ''),
                 'note': item.pop('journal_doctype', ''),
                 'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
+                'pubinfo_material': item.pop('pubinfo_material', ''),
             }]
             if item.get('journal_year'):
                 item['publication_info'][0]['year'] = int(
@@ -110,6 +111,7 @@
             'journal_doctype',
             'journal_artid',
             'pubinfo_freetext',
+            'pubinfo_material',
         ])
 
         item = crawler2hep(dict(item))
diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py
index dff7c2d0..d82c8318 100644
--- a/hepcrawl/spiders/arxiv_spider.py
+++ b/hepcrawl/spiders/arxiv_spider.py
@@ -64,7 +64,12 @@ def parse_node(self, response, node):
             dois_values=self._get_dois(node=node),
             material='publication',
         )
-        record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
+
+        pubinfo_freetext = node.xpath('.//journal-ref//text()').extract()
+        if pubinfo_freetext:
+            record.add_value('pubinfo_freetext', pubinfo_freetext)
+            record.add_value('pubinfo_material', 'publication')
+
         record.add_value('source', 'arXiv')
 
         authors, collabs = self._get_authors_or_collaboration(node)
diff --git a/tests/unit/responses/arxiv/sample_arxiv_record10_parsed.json b/tests/unit/responses/arxiv/sample_arxiv_record10_parsed.json
index 28b94a15..e591c862 100644
--- a/tests/unit/responses/arxiv/sample_arxiv_record10_parsed.json
+++ b/tests/unit/responses/arxiv/sample_arxiv_record10_parsed.json
@@ -5,7 +5,7 @@
     "results_data": [
         {
             "preprint_date": "2016-06-14",
-            "citeable": true,
+            "citeable": true,
             "license": [
                 {
                     "url": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py
index 1954436a..a6ed66d6 100644
--- a/tests/unit/test_arxiv_single.py
+++ b/tests/unit/test_arxiv_single.py
@@ -154,6 +154,7 @@ def test_publication_info(results):
     #TODO: check a more complete example
     expected_pub_info = [
         {
+            'material': 'publication',
            'pubinfo_freetext': 'Phys.Rev. D93 (2015) 016005',
        }
    ]