Skip to content

Commit

Permalink
global: minimal adaptation to documents
Browse files: browse the repository at this point in the history
Signed-off-by: David Caro <[email protected]>
  • Loading branch information
david-caro committed Oct 29, 2017
1 parent 0d7ee20 commit c60a4e5
Show file tree
Hide file tree
Showing 20 changed files with 80 additions and 59 deletions.
1 change: 1 addition & 0 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class HEPRecord(scrapy.Item):
Example:
::
[{
"fulltext": true,
"url": "file:///path/to/file",
Expand Down
4 changes: 2 additions & 2 deletions hepcrawl/spiders/edp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def build_item_rich(self, response):
# NOTE: maybe this should be removed as the 'rich' format records
# are not open access.
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down Expand Up @@ -384,7 +384,7 @@ def build_item_jats(self, response):

if "pdf_links" in response.meta:
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,10 @@ def build_item(self, response):

xml_file = response.meta.get("xml_url")
if xml_file:
record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(xml_file, "HIDDEN", "Fulltext"),
)
sd_url = self._get_sd_url(xml_file)
if requests.head(sd_url).status_code == 200: # Test if valid url
record.add_value("urls", sd_url)
Expand Down
12 changes: 6 additions & 6 deletions hepcrawl/spiders/hindawi_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ def get_journal_pages(node):
else:
return journal_pages, ''

def create_file(self, file_path, file_access, file_type):
"""Create a structured dictionary to add to 'files' item."""
def create_document(self, file_path):
"""Create a structured dictionary to add to 'documents' item."""
file_dict = {
"access": file_access,
"hidden": True,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
"fulltext": True,
}
return file_dict

Expand Down Expand Up @@ -219,9 +219,9 @@ def parse_node(self, response, node):
record.add_value('file_urls', pdf_links)
if xml_links:
record.add_value(
'additional_files',
'documents',
[
self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext")
self.create_document(xml)
for xml in xml_links
]
)
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/infn_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,10 @@ def build_item(self, response):

pdf_files = response.meta.get("pdf_links")
if pdf_files:
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', response.meta.get("authors"))
record.add_value('date_published', response.meta.get("date_published"))
record.add_value('thesis', response.meta.get("thesis_info"))
Expand Down
26 changes: 15 additions & 11 deletions hepcrawl/spiders/iop_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage):
if pattern in pdf_path:
return os.path.join(self.pdf_files, pdf_path)

def add_file(self, file_path, file_access, file_type):
def add_document(self, file_path, hidden, fulltext):
"""Create a structured dictionary and add to 'files' item."""
file_dict = {
"access": file_access,
"hidden": hidden,
"fulltext": fulltext,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
}
return file_dict

Expand Down Expand Up @@ -206,21 +206,25 @@ def parse_node(self, response, node):
record.add_value('collections', self.get_collections(doctype))

xml_file_path = response.url
record.add_value("additional_files",
self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext"))
record.add_value(
"documents",
self.add_document(xml_file_path, hidden=True, fulltext=True),
)
if self.pdf_files:
pdf_file_path = self.get_pdf_path(volume, issue, fpage)
if pdf_file_path:
if doctype and "erratum" in doctype.lower():
file_type = "Erratum"
fulltext = False
else:
file_type = "Fulltext"
fulltext = True
if journal_title in self.OPEN_ACCESS_JOURNALS:
file_access = "INSPIRE-PUBLIC" # FIXME: right?
hidden = False
else:
file_access = "INSPIRE-HIDDEN"
record.add_value("additional_files",
self.add_file(pdf_file_path, file_access, file_type))
hidden = True
record.add_value(
"documents",
self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),
)

parsed_item = ParsedItem(
record=record.load_item(),
Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/magic_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def build_item(self, response):
record.add_value('title', response.meta.get("title"))
record.add_value('urls', response.meta.get("urls"))
record.add_value("abstract", response.meta.get("abstract"))
record.add_value("additional_files", response.meta.get("files"))
record.add_value("documents", response.meta.get("files"))
record.add_value('collections', ['HEP', 'THESIS'])

parsed_item = ParsedItem(
Expand Down
6 changes: 4 additions & 2 deletions hepcrawl/spiders/mit_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,10 @@ def build_item(self, response):

pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
if pdf_files:
record.add_value('additional_files', self.add_file(
pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_xpath('date_published',
"//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/phenix_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ def parse_node(self, response, node):
return None

pdf_files = node.xpath(".//a/@href").extract()
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_value('date_published', year)
record.add_value('thesis', {'degree_type': thesis_type})
Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class POSSpider(StatefulSpider):
To do that and because each needs the information of the previous, the
spider must use the callbacks system provided by scrapy through the
:ref:`scrapy.html.response.Response` callback parameter, and chain the
:class:`scrapy.html.response.Response` callback parameter, and chain the
parser functions.
The deduplication of the conference proceedings papers is left for the
Expand Down
10 changes: 5 additions & 5 deletions hepcrawl/spiders/t2k_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,16 @@ def get_splash_links(self, node):

return out_links

def add_file(self, pdf_files, file_access, file_type):
def add_document(self, pdf_files):
"""Create a structured dictionary and add to ``files`` item."""
# NOTE: should this be moved to utils?
file_dicts = []
for link in pdf_files:
file_dict = {
"access": file_access,
"hidden": True,
"fulltext": True,
"description": self.name.title(),
"url": urljoin(self.domain, link),
"type": file_type,
}
file_dicts.append(file_dict)
return file_dicts
Expand Down Expand Up @@ -149,7 +149,7 @@ def scrape_for_pdf(self, response):
"//a[@class='contenttype-file state-internal url']/@href").extract()

response.meta["abstract"] = abstract
response.meta["additional_files"] = self.add_file(file_paths, "HIDDEN", "Fulltext")
response.meta["documents"] = self.add_document(file_paths)

return self.build_item(response)

Expand All @@ -165,7 +165,7 @@ def build_item(self, response):
record.add_value('title', response.meta.get("title"))
record.add_value('urls', response.meta.get("urls"))
record.add_value("abstract", response.meta.get("abstract"))
record.add_value("additional_files", response.meta.get("additional_files"))
record.add_value("documents", response.meta.get("documents"))
record.add_value('collections', ['HEP', 'THESIS'])

parsed_item = ParsedItem(
Expand Down
2 changes: 0 additions & 2 deletions tests/unit/test_edp.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,6 @@ def test_no_dois_jats():
record = parsed_item.record

assert "dois" not in record
assert "additional_files" not in record
assert isinstance(record, HEPRecord)


Expand All @@ -413,7 +412,6 @@ def test_no_dois_rich():
record = parsed_item.record

assert "dois" not in record
assert "additional_files" not in record
assert isinstance(record, HEPRecord)


Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_elsevier.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,8 @@ def test_authors(record):

def test_files(record):
"""Test file urls."""
assert record["additional_files"]
assert record["additional_files"][0]['url'] == "elsevier/sample_consyn_record.xml"
assert record["documents"]
assert record["documents"][0]['url'] == "elsevier/sample_consyn_record.xml"


def test_dois(record):
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_hindawi.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ def test_urls(record):
def test_additional_files(record):
"""Test additional files."""
url = "http://downloads.hindawi.com/journals/aa/2010/194946.xml"
assert "additional_files" in record
assert record["additional_files"][0]["url"] == url
assert record["additional_files"][0]["access"] == "INSPIRE-HIDDEN"
assert "documents" in record
assert record["documents"][0]["url"] == url
assert record["documents"][0]["hidden"]


def test_collections(record):
Expand Down
6 changes: 4 additions & 2 deletions tests/unit/test_infn.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def test_date_published(record):

def test_files(record):
"""Test pdf files."""
assert record["additional_files"][0][
"url"] == "http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf"
assert record["documents"][0]["url"] == (
"http://www.infn.it/thesis/PDF/getfile.php"
"?filename=10136-Fedon-dottorato.pdf"
)


def test_thesis(record):
Expand Down
21 changes: 10 additions & 11 deletions tests/unit/test_iop.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ def test_files(record):
"""Test files dictionary."""
pdf_filename = "test_143_3_336.pdf"

assert "additional_files" in record
assert record["additional_files"][1]["access"] == 'INSPIRE-HIDDEN'
assert record["additional_files"][1]["type"] == 'Fulltext'
assert record["additional_files"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)
assert "documents" in record
assert record["documents"][1]["hidden"]
assert record["documents"][1]["fulltext"]
assert record["documents"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)


@pytest.fixture
Expand Down Expand Up @@ -196,13 +196,12 @@ def erratum_open_access_record():
def test_files_erratum_open_access_record(erratum_open_access_record):
"""Test files dict with open access journal with erratum article."""
pdf_filename = "test_143_3_336.pdf"
assert "additional_files" in erratum_open_access_record
assert erratum_open_access_record["additional_files"][
1]["access"] == 'INSPIRE-PUBLIC'
assert erratum_open_access_record[
"additional_files"][1]["type"] == 'Erratum'
assert erratum_open_access_record["additional_files"][
1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename)
assert "documents" in erratum_open_access_record
assert not erratum_open_access_record["documents"][1]["hidden"]
assert not erratum_open_access_record["documents"][1]["fulltext"]
assert erratum_open_access_record["documents"][1]["url"] == (
os.path.join(TEST_PDF_DIR, pdf_filename)
)


def test_not_published_record():
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_magic.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ def test_url(record):
def test_pdf_link(record):
"""Test pdf link(s)"""
files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf"
assert 'additional_files' in record
assert record['additional_files'][1]['url'] == files
assert 'documents' in record
assert record['documents'][1]['url'] == files


def test_no_author_no_date_no_url():
Expand Down
5 changes: 4 additions & 1 deletion tests/unit/test_mit.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ def test_date_published(record):

def test_files(record):
"""Test pdf files."""
assert record["additional_files"][0]["url"] == "http://dspace.mit.edu/bitstream/handle/1721.1/99287/922886248-MIT.pdf?sequence=1"
assert record["documents"][0]["url"] == (
"http://dspace.mit.edu/bitstream/handle/1721.1/99287/"
"922886248-MIT.pdf?sequence=1"
)


def test_thesis(record):
Expand Down
9 changes: 6 additions & 3 deletions tests/unit/test_phenix.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def test_authors(record):

def test_pdf_link(record):
"""Test pdf link(s)"""
files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf"
assert 'additional_files' in record
assert record['additional_files'][0]['url'] == files
files = (
"http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/"
"Guragain_Hari-DISSERTATION.pdf"
)
assert 'documents' in record
assert record['documents'][0]['url'] == files
4 changes: 2 additions & 2 deletions tests/unit/test_t2k.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ def test_url(record):
def test_pdf_link(record):
"""Test pdf link(s)"""
files = "http://www.t2k.org/docs/thesis/001/IJT-THESIS"
assert 'additional_files' in record
assert record['additional_files'][0]['url'] == files
assert 'documents' in record
assert record['documents'][0]['url'] == files


@pytest.fixture
Expand Down

0 comments on commit c60a4e5

Please sign in to comment.