From 1dd708dfd51374905bc87e9b90f6275bca671862 Mon Sep 17 00:00:00 2001 From: David Caro Date: Sat, 28 Oct 2017 21:39:36 +0200 Subject: [PATCH] global: minimal adaptation to documents Signed-off-by: David Caro --- hepcrawl/items.py | 1 + hepcrawl/spiders/edp_spider.py | 4 ++-- hepcrawl/spiders/elsevier_spider.py | 5 ++++- hepcrawl/spiders/hindawi_spider.py | 12 ++++++------ hepcrawl/spiders/infn_spider.py | 5 ++++- hepcrawl/spiders/iop_spider.py | 26 +++++++++++++++----------- hepcrawl/spiders/magic_spider.py | 2 +- hepcrawl/spiders/mit_spider.py | 6 ++++-- hepcrawl/spiders/phenix_spider.py | 5 ++++- hepcrawl/spiders/pos_spider.py | 2 +- hepcrawl/spiders/t2k_spider.py | 10 +++++----- tests/unit/test_edp.py | 2 -- tests/unit/test_elsevier.py | 4 ++-- tests/unit/test_hindawi.py | 6 +++--- tests/unit/test_infn.py | 6 ++++-- tests/unit/test_iop.py | 21 ++++++++++----------- tests/unit/test_magic.py | 4 ++-- tests/unit/test_mit.py | 5 ++++- tests/unit/test_phenix.py | 9 ++++++--- tests/unit/test_t2k.py | 4 ++-- 20 files changed, 80 insertions(+), 59 deletions(-) diff --git a/hepcrawl/items.py b/hepcrawl/items.py index f14f17df..09d0d552 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -49,6 +49,7 @@ class HEPRecord(scrapy.Item): Example: :: + [{ "fulltext": true, "url": "file:///path/to/file", diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index ef075e4d..c051c8ee 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -312,7 +312,7 @@ def build_item_rich(self, response): # NOTE: maybe this should be removed as the 'rich' format records # are not open access. record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", @@ -384,7 +384,7 @@ def build_item_jats(self, response): if "pdf_links" in response.meta: record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index 3f1fe0c6..e2d4e919 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -995,7 +995,10 @@ def build_item(self, response): xml_file = response.meta.get("xml_url") if xml_file: - record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(xml_file, "HIDDEN", "Fulltext"), + ) sd_url = self._get_sd_url(xml_file) if requests.head(sd_url).status_code == 200: # Test if valid url record.add_value("urls", sd_url) diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 999b7183..5f81f5b4 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -154,13 +154,13 @@ def get_journal_pages(node): else: return journal_pages, '' - def create_file(self, file_path, file_access, file_type): - """Create a structured dictionary to add to 'files' item.""" + def create_document(self, file_path): + """Create a structured dictionary to add to 'documents' item.""" file_dict = { - "access": file_access, + "hidden": True, "description": self.name.upper(), "url": file_path, - "type": file_type, + "fulltext": True, } return file_dict @@ -219,9 +219,9 @@ def parse_node(self, response, node): record.add_value('file_urls', pdf_links) if xml_links: record.add_value( - 'additional_files', + 'documents', [ - self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext") + self.create_document(xml) for xml in xml_links ] ) diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 23e71708..2e093ab1 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -232,7 +232,10 @@ def build_item(self, response): pdf_files = response.meta.get("pdf_links") if pdf_files: - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', response.meta.get("authors")) record.add_value('date_published', response.meta.get("date_published")) record.add_value('thesis', response.meta.get("thesis_info")) diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index ee778e58..fbca3ae5 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage): if pattern in pdf_path: return os.path.join(self.pdf_files, pdf_path) - def add_file(self, file_path, file_access, file_type): + def add_document(self, file_path, hidden, fulltext): """Create a structured dictionary and add to 'files' item.""" file_dict = { - "access": file_access, + "hidden": hidden, + "fulltext": fulltext, "description": self.name.upper(), "url": file_path, - "type": file_type, } return file_dict @@ -206,21 +206,25 @@ def parse_node(self, response, node): record.add_value('collections', self.get_collections(doctype)) xml_file_path = response.url - record.add_value("additional_files", - self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext")) + record.add_value( + "documents", + self.add_document(xml_file_path, hidden=True, fulltext=True), + ) if self.pdf_files: pdf_file_path = self.get_pdf_path(volume, issue, fpage) if pdf_file_path: if doctype and "erratum" in doctype.lower(): - file_type = "Erratum" + fulltext = False else: - file_type = "Fulltext" + fulltext = True if journal_title in self.OPEN_ACCESS_JOURNALS: - file_access = "INSPIRE-PUBLIC" # FIXME: right? + hidden = False else: - file_access = "INSPIRE-HIDDEN" - record.add_value("additional_files", - self.add_file(pdf_file_path, file_access, file_type)) + hidden = True + record.add_value( + "documents", + self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext), + ) parsed_item = ParsedItem( record=record.load_item(), diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 41687674..8dfd5d51 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -177,7 +177,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("files")) + record.add_value("documents", response.meta.get("files")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 8ba61d89..21804873 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -207,8 +207,10 @@ def build_item(self, response): pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract() if pdf_files: - record.add_value('additional_files', self.add_file( - pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_xpath('date_published', "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()") diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index a5fcd025..aa54bd98 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -121,7 +121,10 @@ def parse_node(self, response, node): return None pdf_files = node.xpath(".//a/@href").extract() - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_value('date_published', year) record.add_value('thesis', {'degree_type': thesis_type}) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index b98c8508..19d4fee5 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -62,7 +62,7 @@ class POSSpider(StatefulSpider): To do that and because each needs the information of the previous, the spider must use the callbacks system provided by scrapy through the - :ref:`scrapy.html.response.Response` callback parameter, and chain the + :class:`scrapy.html.response.Response` callback parameter, and chain the parser functions. The deduplication of the conference proceedings papers is left for the diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index 4dd495a9..db18eb1e 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -101,16 +101,16 @@ def get_splash_links(self, node): return out_links - def add_file(self, pdf_files, file_access, file_type): + def add_document(self, pdf_files): """Create a structured dictionary and add to ``files`` item.""" # NOTE: should this be moved to utils? file_dicts = [] for link in pdf_files: file_dict = { - "access": file_access, + "hidden": True, + "fulltext": True, "description": self.name.title(), "url": urljoin(self.domain, link), - "type": file_type, } file_dicts.append(file_dict) return file_dicts @@ -149,7 +149,7 @@ def scrape_for_pdf(self, response): "//a[@class='contenttype-file state-internal url']/@href").extract() response.meta["abstract"] = abstract - response.meta["additional_files"] = self.add_file(file_paths, "HIDDEN", "Fulltext") + response.meta["documents"] = self.add_document(file_paths) return self.build_item(response) @@ -165,7 +165,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("additional_files")) + record.add_value("documents", response.meta.get("documents")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py index 219245bb..3ff75717 100644 --- a/tests/unit/test_edp.py +++ b/tests/unit/test_edp.py @@ -389,7 +389,6 @@ def test_no_dois_jats(): record = parsed_item.record assert "dois" not in record - assert "additional_files" not in record assert isinstance(record, HEPRecord) @@ -413,7 +412,6 @@ def test_no_dois_rich(): record = parsed_item.record assert "dois" not in record - assert "additional_files" not in record assert isinstance(record, HEPRecord) diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py index 126792a7..a3c436a5 100644 --- a/tests/unit/test_elsevier.py +++ b/tests/unit/test_elsevier.py @@ -326,8 +326,8 @@ def test_authors(record): def test_files(record): """Test file urls.""" - assert record["additional_files"] - assert record["additional_files"][0]['url'] == "elsevier/sample_consyn_record.xml" + assert record["documents"] + assert record["documents"][0]['url'] == "elsevier/sample_consyn_record.xml" def test_dois(record): diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py index 51f0fc77..a8f8f20b 100644 --- a/tests/unit/test_hindawi.py +++ b/tests/unit/test_hindawi.py @@ -96,9 +96,9 @@ def test_urls(record): def test_additional_files(record): """Test additional files.""" url = "http://downloads.hindawi.com/journals/aa/2010/194946.xml" - assert "additional_files" in record - assert record["additional_files"][0]["url"] == url - assert record["additional_files"][0]["access"] == "INSPIRE-HIDDEN" + assert "documents" in record + assert record["documents"][0]["url"] == url + assert record["documents"][0]["hidden"] def test_collections(record): diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py index c15ef727..526fdf40 100644 --- a/tests/unit/test_infn.py +++ b/tests/unit/test_infn.py @@ -83,8 +83,10 @@ def test_date_published(record): def test_files(record): """Test pdf files.""" - assert record["additional_files"][0][ - "url"] == "http://www.infn.it/thesis/PDF/getfile.php?filename=10136-Fedon-dottorato.pdf" + assert record["documents"][0]["url"] == ( + "http://www.infn.it/thesis/PDF/getfile.php" + "?filename=10136-Fedon-dottorato.pdf" + ) def test_thesis(record): diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index bb01766c..1e48fb8a 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -154,10 +154,10 @@ def test_files(record): """Test files dictionary.""" pdf_filename = "test_143_3_336.pdf" - assert "additional_files" in record - assert record["additional_files"][1]["access"] == 'INSPIRE-HIDDEN' - assert record["additional_files"][1]["type"] == 'Fulltext' - assert record["additional_files"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) + assert "documents" in record + assert record["documents"][1]["hidden"] + assert record["documents"][1]["fulltext"] + assert record["documents"][1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) @pytest.fixture @@ -196,13 +196,12 @@ def erratum_open_access_record(): def test_files_erratum_open_access_record(erratum_open_access_record): """Test files dict with open access journal with erratum article.""" pdf_filename = "test_143_3_336.pdf" - assert "additional_files" in erratum_open_access_record - assert erratum_open_access_record["additional_files"][ - 1]["access"] == 'INSPIRE-PUBLIC' - assert erratum_open_access_record[ - "additional_files"][1]["type"] == 'Erratum' - assert erratum_open_access_record["additional_files"][ - 1]["url"] == os.path.join(TEST_PDF_DIR, pdf_filename) + assert "documents" in erratum_open_access_record + assert not erratum_open_access_record["documents"][1]["hidden"] + assert not erratum_open_access_record["documents"][1]["fulltext"] + assert erratum_open_access_record["documents"][1]["url"] == ( + os.path.join(TEST_PDF_DIR, pdf_filename) + ) def test_not_published_record(): diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py index f3c0f355..16c52881 100644 --- a/tests/unit/test_magic.py +++ b/tests/unit/test_magic.py @@ -145,8 +145,8 @@ def test_url(record): def test_pdf_link(record): """Test pdf link(s)""" files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf" - assert 'additional_files' in record - assert record['additional_files'][1]['url'] == files + assert 'documents' in record + assert record['documents'][1]['url'] == files def test_no_author_no_date_no_url(): diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py index 8a185cef..895d2c2d 100644 --- a/tests/unit/test_mit.py +++ b/tests/unit/test_mit.py @@ -106,7 +106,10 @@ def test_date_published(record): def test_files(record): """Test pdf files.""" - assert record["additional_files"][0]["url"] == "http://dspace.mit.edu/bitstream/handle/1721.1/99287/922886248-MIT.pdf?sequence=1" + assert record["documents"][0]["url"] == ( + "http://dspace.mit.edu/bitstream/handle/1721.1/99287/" + "922886248-MIT.pdf?sequence=1" + ) def test_thesis(record): diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py index c272683f..2d9b2c58 100644 --- a/tests/unit/test_phenix.py +++ b/tests/unit/test_phenix.py @@ -91,6 +91,9 @@ def test_authors(record): def test_pdf_link(record): """Test pdf link(s)""" - files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf" - assert 'additional_files' in record - assert record['additional_files'][0]['url'] == files + files = ( + "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/" + "Guragain_Hari-DISSERTATION.pdf" + ) + assert 'documents' in record + assert record['documents'][0]['url'] == files diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py index d9395aa2..c3fe0e2c 100644 --- a/tests/unit/test_t2k.py +++ b/tests/unit/test_t2k.py @@ -113,8 +113,8 @@ def test_url(record): def test_pdf_link(record): """Test pdf link(s)""" files = "http://www.t2k.org/docs/thesis/001/IJT-THESIS" - assert 'additional_files' in record - assert record['additional_files'][0]['url'] == files + assert 'documents' in record + assert record['documents'][0]['url'] == files @pytest.fixture