From 7a9e5d723b78ebe542e5d8f78b856b8c1d1c6f36 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 24 Aug 2020 21:39:23 +0200 Subject: [PATCH] Fix APS and ScienceDirect fetcher (#6781) --- CHANGELOG.md | 2 + .../fetcher/AstrophysicsDataSystem.java | 4 +- .../logic/importer/fetcher/ScienceDirect.java | 44 ++++++++++------- .../fetcher/GrobidCitationFetcherTest.java | 49 ++++++++++++------- .../importer/fetcher/ScienceDirectTest.java | 12 +++++ 5 files changed, 74 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d68ce13aa8..814d5209d60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We fixed an issue about duplicated group color indicators [#6175](https://github.com/JabRef/jabref/issues/6175) - We fixed an issue where entries with the entry type Misc from an imported aux file would not be saved correctly to the bib file on disk [#6405](https://github.com/JabRef/jabref/issues/6405) - We fixed an issue where percent sign ('%') was not formatted properly by the HTML formatter [#6753](https://github.com/JabRef/jabref/issues/6753) +- We fixed an issue with the [SAO/NASA Astrophysics Data System](https://docs.jabref.org/collect/import-using-online-bibliographic-database/ads) fetcher where `\textbackslash` appeared at the end of the abstract. +- We fixed an issue with the Science Direct fetcher where PDFs could not be downloaded. 
Fixes [#5860](https://github.com/JabRef/jabref/issues/5860) ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/AstrophysicsDataSystem.java b/src/main/java/org/jabref/logic/importer/fetcher/AstrophysicsDataSystem.java index 33fdcabbea5..a1253589ce7 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/AstrophysicsDataSystem.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/AstrophysicsDataSystem.java @@ -156,7 +156,9 @@ public void doPostCleanup(BibEntry entry) { .ifPresent(abstractText -> entry.clearField(StandardField.ABSTRACT)); entry.getField(StandardField.ABSTRACT) - .map(abstractText -> abstractText.replace("

", "").trim()) + .map(abstractText -> abstractText.replace("

", "")) + .map(abstractText -> abstractText.replace("\\textbackslash", "")) + .map(abstractText -> abstractText.trim()) .ifPresent(abstractText -> entry.setField(StandardField.ABSTRACT, abstractText)); // The fetcher adds some garbage (number of found entries etc before) entry.setCommentsBeforeEntry(""); diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java index 86e180de257..9db80d2b600 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java @@ -20,14 +20,12 @@ import kong.unirest.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. - * See https://dev.elsevier.com/ + * FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. 
See https://dev.elsevier.com/ */ public class ScienceDirect implements FulltextFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(ScienceDirect.class); @@ -62,22 +60,30 @@ public Optional findFullText(BibEntry entry) throws IOException { return Optional.of(new URL(link)); } - // Retrieve PDF link (old page) - // TODO: can possibly be removed - Element link = html.getElementById("pdfLink"); - - if (link != null) { - LOGGER.info("Fulltext PDF found @ ScienceDirect (old page)."); - Optional pdfLink = Optional.of(new URL(link.attr("pdfurl"))); - return pdfLink; - } - // Retrieve PDF link (new page) - // TODO: can possibly be removed - String url = html.getElementsByClass("pdf-download-btn-link").attr("href"); - - if (url != null) { - LOGGER.info("Fulltext PDF found @ ScienceDirect (new page)."); - Optional pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url)); + URL url = new URL(sciLink); + String protocol = url.getProtocol(); + String authority = url.getAuthority(); + + Optional fullLinkToPdf = html + .getElementsByAttributeValue("type", "application/json") + .stream() + .flatMap(element -> element.getElementsByTag("script").stream()) + // get the text element + .map(element -> element.childNode(0)) + .map(element -> element.toString()) + .map(text -> new JSONObject(text)) + .filter(json -> json.has("article")) + .map(json -> json.getJSONObject("article")) + .filter(json -> json.has("pdfDownload")) + .map(json -> json.getJSONObject("pdfDownload")) + .filter(json -> json.has("linkToPdf")) + .map(json -> json.getString("linkToPdf")) + .map(linkToPdf -> String.format("%s://%s%s", protocol, authority, linkToPdf)) + .findAny(); + if (fullLinkToPdf.isPresent()) { + LOGGER.info("Fulltext PDF found at ScienceDirect."); + // new URL may throw "MalformedURLException", thus using "isPresent()" above and ".get()" + Optional pdfLink = Optional.of(new URL(fullLinkToPdf.get())); return pdfLink; } } diff --git 
a/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java index 2235aa9c4ff..45a981d52fe 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GrobidCitationFetcherTest.java @@ -2,6 +2,7 @@ import java.util.Collections; import java.util.List; +import java.util.stream.Stream; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.model.entry.BibEntry; @@ -10,6 +11,9 @@ import org.jabref.testutils.category.FetcherTest; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.mockito.Answers; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -30,12 +34,14 @@ public class GrobidCitationFetcherTest { .withField(StandardField.PAGES, "245--259") .withField(StandardField.VOLUME, "23") .withField(StandardField.NUMBER, "4"); + static String example2 = "Thomas, H. K. (2004). Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation). University of Colorado, Boulder."; static BibEntry example2AsBibEntry = new BibEntry(BibEntry.DEFAULT_TYPE).withCiteKey("-1") .withField(StandardField.AUTHOR, "Thomas, H") .withField(StandardField.TITLE, "Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation)") .withField(StandardField.YEAR, "2004") .withField(StandardField.ADDRESS, "Boulder"); + static String example3 = "Turk, J., Graham, P., & Verhulst, F. (2007). Child and adolescent psychiatry : A developmental approach. 
Oxford, England: Oxford University Press."; static BibEntry example3AsBibEntry = new BibEntry(BibEntry.DEFAULT_TYPE).withCiteKey("-1") .withField(StandardField.AUTHOR, "Turk, J and Graham, P and Verhulst, F") @@ -43,6 +49,7 @@ public class GrobidCitationFetcherTest { .withField(StandardField.PUBLISHER, "Oxford University Press") .withField(StandardField.YEAR, "2007") .withField(StandardField.ADDRESS, "Oxford, England"); + static String example4 = "Carr, I., & Kidner, R. (2003). Statutes and conventions on international trade law (4th ed.). London, England: Cavendish."; static BibEntry example4AsBibEntry = new BibEntry(StandardEntryType.InBook).withCiteKey("-1") .withField(StandardField.AUTHOR, "Carr, I and Kidner, R") @@ -50,19 +57,28 @@ public class GrobidCitationFetcherTest { .withField(StandardField.PUBLISHER, "Cavendish") .withField(StandardField.YEAR, "2003") .withField(StandardField.ADDRESS, "London, England"); - static String invalidInput1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx________________________________"; - static String invalidInput2 = "¦@#¦@#¦@#¦@#¦@#¦@#¦@°#¦@¦°¦@°"; - @Test - public void grobidPerformSearchCorrectResultTest() { - List entries = grobidCitationFetcher.performSearch(example1); - assertEquals(List.of(example1AsBibEntry), entries); - entries = grobidCitationFetcher.performSearch(example2); - assertEquals(List.of(example2AsBibEntry), entries); - entries = grobidCitationFetcher.performSearch(example3); - assertEquals(List.of(example3AsBibEntry), entries); - entries = grobidCitationFetcher.performSearch(example4); - assertEquals(List.of(example4AsBibEntry), entries); + public static Stream provideExamplesForCorrectResultTest() { + return Stream.of( + Arguments.of("example1", example1AsBibEntry, example1), + Arguments.of("example2", example2AsBibEntry, example2), + Arguments.of("example3", example3AsBibEntry, example3), + Arguments.of("example4", example4AsBibEntry, example4) + ); + } + + public static Stream provideInvalidInput() { + 
return Stream.of( + Arguments.of("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx________________________________"), + Arguments.of("¦@#¦@#¦@#¦@#¦@#¦@#¦@°#¦@¦°¦@°") + ); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("provideExamplesForCorrectResultTest") + public void grobidPerformSearchCorrectResultTest(String testName, BibEntry expectedBibEntry, String searchQuery) { + List entries = grobidCitationFetcher.performSearch(searchQuery); + assertEquals(List.of(expectedBibEntry), entries); } @Test @@ -77,11 +93,10 @@ public void grobidPerformSearchWithEmptyStringsTest() { assertEquals(Collections.emptyList(), entries); } - @Test - public void grobidPerformSearchWithInvalidDataTest() { - List entries = grobidCitationFetcher.performSearch(invalidInput1); - assertEquals(Collections.emptyList(), entries); - entries = grobidCitationFetcher.performSearch(invalidInput2); + @ParameterizedTest + @MethodSource("provideInvalidInput") + public void grobidPerformSearchWithInvalidDataTest(String invalidInput) { + List entries = grobidCitationFetcher.performSearch(invalidInput); assertEquals(Collections.emptyList(), entries); } diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java index b868a5b715c..95bcf8b6358 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java @@ -48,6 +48,18 @@ void findByDOINewPage() throws IOException { ); } + @Test + @DisabledOnCIServer("CI server is blocked") + void findByDoiWorksForBoneArticle() throws IOException { + // The DOI is an example by a user taken from https://github.com/JabRef/jabref/issues/5860 + entry.setField(StandardField.DOI, "https://doi.org/10.1016/j.bone.2020.115226"); + + assertEquals( + Optional.of(new 
URL("https://www.sciencedirect.com/science/article/pii/S8756328220300065/pdfft?md5=0ad75ff155637dec358e5c9fb8b90afd&pid=1-s2.0-S8756328220300065-main.pdf")), + finder.findFullText(entry) + ); + } + @Test @DisabledOnCIServer("CI server is blocked") void notFoundByDOI() throws IOException {