Skip to content

Commit

Permalink
Fix APS and ScienceDirect fetcher (#6781)
Browse files Browse the repository at this point in the history
  • Loading branch information
koppor authored Aug 24, 2020
1 parent 1ed1704 commit 7a9e5d7
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 37 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We fixed an issue about duplicated group color indicators [#6175](https://github.com/JabRef/jabref/issues/6175)
- We fixed an issue where entries with the entry type Misc from an imported aux file would not be saved correctly to the bib file on disk [#6405](https://github.com/JabRef/jabref/issues/6405)
- We fixed an issue where percent sign ('%') was not formatted properly by the HTML formatter [#6753](https://github.com/JabRef/jabref/issues/6753)
- We fixed an issue with the [SAO/NASA Astrophysics Data System](https://docs.jabref.org/collect/import-using-online-bibliographic-database/ads) fetcher where `\textbackslash` appeared at the end of the abstract.
- We fixed an issue with the ScienceDirect fetcher where PDFs could not be downloaded. Fixes [#5860](https://github.com/JabRef/jabref/issues/5860)

### Removed

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ public void doPostCleanup(BibEntry entry) {
.ifPresent(abstractText -> entry.clearField(StandardField.ABSTRACT));

entry.getField(StandardField.ABSTRACT)
.map(abstractText -> abstractText.replace("<P />", "").trim())
.map(abstractText -> abstractText.replace("<P />", ""))
.map(abstractText -> abstractText.replace("\\textbackslash", ""))
.map(abstractText -> abstractText.trim())
.ifPresent(abstractText -> entry.setField(StandardField.ABSTRACT, abstractText));
// The fetcher adds some garbage (number of found entries etc before)
entry.setCommentsBeforeEntry("");
Expand Down
44 changes: 25 additions & 19 deletions src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,12 @@
import kong.unirest.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect.
* See <a href="https://dev.elsevier.com/">https://dev.elsevier.com/</a>
* FulltextFetcher implementation that attempts to find a PDF URL at ScienceDirect. See <a href="https://dev.elsevier.com/">https://dev.elsevier.com/</a>
*/
public class ScienceDirect implements FulltextFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(ScienceDirect.class);
Expand Down Expand Up @@ -62,22 +60,30 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
return Optional.of(new URL(link));
}

// Retrieve PDF link (old page)
// TODO: can possibly be removed
Element link = html.getElementById("pdfLink");

if (link != null) {
LOGGER.info("Fulltext PDF found @ ScienceDirect (old page).");
Optional<URL> pdfLink = Optional.of(new URL(link.attr("pdfurl")));
return pdfLink;
}
// Retrieve PDF link (new page)
// TODO: can possibly be removed
String url = html.getElementsByClass("pdf-download-btn-link").attr("href");

if (url != null) {
LOGGER.info("Fulltext PDF found @ ScienceDirect (new page).");
Optional<URL> pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url));
URL url = new URL(sciLink);
String protocol = url.getProtocol();
String authority = url.getAuthority();

Optional<String> fullLinkToPdf = html
.getElementsByAttributeValue("type", "application/json")
.stream()
.flatMap(element -> element.getElementsByTag("script").stream())
// get the text element
.map(element -> element.childNode(0))
.map(element -> element.toString())
.map(text -> new JSONObject(text))
.filter(json -> json.has("article"))
.map(json -> json.getJSONObject("article"))
.filter(json -> json.has("pdfDownload"))
.map(json -> json.getJSONObject("pdfDownload"))
.filter(json -> json.has("linkToPdf"))
.map(json -> json.getString("linkToPdf"))
.map(linkToPdf -> String.format("%s://%s%s", protocol, authority, linkToPdf))
.findAny();
if (fullLinkToPdf.isPresent()) {
LOGGER.info("Fulltext PDF found at ScienceDirect.");
// new URL may throw "MalformedURLException", thus using "isPresent()" above and ".get()"
Optional<URL> pdfLink = Optional.of(new URL(fullLinkToPdf.get()));
return pdfLink;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.util.Collections;
import java.util.List;
import java.util.stream.Stream;

import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.model.entry.BibEntry;
Expand All @@ -10,6 +11,9 @@
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.mockito.Answers;

import static org.junit.jupiter.api.Assertions.assertEquals;
Expand All @@ -30,39 +34,51 @@ public class GrobidCitationFetcherTest {
.withField(StandardField.PAGES, "245--259")
.withField(StandardField.VOLUME, "23")
.withField(StandardField.NUMBER, "4");

static String example2 = "Thomas, H. K. (2004). Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation). University of Colorado, Boulder.";
static BibEntry example2AsBibEntry = new BibEntry(BibEntry.DEFAULT_TYPE).withCiteKey("-1")
.withField(StandardField.AUTHOR, "Thomas, H")
.withField(StandardField.TITLE, "Training strategies for improving listeners' comprehension of foreign-accented speech (Doctoral dissertation)")
.withField(StandardField.YEAR, "2004")
.withField(StandardField.ADDRESS, "Boulder");

static String example3 = "Turk, J., Graham, P., & Verhulst, F. (2007). Child and adolescent psychiatry : A developmental approach. Oxford, England: Oxford University Press.";
static BibEntry example3AsBibEntry = new BibEntry(BibEntry.DEFAULT_TYPE).withCiteKey("-1")
.withField(StandardField.AUTHOR, "Turk, J and Graham, P and Verhulst, F")
.withField(StandardField.TITLE, "Child and adolescent psychiatry : A developmental approach")
.withField(StandardField.PUBLISHER, "Oxford University Press")
.withField(StandardField.YEAR, "2007")
.withField(StandardField.ADDRESS, "Oxford, England");

static String example4 = "Carr, I., & Kidner, R. (2003). Statutes and conventions on international trade law (4th ed.). London, England: Cavendish.";
static BibEntry example4AsBibEntry = new BibEntry(StandardEntryType.InBook).withCiteKey("-1")
.withField(StandardField.AUTHOR, "Carr, I and Kidner, R")
.withField(StandardField.BOOKTITLE, "Statutes and conventions on international trade law")
.withField(StandardField.PUBLISHER, "Cavendish")
.withField(StandardField.YEAR, "2003")
.withField(StandardField.ADDRESS, "London, England");
static String invalidInput1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx________________________________";
static String invalidInput2 = "¦@#¦@#¦@#¦@#¦@#¦@#¦@°#¦@¦°¦@°";

@Test
public void grobidPerformSearchCorrectResultTest() {
List<BibEntry> entries = grobidCitationFetcher.performSearch(example1);
assertEquals(List.of(example1AsBibEntry), entries);
entries = grobidCitationFetcher.performSearch(example2);
assertEquals(List.of(example2AsBibEntry), entries);
entries = grobidCitationFetcher.performSearch(example3);
assertEquals(List.of(example3AsBibEntry), entries);
entries = grobidCitationFetcher.performSearch(example4);
assertEquals(List.of(example4AsBibEntry), entries);
/**
 * Argument source for the parameterized "correct result" test.
 *
 * Each argument set consists of a display name, the expected {@link BibEntry},
 * and the plain-text citation that is passed to the fetcher.
 *
 * @return one argument set per example citation, in declaration order
 */
public static Stream<Arguments> provideExamplesForCorrectResultTest() {
    Stream.Builder<Arguments> examples = Stream.builder();
    examples.add(Arguments.of("example1", example1AsBibEntry, example1));
    examples.add(Arguments.of("example2", example2AsBibEntry, example2));
    examples.add(Arguments.of("example3", example3AsBibEntry, example3));
    examples.add(Arguments.of("example4", example4AsBibEntry, example4));
    return examples.build();
}

/**
 * Argument source for the parameterized "invalid data" test.
 *
 * @return one single-element argument set per string that is not a citation
 */
public static Stream<Arguments> provideInvalidInput() {
    // Filler characters only, no recognizable citation structure.
    Arguments fillerCharacters = Arguments.of("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx________________________________");
    // Symbols only.
    Arguments symbolNoise = Arguments.of("¦@#¦@#¦@#¦@#¦@#¦@#¦@°#¦@¦°¦@°");
    return Stream.of(fillerCharacters, symbolNoise);
}

/**
 * Checks that searching for a plain-text citation yields exactly the expected entry.
 *
 * @param testName         used only as the display name of the test invocation
 * @param expectedBibEntry the single entry the fetcher is expected to return
 * @param searchQuery      the plain-text citation handed to the fetcher
 */
@ParameterizedTest(name = "{0}")
@MethodSource("provideExamplesForCorrectResultTest")
public void grobidPerformSearchCorrectResultTest(String testName, BibEntry expectedBibEntry, String searchQuery) {
    List<BibEntry> expected = List.of(expectedBibEntry);
    assertEquals(expected, grobidCitationFetcher.performSearch(searchQuery));
}

@Test
Expand All @@ -77,11 +93,10 @@ public void grobidPerformSearchWithEmptyStringsTest() {
assertEquals(Collections.emptyList(), entries);
}

@Test
public void grobidPerformSearchWithInvalidDataTest() {
List<BibEntry> entries = grobidCitationFetcher.performSearch(invalidInput1);
assertEquals(Collections.emptyList(), entries);
entries = grobidCitationFetcher.performSearch(invalidInput2);
/**
 * Checks that a string which is not a citation produces an empty result list.
 *
 * @param invalidInput the text handed to the fetcher
 */
@ParameterizedTest
@MethodSource("provideInvalidInput")
public void grobidPerformSearchWithInvalidDataTest(String invalidInput) {
    assertEquals(Collections.emptyList(), grobidCitationFetcher.performSearch(invalidInput));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@ void findByDOINewPage() throws IOException {
);
}

/**
 * Checks that the full-text PDF link is found for the DOI reported in
 * https://github.com/JabRef/jabref/issues/5860.
 */
@Test
@DisabledOnCIServer("CI server is blocked")
void findByDoiWorksForBoneArticle() throws IOException {
    // The DOI is an example by a user taken from https://github.com/JabRef/jabref/issues/5860
    entry.setField(StandardField.DOI, "https://doi.org/10.1016/j.bone.2020.115226");

    URL expectedPdfLink = new URL("https://www.sciencedirect.com/science/article/pii/S8756328220300065/pdfft?md5=0ad75ff155637dec358e5c9fb8b90afd&pid=1-s2.0-S8756328220300065-main.pdf");
    Optional<URL> foundPdfLink = finder.findFullText(entry);

    assertEquals(Optional.of(expectedPdfLink), foundPdfLink);
}

@Test
@DisabledOnCIServer("CI server is blocked")
void notFoundByDOI() throws IOException {
Expand Down

0 comments on commit 7a9e5d7

Please sign in to comment.