From 428ae436af374e9b5b168eafff3230603ce71fa1 Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Tue, 21 Feb 2017 18:55:54 +0100 Subject: [PATCH] Add logic for new Sciencedirect pages (#2576) * Fixes #2574 Add logic for new Sciencedirect pages * Add changelog --- CHANGELOG.md | 12 +++++++++ .../logic/importer/fetcher/ScienceDirect.java | 25 ++++++++++++++----- .../importer/fetcher/ScienceDirectTest.java | 15 ++++++++++- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfe8ebc7708..68228bd83b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,18 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `# - The field `issue` is now always exported to the corresponding `issue` field in MS-Office XML. - We fixed an issue with repeated escaping of the %-sign when running the LaTeXCleanup more than once. [#2451](https://github.com/JabRef/jabref/issues/2451) - We fixed the import of MS-Office XML files, when the `month` field contained an invalid value. 
+ + + + - Sciencedirect/Elsevier fetcher is now able to scrape new HTML structure [#2576](https://github.com/JabRef/jabref/issues/2576) + + + + + + + + ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java index bb041776c9d..ce63112b1c0 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java @@ -36,7 +36,6 @@ public class ScienceDirect implements FulltextFetcher { @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - Optional pdfLink = Optional.empty(); // Try unique DOI first Optional doi = entry.getField(FieldName.DOI).flatMap(DOI::build); @@ -46,21 +45,35 @@ public Optional findFullText(BibEntry entry) throws IOException { try { String sciLink = getUrlByDoi(doi.get().getDOI()); + // scrape the web page not as mobile client! if (!sciLink.isEmpty()) { - // Retrieve PDF link - Document html = Jsoup.connect(sciLink).ignoreHttpErrors(true).get(); + Document html = Jsoup.connect(sciLink) + .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6") + .referrer("http://www.google.com") + .ignoreHttpErrors(true).get(); + + // Retrieve PDF link (old page) Element link = html.getElementById("pdfLink"); if (link != null) { - LOGGER.info("Fulltext PDF found @ ScienceDirect."); - pdfLink = Optional.of(new URL(link.attr("pdfurl"))); + LOGGER.info("Fulltext PDF found @ ScienceDirect (old page)."); + Optional pdfLink = Optional.of(new URL(link.attr("pdfurl"))); + return pdfLink; + } + // Retrieve PDF link (new page); Elements.attr() yields "" (never null) when nothing matches, so test for emptiness + String url = html.getElementsByClass("pdf-download-btn-link").attr("href"); + + if (!url.isEmpty()) { + LOGGER.info("Fulltext PDF found @ ScienceDirect (new page)."); + Optional pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url)); + return pdfLink; } } } 
catch(UnirestException e) { LOGGER.warn("ScienceDirect API request failed", e); } } - return pdfLink; + return Optional.empty(); } private String getUrlByDoi(String doi) throws UnirestException { diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java index 56d8e9a114f..b2202167687 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java @@ -38,7 +38,7 @@ public void doiNotPresent() throws IOException { } @Test - public void findByDOI() throws IOException { + public void findByDOIOldPage() throws IOException { // CI server is blocked Assume.assumeFalse(DevEnvironment.isCIServer()); @@ -50,6 +50,19 @@ public void findByDOI() throws IOException { ); } + @Test + public void findByDOINewPage() throws IOException { + // CI server is blocked + Assume.assumeFalse(DevEnvironment.isCIServer()); + + entry.setField("doi", "10.1016/j.aasri.2014.09.002"); + + Assert.assertEquals( + Optional.of(new URL("http://www.sciencedirect.com/science/article/pii/S2212671614001024/pdf?md5=4e2e9a369b4d5b3db5100aba599bef8b&pid=1-s2.0-S2212671614001024-main.pdf")), + finder.findFullText(entry) + ); + } + @Test public void notFoundByDOI() throws IOException { // CI server is blocked