Add logic for new Sciencedirect pages (#2576)

* Fixes #2574: Add logic for new ScienceDirect pages
* Add changelog
JabRef · Feb 21, 2017 · 428ae43 · 428ae43
1 parent 41d1551
commit 428ae43
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -43,6 +43,18 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
  - The field `issue` is now always exported to the corresponding `issue` field in MS-Office XML.
  - We fixed an issue with repeated escaping of the %-sign when running the LaTeXCleanup more than once. [#2451](https://github.com/JabRef/jabref/issues/2451)
  - We fixed the import of MS-Office XML files, when the `month` field contained an invalid value.
+
+
+
+ - The ScienceDirect/Elsevier fetcher is now able to scrape the new HTML structure [#2576](https://github.com/JabRef/jabref/issues/2576)
+
+
+
+
+
+
+
+
 ### Removed
 
 

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
@@ -36,7 +36,6 @@ public class ScienceDirect implements FulltextFetcher {
     @Override
     public Optional<URL> findFullText(BibEntry entry) throws IOException {
         Objects.requireNonNull(entry);
-        Optional<URL> pdfLink = Optional.empty();
 
         // Try unique DOI first
         Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::build);
@@ -46,21 +45,35 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
             try {
                 String sciLink = getUrlByDoi(doi.get().getDOI());
 
+                // Scrape the web page as a desktop browser, not as a mobile client!
                 if (!sciLink.isEmpty()) {
-                    // Retrieve PDF link
-                    Document html = Jsoup.connect(sciLink).ignoreHttpErrors(true).get();
+                    Document html = Jsoup.connect(sciLink)
+                            .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
+                            .referrer("http://www.google.com")
+                            .ignoreHttpErrors(true).get();
+
+                    // Retrieve PDF link (old page)
                     Element link = html.getElementById("pdfLink");
 
                     if (link != null) {
-                        LOGGER.info("Fulltext PDF found @ ScienceDirect.");
-                        pdfLink = Optional.of(new URL(link.attr("pdfurl")));
+                        LOGGER.info("Fulltext PDF found @ ScienceDirect (old page).");
+                        Optional<URL> pdfLink = Optional.of(new URL(link.attr("pdfurl")));
+                        return pdfLink;
+                    }
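+                    // Retrieve PDF link (new page). NOTE(review): jsoup's Elements.attr is documented to return "" rather than null when no element matches, so the null check below may never fail — verify and consider checking isEmpty() instead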
+                    String url = html.getElementsByClass("pdf-download-btn-link").attr("href");
+
+                    if (url != null) {
+                        LOGGER.info("Fulltext PDF found @ ScienceDirect (new page).");
+                        Optional<URL> pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url));
+                        return pdfLink;
                     }
                 }
             } catch(UnirestException e) {
                 LOGGER.warn("ScienceDirect API request failed", e);
             }
         }
-        return pdfLink;
+        return Optional.empty();
     }
 
     private String getUrlByDoi(String doi) throws UnirestException {

diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java
@@ -38,7 +38,7 @@ public void doiNotPresent() throws IOException {
     }
 
     @Test
-    public void findByDOI() throws IOException {
+    public void findByDOIOldPage() throws IOException {
         // CI server is blocked
         Assume.assumeFalse(DevEnvironment.isCIServer());
 
@@ -50,6 +50,19 @@ public void findByDOI() throws IOException {
         );
     }
 
+    @Test
+    public void findByDOINewPage() throws IOException {
+        // The CI server is blocked, so skip this test when running there
+        Assume.assumeFalse(DevEnvironment.isCIServer());
+
+        entry.setField("doi", "10.1016/j.aasri.2014.09.002"); // NOTE(review): presumably a DOI whose article is served with the new page layout, per the test name — confirm
+
+        Assert.assertEquals(
+                Optional.of(new URL("http://www.sciencedirect.com/science/article/pii/S2212671614001024/pdf?md5=4e2e9a369b4d5b3db5100aba599bef8b&pid=1-s2.0-S2212671614001024-main.pdf")), // NOTE(review): expected URL embeds an md5 query token; confirm it is stable across requests
+                finder.findFullText(entry)
+        );
+    }
+
     @Test
     public void notFoundByDOI() throws IOException {
         // CI server is blocked