Skip to content

Commit

Permalink
Add logic for new Sciencedirect pages (#2576)
Browse files Browse the repository at this point in the history
* Fixes #2574 Add logic for new Sciencedirect pages
* Add changelog
  • Loading branch information
stefan-kolb authored Feb 21, 2017
1 parent 41d1551 commit 428ae43
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 7 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The field `issue` is now always exported to the corresponding `issue` field in MS-Office XML.
- We fixed an issue with repeated escaping of the %-sign when running the LaTeXCleanup more than once. [#2451](https://github.com/JabRef/jabref/issues/2451)
- We fixed the import of MS-Office XML files, when the `month` field contained an invalid value.



- The ScienceDirect/Elsevier fetcher is now able to scrape the new HTML structure. [#2576](https://github.com/JabRef/jabref/issues/2576)








### Removed


Expand Down
25 changes: 19 additions & 6 deletions src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ public class ScienceDirect implements FulltextFetcher {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// Try unique DOI first
Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::build);
Expand All @@ -46,21 +45,35 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
try {
String sciLink = getUrlByDoi(doi.get().getDOI());

// scrape the web page not as mobile client!
if (!sciLink.isEmpty()) {
// Retrieve PDF link
Document html = Jsoup.connect(sciLink).ignoreHttpErrors(true).get();
Document html = Jsoup.connect(sciLink)
.userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
.referrer("http://www.google.com")
.ignoreHttpErrors(true).get();

// Retrieve PDF link (old page)
Element link = html.getElementById("pdfLink");

if (link != null) {
LOGGER.info("Fulltext PDF found @ ScienceDirect.");
pdfLink = Optional.of(new URL(link.attr("pdfurl")));
LOGGER.info("Fulltext PDF found @ ScienceDirect (old page).");
Optional<URL> pdfLink = Optional.of(new URL(link.attr("pdfurl")));
return pdfLink;
}
// Retrieve PDF link (new page)
String url = html.getElementsByClass("pdf-download-btn-link").attr("href");

if (url != null) {
LOGGER.info("Fulltext PDF found @ ScienceDirect (new page).");
Optional<URL> pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url));
return pdfLink;
}
}
} catch(UnirestException e) {
LOGGER.warn("ScienceDirect API request failed", e);
}
}
return pdfLink;
return Optional.empty();
}

private String getUrlByDoi(String doi) throws UnirestException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void doiNotPresent() throws IOException {
}

@Test
public void findByDOI() throws IOException {
public void findByDOIOldPage() throws IOException {
// CI server is blocked
Assume.assumeFalse(DevEnvironment.isCIServer());

Expand All @@ -50,6 +50,19 @@ public void findByDOI() throws IOException {
);
}

/**
 * Sets the entry's {@code doi} field to {@code 10.1016/j.aasri.2014.09.002} and expects
 * {@code finder.findFullText(entry)} to return the ScienceDirect PDF URL asserted below.
 * The assumption on {@code DevEnvironment.isCIServer()} skips the test on the CI server.
 */
@Test
public void findByDOINewPage() throws IOException {
// CI server is blocked
Assume.assumeFalse(DevEnvironment.isCIServer());

entry.setField("doi", "10.1016/j.aasri.2014.09.002");

// NOTE(review): the expected URL embeds an md5 query token that presumably comes from the
// live page; confirm it is stable, otherwise this assertion will break when it changes.
Assert.assertEquals(
Optional.of(new URL("http://www.sciencedirect.com/science/article/pii/S2212671614001024/pdf?md5=4e2e9a369b4d5b3db5100aba599bef8b&pid=1-s2.0-S2212671614001024-main.pdf")),
finder.findFullText(entry)
);
}

@Test
public void notFoundByDOI() throws IOException {
// CI server is blocked
Expand Down

0 comments on commit 428ae43

Please sign in to comment.