Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor fetcher improvements #6126

Merged
merged 3 commits into from
Mar 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions src/main/java/org/jabref/logic/importer/fetcher/ACS.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public class ACS implements FulltextFetcher {

/**
* Tries to find a fulltext URL for a given BibTex entry.
*
* <p>
* Currently only uses the DOI if found.
*
* @param entry The Bibtex entry
Expand All @@ -37,23 +37,24 @@ public class ACS implements FulltextFetcher {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// DOI search
Optional<DOI> doi = entry.getField(StandardField.DOI).flatMap(DOI::parse);

if (doi.isPresent()) {
String source = String.format(SOURCE, doi.get().getDOI());
// Retrieve PDF link
Document html = Jsoup.connect(source).ignoreHttpErrors(true).get();
Element link = html.select("a.button_primary").first();
if (!doi.isPresent()) {
return Optional.empty();
}

String source = String.format(SOURCE, doi.get().getDOI());
// Retrieve PDF link
Document html = Jsoup.connect(source).ignoreHttpErrors(true).get();
Element link = html.select("a.button_primary").first();

if (link != null) {
LOGGER.info("Fulltext PDF found @ ACS.");
pdfLink = Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
}
if (link != null) {
LOGGER.info("Fulltext PDF found @ ACS.");
return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/")));
}
return pdfLink;
return Optional.empty();
}

@Override
Expand Down
4 changes: 1 addition & 3 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,8 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();
pdfUrl.ifPresent(url -> LOGGER.info("Fulltext PDF found @ arXiv."));

if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
}
return pdfUrl;
} catch (FetcherException e) {
LOGGER.warn("arXiv API request failed", e);
Expand Down
23 changes: 17 additions & 6 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

/**
* FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
*
* <p>
* Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching
*/
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
Expand All @@ -58,11 +58,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
Objects.requireNonNull(entry);
Optional<URL> pdfLink = Optional.empty();

// Search in title
if (!entry.hasField(StandardField.TITLE)) {
return pdfLink;
return Optional.empty();
}

try {
Expand All @@ -74,12 +73,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
// as_occt field to search in
uriBuilder.addParameter("as_occt", "title");

pdfLink = search(uriBuilder.toString());
return search(uriBuilder.toString());
} catch (URISyntaxException e) {
throw new FetcherException("Building URI failed.", e);
}

return pdfLink;
}

@Override
Expand All @@ -91,6 +88,11 @@ private Optional<URL> search(String url) throws IOException {
Optional<URL> pdfLink = Optional.empty();

Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get();

if (needsCaptcha(doc.body().html())) {
LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching.");
return Optional.empty();
}
// Check results for PDF link
// TODO: link always on first result or none?
for (int i = 0; i < NUM_RESULTS; i++) {
Expand All @@ -111,6 +113,10 @@ private Optional<URL> search(String url) throws IOException {
return pdfLink;
}

/**
 * Checks whether the given page markup contains the element id that the callers
 * treat as the sign of a Google Scholar captcha (traffic limitation) page.
 *
 * @param body raw HTML text to inspect
 * @return {@code true} if the marker {@code id="gs_captcha_ccl"} occurs anywhere in {@code body}
 */
private boolean needsCaptcha(String body) {
    final String captchaMarker = "id=\"gs_captcha_ccl\"";
    return body.indexOf(captchaMarker) >= 0;
}

@Override
public String getName() {
return "Google Scholar";
Expand Down Expand Up @@ -158,6 +164,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
String content = new URLDownload(queryURL).asString();

if (needsCaptcha(content)) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
}

Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
while (matcher.find()) {
String citationsPageURL = matcher.group().replace("&amp;", "&");
Expand Down
11 changes: 10 additions & 1 deletion src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

@FetcherTest
class ACSTest {

private ACS finder;
private BibEntry entry;

Expand Down Expand Up @@ -44,4 +43,14 @@ void notFoundByDOI() throws IOException {

assertEquals(Optional.empty(), finder.findFullText(entry));
}

/**
 * Expects the ACS fetcher to return an empty result for the shared fixture entry.
 * NOTE(review): presumably the fixture entry has no DOI field at this point (set-up is
 * outside this view) — confirm against the test's set-up method.
 */
@Test
void entityWithoutDoi() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

/**
 * Expects the ACS fetcher to report {@code TrustLevel.PUBLISHER} as its trust level.
 */
@Test
void trustLevel() {
assertEquals(TrustLevel.PUBLISHER, finder.getTrustLevel());
}
}
11 changes: 10 additions & 1 deletion src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

@FetcherTest
class ArXivTest {

private ArXiv finder;
private BibEntry entry;
private BibEntry sliceTheoremPaper;
Expand Down Expand Up @@ -121,6 +120,16 @@ void findFullTextByDOINotAvailableInCatalog() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

/**
 * Expects the arXiv fetcher to return an empty result for the shared fixture entry.
 * NOTE(review): presumably the fixture entry has no DOI (or other identifying field) set
 * here; the set-up is outside this view — confirm.
 */
@Test
void findFullTextEntityWithoutDoi() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

/**
 * Expects the arXiv fetcher to report {@code TrustLevel.PREPRINT} as its trust level.
 */
@Test
void findFullTextTrustLevel() {
assertEquals(TrustLevel.PREPRINT, finder.getTrustLevel());
}

@Test
void searchEntryByPartOfTitle() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
Expand Down