Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Google Scholar fetcher for downloading a single entry #7075

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.jabref.gui.dialogs;

import java.util.concurrent.CountDownLatch;

import javafx.application.Platform;
import javafx.scene.control.ButtonType;
import javafx.scene.web.WebView;

import org.jabref.gui.util.BaseDialog;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;

import org.jsoup.helper.W3CDom;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

public class CaptchaSolverDialog extends BaseDialog<String> implements org.jabref.logic.importer.fetcher.CaptchaSolver {

public static final Logger LOGGER = LoggerFactory.getLogger(CaptchaSolverDialog.class);

private WebView webView;

/**
 * Creates the dialog with an embedded browser as its only content and a visible
 * "Close" button. The browser has JavaScript enabled and identifies itself with
 * the same user agent string as {@link URLDownload}.
 */
public CaptchaSolverDialog() {
    super();

    // Embedded browser in which the page is displayed
    this.webView = new WebView();
    this.webView.getEngine().setJavaScriptEnabled(true);
    this.webView.getEngine().setUserAgent(URLDownload.USER_AGENT);
    getDialogPane().setContent(this.webView);

    // Dialog chrome: a title and a single, explicitly visible close button
    setTitle(Localization.lang("Captcha Solver"));
    getDialogPane().getButtonTypes().add(ButtonType.CLOSE);
    getDialogPane().lookupButton(ButtonType.CLOSE).setVisible(true);
}

@Override
public String solve(String queryURL) {
// slim implementation of https://news.kynosarges.org/2014/05/01/simulating-platform-runandwait/
final CountDownLatch doneLatch = new CountDownLatch(1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to listen for the web engine's ready event; see the preview tab viewer, where we add the highlighting JS stuff.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

   previewView.getEngine().getLoadWorker().stateProperty().addListener((observable, oldValue, newValue) -> {

            if (newValue != Worker.State.SUCCEEDED) {
                return;
            }

See https://openjfx.io/javadoc/11/javafx.web/javafx/scene/web/WebEngine.html

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to listen for the web engine's ready event; see the preview tab viewer, where we add the highlighting JS stuff.

Does this happen synchronously? The interface for the Captcha solver is designed in a synchronous way. Otherwise, all fetchers need to be changed.

I'll be away anyway for the next days. Thus, you are free to experiment 😅

Platform.runLater(() -> {
webView.getEngine().load(queryURL);
// For the quick implementation, we ignore the result
// Later, at "webView", we directly extract it from the web view
this.showAndWait();
doneLatch.countDown();
});
try {
doneLatch.await();
Document document = webView.getEngine().getDocument();
return W3CDom.asString(document, null);
} catch (InterruptedException e) {
LOGGER.error("Issues with the UI", e);
}
return "";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@

import org.jabref.gui.DialogService;
import org.jabref.gui.StateManager;
import org.jabref.gui.dialogs.CaptchaSolverDialog;
import org.jabref.gui.importer.ImportEntriesDialog;
import org.jabref.gui.util.BackgroundTask;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.WebFetchers;
import org.jabref.logic.importer.fetcher.GoogleScholar;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.strings.StringUtil;
import org.jabref.preferences.PreferencesService;
Expand All @@ -43,6 +45,7 @@ public WebSearchPaneViewModel(PreferencesService preferencesService, DialogServi
this.dialogService = dialogService;
this.stateManager = stateManager;

WebFetchers.setCaptchaSolver(new CaptchaSolverDialog());
SortedSet<SearchBasedFetcher> allFetchers = WebFetchers.getSearchBasedFetchers(preferencesService.getImportFormatPreferences());
fetchers.setAll(allFetchers);

Expand Down Expand Up @@ -107,6 +110,9 @@ public void search() {
task = BackgroundTask.wrap(() -> new ParserResult(activeFetcher.performSearch(getQuery().trim())))
.withInitialMessage(Localization.lang("Processing %0", getQuery().trim()));
task.onFailure(dialogService::showErrorDialogAndWait);
if (activeFetcher instanceof GoogleScholar) {
task.showToUser(true);
}

ImportEntriesDialog dialog = new ImportEntriesDialog(stateManager.getActiveDatabase().get(), task);
dialog.setTitle(activeFetcher.getName());
Expand Down
13 changes: 11 additions & 2 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.jabref.logic.importer.fetcher.ApsFetcher;
import org.jabref.logic.importer.fetcher.ArXiv;
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
import org.jabref.logic.importer.fetcher.CaptchaSolver;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
Expand All @@ -31,6 +32,7 @@
import org.jabref.logic.importer.fetcher.MathSciNet;
import org.jabref.logic.importer.fetcher.MedlineFetcher;
import org.jabref.logic.importer.fetcher.Medra;
import org.jabref.logic.importer.fetcher.NoneCaptchaSolver;
import org.jabref.logic.importer.fetcher.OpenAccessDoi;
import org.jabref.logic.importer.fetcher.RfcFetcher;
import org.jabref.logic.importer.fetcher.ScienceDirect;
Expand All @@ -51,6 +53,13 @@ public class WebFetchers {
// Private and empty: this class cannot be instantiated from outside.
private WebFetchers() {
}

// Default CaptchaSolver is the useless one (which just does not throw an exception)
private static CaptchaSolver captchaSolver = new NoneCaptchaSolver();

/**
 * Replaces the captcha solver stored in this class.
 * <p>
 * The stored solver is passed to the {@link GoogleScholar} instances that this class
 * creates, so only fetchers created after this call receive the new solver.
 *
 * @param captchaSolver the solver to store
 */
public static void setCaptchaSolver(CaptchaSolver captchaSolver) {
WebFetchers.captchaSolver = captchaSolver;
}

public static Optional<IdBasedFetcher> getIdBasedFetcherForField(Field field, ImportFormatPreferences preferences) {
IdBasedFetcher fetcher;

Expand Down Expand Up @@ -96,7 +105,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new ZbMATH(importFormatPreferences));
// see https://github.com/JabRef/jabref/issues/5804
// set.add(new ACMPortalFetcher(importFormatPreferences));
set.add(new GoogleScholar(importFormatPreferences));
set.add(new GoogleScholar(importFormatPreferences, captchaSolver));
set.add(new DBLPFetcher(importFormatPreferences));
set.add(new SpringerFetcher());
set.add(new CrossRef());
Expand Down Expand Up @@ -170,7 +179,7 @@ public static Set<FulltextFetcher> getFullTextFetchers(ImportFormatPreferences i
fetchers.add(new ApsFetcher());
// Meta search
fetchers.add(new JstorFetcher(importFormatPreferences));
fetchers.add(new GoogleScholar(importFormatPreferences));
fetchers.add(new GoogleScholar(importFormatPreferences, captchaSolver));
fetchers.add(new OpenAccessDoi());

return fetchers;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
public interface CaptchaSolver {

/**
* Instructs the user to solve the captcha presented at the given URL.
* <p>
* The result is handed back as the return value, i.e., callers use this method synchronously.
*
* @param queryURL the URL to query
* @return html content after solving the captcha
*/
String solve(String queryURL);
}
131 changes: 64 additions & 67 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.jabref.model.util.DummyFileUpdateMonitor;

import com.sun.star.sheet.XSolver;
import kong.unirest.Unirest;
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand Down Expand Up @@ -88,11 +89,6 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
}
}

/**
 * Reports the trust level of this fetcher.
 *
 * @return always {@link TrustLevel#META_SEARCH}
 */
@Override
public TrustLevel getTrustLevel() {
return TrustLevel.META_SEARCH;
}

private Optional<URL> search(String url) throws IOException {
Optional<URL> pdfLink = Optional.empty();

Expand Down Expand Up @@ -122,8 +118,9 @@ private Optional<URL> search(String url) throws IOException {
return pdfLink;
}

private boolean needsCaptcha(String body) {
return body.contains("id=\"gs_captcha_ccl\"");
@Override
public TrustLevel getTrustLevel() {
return TrustLevel.META_SEARCH;
}

@Override
Expand All @@ -136,9 +133,62 @@ public Optional<HelpFile> getHelpPage() {
return Optional.of(HelpFile.FETCHER_GOOGLE_SCHOLAR);
}

/**
 * Queries Google Scholar for one page of results.
 * <p>
 * The query URL is built from the complex query (default phrases, authors, title phrases and
 * the optional year restrictions). If the download fails with an HTTP status that indicates a
 * captcha challenge, the configured {@code captchaSolver} is asked for the page content and the
 * entries are extracted from that content instead.
 *
 * @param complexSearchQuery the query to run
 * @param pageNumber         zero-based page index; translated into the {@code start} URL parameter
 * @return the page holding the entries that were found
 * @throws FetcherException if the base URL is malformed, the download fails for a reason other
 *                          than a captcha challenge, or the captcha could not be solved
 */
@Override
public Page<BibEntry> performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
    LOGGER.debug("Using query {}", complexSearchQuery);
    List<BibEntry> foundEntries = new ArrayList<>(getPageSize());

    String complexQueryString = constructComplexQueryString(complexSearchQuery);
    final URIBuilder uriBuilder;
    try {
        uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
    } catch (URISyntaxException e) {
        throw new FetcherException("Error while fetching from " + getName(), e);
    }
    uriBuilder.addParameter("hl", "en");
    uriBuilder.addParameter("btnG", "Search");
    uriBuilder.addParameter("q", complexQueryString);
    uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize()));
    uriBuilder.addParameter("num", String.valueOf(getPageSize()));
    complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString()));
    complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString()));
    complexSearchQuery.getSingleYear().ifPresent(year -> {
        uriBuilder.addParameter("as_ylo", year.toString());
        uriBuilder.addParameter("as_yhi", year.toString());
    });

    String queryURL = uriBuilder.toString();
    LOGGER.debug("Using URL {}", queryURL);
    try {
        addHitsFromQuery(foundEntries, queryURL);
    } catch (IOException e) {
        LOGGER.info("IOException for URL {}", queryURL);
        if (!isCaptchaChallenge(e)) {
            throw new FetcherException("Error while fetching from " + getName(), e);
        }
        LOGGER.debug("Captcha found. Calling the CaptchaSolver");
        String content = captchaSolver.solve(queryURL);
        LOGGER.debug("Returned result {}", content);
        // Only give up if the solver did not produce a usable result page;
        // otherwise the extracted entries are returned below.
        if (!addHitsFromSolvedCaptcha(content, foundEntries)) {
            throw new FetcherException("Fetching from Google Scholar failed.",
                    Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
        }
    }
    return new Page<>(complexQueryString, pageNumber, foundEntries);
}

/**
 * Decides whether a failed download indicates a captcha challenge.
 * <p>
 * If there are too many requests from the same IP address, Google answers with a 403, 429, or 503 and redirects to a captcha challenge.
 * Example URL: https://www.google.com/sorry/index?continue=https://scholar.google.ch/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3D%2522in%2522%2B%2522and%2522%2B%2522Process%2522%2B%2522Models%2522%2B%2522Issues%2522%2B%2522Interoperability%2522%2B%2522Detecting%2522%2B%2522Correctness%2522%2B%2522BPMN%2522%2B%25222.0%2522%2Ballintitle%253A%26start%3D0%26num%3D20&hl=en&q=EgTZGO7HGOuK2P4FIhkA8aeDSwDHMafs3bst5vlLM-Sk4TtpMrOtMgFy
 * The caught IOException looks for example like this:
 * java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
 *
 * @param e the exception raised while downloading
 * @return true if the message names one of the status codes 403, 429, or 503
 */
private static boolean isCaptchaChallenge(IOException e) {
    String message = e.getMessage();
    if (message == null) {
        // An IOException may carry no message at all; that is not a captcha indication
        return false;
    }
    return message.contains("Server returned HTTP response code: 403 for URL")
            || message.contains("Server returned HTTP response code: 429 for URL")
            || message.contains("Server returned HTTP response code: 503 for URL");
}

/**
 * Extracts the entries from the page content returned by the captcha solver.
 *
 * @param content   the content returned by the solver; may be empty if nothing was solved
 * @param entryList the list the extracted entries are added to
 * @return true if the content was a usable result page and extraction completed, false otherwise
 */
private boolean addHitsFromSolvedCaptcha(String content, List<BibEntry> entryList) {
    if (content == null || content.isEmpty() || needsCaptcha(content)) {
        // Nothing was solved (e.g. no-op solver) or the page still shows the captcha
        return false;
    }
    try {
        extractEntriesFromContent(content, entryList);
        return true;
    } catch (IOException ioException) {
        LOGGER.error("Still failing at Google Scholar", ioException);
        return false;
    }
}

private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery) {
List<String> searchTerms = new ArrayList<>();
searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases());
List<String> searchTerms = new ArrayList<>(complexSearchQuery.getDefaultFieldPhrases());
complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("author:" + author));
if (!complexSearchQuery.getTitlePhrases().isEmpty()) {
searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases()));
Expand All @@ -153,13 +203,7 @@ private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws
URLDownload urlDownload = new URLDownload(queryURL);
obtainAndModifyCookie(urlDownload);

// We need JSOUP directly to read the content when 429 is returned

String content;
try {
content = urlDownload.asString();
}

String content = urlDownload.asString();
if (needsCaptcha(content)) {
throw new FetcherException("Fetching from Google Scholar failed: Captcha hit at " + queryURL + ".",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
Expand Down Expand Up @@ -227,56 +271,6 @@ private void obtainAndModifyCookie(URLDownload downloader) throws FetcherExcepti
}
}

/**
 * Queries Google Scholar for one page of results.
 * <p>
 * Builds the query URL from the complex query, downloads the hits and, if the download fails
 * with an HTTP 403, 429, or 503, hands the URL to the captcha solver.
 *
 * @param complexSearchQuery the query to run
 * @param pageNumber         page index; multiplied by the page size to form the {@code start} URL parameter
 * @return the page holding the entries that were found
 * @throws FetcherException if the base URL is malformed or the download fails
 */
@Override
public Page<BibEntry> performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
LOGGER.debug("Using query {}", complexSearchQuery);
List<BibEntry> foundEntries = new ArrayList<>(getPageSize());

String complexQueryString = constructComplexQueryString(complexSearchQuery);
final URIBuilder uriBuilder;
try {
uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
} catch (URISyntaxException e) {
throw new FetcherException("Error while fetching from " + getName(), e);
}
uriBuilder.addParameter("hl", "en");
uriBuilder.addParameter("btnG", "Search");
uriBuilder.addParameter("q", complexQueryString);
uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize()));
uriBuilder.addParameter("num", String.valueOf(getPageSize()));
complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString()));
complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString()));
// A single year restricts both the lower and the upper bound to that year
complexSearchQuery.getSingleYear().ifPresent(year -> {
uriBuilder.addParameter("as_ylo", year.toString());
uriBuilder.addParameter("as_yhi", year.toString());
});

String queryURL = uriBuilder.toString();
LOGGER.debug("Using URL {}", queryURL);
try {
addHitsFromQuery(foundEntries, queryURL);
} catch (IOException e) {
LOGGER.info("IOException for URL {}", queryURL);
// If there are too many requests from the same IP address, Google answers with a 403, 429, or 503 and redirects to a captcha challenge
// Example URL: https://www.google.com/sorry/index?continue=https://scholar.google.ch/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3D%2522in%2522%2B%2522and%2522%2B%2522Process%2522%2B%2522Models%2522%2B%2522Issues%2522%2B%2522Interoperability%2522%2B%2522Detecting%2522%2B%2522Correctness%2522%2B%2522BPMN%2522%2B%25222.0%2522%2Ballintitle%253A%26start%3D0%26num%3D20&hl=en&q=EgTZGO7HGOuK2P4FIhkA8aeDSwDHMafs3bst5vlLM-Sk4TtpMrOtMgFy
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") ||
e.getMessage().contains("Server returned HTTP response code: 429 for URL") ||
e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
LOGGER.debug("Captcha found. Calling the CaptchaSolver");
String content = captchaSolver.solve(queryURL);
extractEntriesFromContent(content, foundEntries);

// NOTE(review): this throw is unconditional, so entries extracted from the solver's content never reach the caller
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName(), e);
}
}
return new Page<>(complexQueryString, pageNumber, foundEntries);
}

public void displayCaptchaDialog(String link) {
Platform.runLater(() -> new CaptchaDialog(link).showAndWait());
/*
Expand All @@ -286,6 +280,10 @@ public void displayCaptchaDialog(String link) {
*/
}

/**
 * Checks whether the given page body contains the captcha marker attribute.
 *
 * @param body the page content to inspect
 * @return true if the marker {@code id="gs_captcha_ccl"} occurs in the body
 */
private boolean needsCaptcha(String body) {
    final String captchaMarker = "id=\"gs_captcha_ccl\"";
    return body.indexOf(captchaMarker) >= 0;
}

private static final class CaptchaDialog extends BaseDialog<Void> {
public CaptchaDialog(String content) {
super();
Expand All @@ -303,5 +301,4 @@ public boolean retry() {
return false;
}
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package org.jabref.logic.importer.fetcher;

/**
 * {@link CaptchaSolver} that does not attempt to solve anything: every query
 * yields the empty string.
 */
public class NoneCaptchaSolver implements CaptchaSolver {

    /** Result handed back for every query: the empty string. */
    private static final String NO_CONTENT = "";

    /**
     * Ignores the given URL.
     *
     * @param queryURL the URL to query (unused)
     * @return always the empty string
     */
    @Override
    public String solve(String queryURL) {
        return NO_CONTENT;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ static Stream<Arguments> performSearchParameters() {
list.add(new AstrophysicsDataSystem(importFormatPreferences));
list.add(new MathSciNet(importFormatPreferences));
list.add(new ZbMATH(importFormatPreferences));
list.add(new GoogleScholar(importFormatPreferences));
list.add(new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver()));
list.add(new DBLPFetcher(importFormatPreferences));
list.add(new SpringerFetcher());
list.add(new CrossRef());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn(
mock(FieldContentFormatterPreferences.class));
fetcher = new GoogleScholar(importFormatPreferences);
fetcher = new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver());
}

@Test
Expand Down