From 9e64e0ce12788f7ea7385acce799568bbef0853b Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 7 Sep 2024 18:52:52 +0200 Subject: [PATCH 1/6] Try with HTMLUnit --- build.gradle | 3 ++ src/main/java/module-info.java | 5 ++- .../jabref/logic/importer/fetcher/ACS.java | 40 +++++++++++-------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/build.gradle b/build.gradle index fd199a8b667..87bed09a28c 100644 --- a/build.gradle +++ b/build.gradle @@ -255,10 +255,13 @@ dependencies { implementation 'org.controlsfx:controlsfx:11.2.1' + // region HTTP clients + implementation 'org.htmlunit:htmlunit:4.4.0' // used for web scraping implementation 'org.jsoup:jsoup:1.18.1' implementation 'com.konghq:unirest-java-core:4.4.4' implementation 'com.konghq:unirest-modules-gson:4.4.4' implementation 'org.apache.httpcomponents.client5:httpclient5:5.3.1' + // endregion implementation 'org.slf4j:slf4j-api:2.0.16' implementation 'org.tinylog:tinylog-api:2.7.0' diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index 040717bf907..dc18d9dd3f2 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -90,10 +90,11 @@ requires org.glassfish.hk2.api; // region: http clients - requires unirest.java.core; - requires unirest.modules.gson; + requires htmlunit; requires org.apache.httpcomponents.core5.httpcore5; requires org.jsoup; + requires unirest.java.core; + requires unirest.modules.gson; // endregion // region: SQL databases diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 3c81d89db2a..581810d0479 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -10,6 +10,9 @@ import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; +import org.htmlunit.BrowserVersion; +import org.htmlunit.WebClient; +import 
org.htmlunit.html.HtmlPage; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -25,34 +28,37 @@ public class ACS implements FulltextFetcher { private static final String SOURCE = "https://pubs.acs.org/doi/abs/%s"; /** - * Tries to find a fulltext URL for a given BibTex entry. - *

- * Currently only uses the DOI if found. - * - * @param entry The Bibtex entry - * @return The fulltext PDF URL Optional, if found, or an empty Optional if not found. - * @throws NullPointerException if no BibTex entry is given - * @throws java.io.IOException + * Tries to find a fulltext URL for a given BibTeX entry. + * Requires the entry to have a DOI field. + * In case no DOI is present, an empty Optional is returned. */ @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - - // DOI search Optional doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); - if (!doi.isPresent()) { return Optional.empty(); } String source = SOURCE.formatted(doi.get().getDOI()); - // Retrieve PDF link - Document html = Jsoup.connect(source).ignoreHttpErrors(true).get(); - Element link = html.select("a.button_primary").first(); - if (link != null) { - LOGGER.info("Fulltext PDF found @ ACS."); - return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); + try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) { + webClient.getOptions().setSSLClientProtocols("TLSv1.3", "TLSv1.2"); + // inspired by https://www.innoq.com/en/blog/2016/01/webscraping/ + webClient.getCookieManager().setCookiesEnabled(true); + webClient.getOptions().setJavaScriptEnabled(true); + webClient.getOptions().setTimeout(10_000); + webClient.waitForBackgroundJavaScript(5000); + webClient.getOptions().setThrowExceptionOnScriptError(false); + webClient.getOptions().setPrintContentOnFailingStatusCode(true); + + HtmlPage page = webClient.getPage(source); + boolean pdfButtonExists = page.querySelectorAll("a[title=\"PDF\"].article__btn__secondary").isEmpty(); + if (pdfButtonExists) { + LOGGER.info("Fulltext PDF found at ACS."); + // We "guess" the URL instead of parsing the HTML for the actual link + return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); + } } return Optional.empty(); } From 
b256ba712a1482692a8d06731e5e03a242e942d5 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 7 Sep 2024 19:54:18 +0200 Subject: [PATCH 2/6] Try JCEF --- .gitignore | 3 + build.gradle | 2 +- src/main/java/module-info.java | 3 +- .../jabref/logic/importer/fetcher/ACS.java | 75 +++++++++++++------ 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 2e33739ec64..e3abb530fc8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,11 @@ src/main/gen/ src/main/generated/ src-gen/ + .lycheecache +jcef-bundle/ + javafx/javafx-sdk-* javafx/javafx-jmods-* javafx/javafx.html diff --git a/build.gradle b/build.gradle index 87bed09a28c..912ec1edcb3 100644 --- a/build.gradle +++ b/build.gradle @@ -256,7 +256,7 @@ dependencies { implementation 'org.controlsfx:controlsfx:11.2.1' // region HTTP clients - implementation 'org.htmlunit:htmlunit:4.4.0' // used for web scraping + implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping implementation 'org.jsoup:jsoup:1.18.1' implementation 'com.konghq:unirest-java-core:4.4.4' implementation 'com.konghq:unirest-modules-gson:4.4.4' diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index dc18d9dd3f2..34224f33dc2 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -90,7 +90,7 @@ requires org.glassfish.hk2.api; // region: http clients - requires htmlunit; + requires jcefmaven; requires org.apache.httpcomponents.core5.httpcore5; requires org.jsoup; requires unirest.java.core; @@ -184,5 +184,6 @@ requires mslinks; requires org.antlr.antlr4.runtime; requires org.libreoffice.uno; + requires jcef; // endregion } diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 581810d0479..1f6d8d5be33 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -10,17 +10,21 @@ import 
org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; -import org.htmlunit.BrowserVersion; -import org.htmlunit.WebClient; -import org.htmlunit.html.HtmlPage; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; +import me.friwi.jcefmaven.CefAppBuilder; +import me.friwi.jcefmaven.MavenCefAppHandlerAdapter; +import org.cef.CefApp; +import org.cef.CefClient; +import org.cef.CefSettings; +import org.cef.browser.CefBrowser; +import org.cef.browser.CefFrame; +import org.cef.callback.CefStringVisitor; +import org.cef.handler.CefDisplayHandlerAdapter; +import org.cef.handler.CefLoadHandlerAdapter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * FulltextFetcher implementation that attempts to find a PDF URL at ACS. + * FulltextFetcher implementation that attempts to find a PDF URL at ACS. */ public class ACS implements FulltextFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(ACS.class); @@ -42,24 +46,51 @@ public Optional findFullText(BibEntry entry) throws IOException { String source = SOURCE.formatted(doi.get().getDOI()); - try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) { - webClient.getOptions().setSSLClientProtocols("TLSv1.3", "TLSv1.2"); - // inspired by https://www.innoq.com/en/blog/2016/01/webscraping/ - webClient.getCookieManager().setCookiesEnabled(true); - webClient.getOptions().setJavaScriptEnabled(true); - webClient.getOptions().setTimeout(10_000); - webClient.waitForBackgroundJavaScript(5000); - webClient.getOptions().setThrowExceptionOnScriptError(false); - webClient.getOptions().setPrintContentOnFailingStatusCode(true); + CefAppBuilder builder = new CefAppBuilder(); + builder.setAppHandler(new MavenCefAppHandlerAdapter(){}); + CefApp cefApp; + try { + cefApp = builder.build(); + } catch (Exception e) { + LOGGER.error("Could not initialize CEF", e); + throw new IOException(e); + } + + CefClient client = 
cefApp.createClient(); + CefBrowser browser = client.createBrowser(source, false, false); + + client.addLoadHandler(new CefLoadHandlerAdapter() { + @Override + public void onLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode) { + System.out.println("lalala"); + if (frame.isMain()) { + frame.executeJavaScript( + "document.documentElement.outerHTML;", + frame.getURL(), + 0 + ); + } + } + }); - HtmlPage page = webClient.getPage(source); - boolean pdfButtonExists = page.querySelectorAll("a[title=\"PDF\"].article__btn__secondary").isEmpty(); - if (pdfButtonExists) { - LOGGER.info("Fulltext PDF found at ACS."); - // We "guess" the URL instead of parsing the HTML for the actual link - return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); + client.addDisplayHandler(new CefDisplayHandlerAdapter() { + @Override + public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity level, String message, String source, int line) { + // Capture the result of the JavaScript execution in the console message + System.out.println("Page HTML content:\n" + message); + return true; } + }); + + browser.loadURL(source); + + try { + Thread.sleep(5000); + } catch ( + InterruptedException e) { + throw new RuntimeException(e); } + return Optional.empty(); } From 7b12c8ab32cb2e4f8e8b416949d81f579230842b Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 7 Sep 2024 20:26:08 +0200 Subject: [PATCH 3/6] wip --- build.gradle | 2 +- buildres/abbrv.jabref.org | 2 +- .../jabref/logic/importer/fetcher/ACS.java | 58 ++++++++++++++++--- src/main/resources/csl-locales | 2 +- src/main/resources/csl-styles | 2 +- 5 files changed, 55 insertions(+), 11 deletions(-) diff --git a/build.gradle b/build.gradle index 912ec1edcb3..eafd40f02db 100644 --- a/build.gradle +++ b/build.gradle @@ -256,7 +256,7 @@ dependencies { implementation 'org.controlsfx:controlsfx:11.2.1' // region HTTP clients - implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping + 
implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping; https://github.com/jcefmaven/jcefmaven implementation 'org.jsoup:jsoup:1.18.1' implementation 'com.konghq:unirest-java-core:4.4.4' implementation 'com.konghq:unirest-modules-gson:4.4.4' diff --git a/buildres/abbrv.jabref.org b/buildres/abbrv.jabref.org index b69f1d607a5..8fbad5a1285 160000 --- a/buildres/abbrv.jabref.org +++ b/buildres/abbrv.jabref.org @@ -1 +1 @@ -Subproject commit b69f1d607a57488276f3402bbf610d9129e7f6fb +Subproject commit 8fbad5a1285926b177803087b35b0eb6b0fd0142 diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 1f6d8d5be33..ddd36ecf914 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -4,6 +4,8 @@ import java.net.URL; import java.util.Objects; import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.model.entry.BibEntry; @@ -11,15 +13,15 @@ import org.jabref.model.entry.identifier.DOI; import me.friwi.jcefmaven.CefAppBuilder; -import me.friwi.jcefmaven.MavenCefAppHandlerAdapter; import org.cef.CefApp; import org.cef.CefClient; import org.cef.CefSettings; import org.cef.browser.CefBrowser; import org.cef.browser.CefFrame; -import org.cef.callback.CefStringVisitor; +import org.cef.handler.CefAppHandlerAdapter; import org.cef.handler.CefDisplayHandlerAdapter; import org.cef.handler.CefLoadHandlerAdapter; +import org.cef.network.CefRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,14 +42,30 @@ public class ACS implements FulltextFetcher { public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); - if (!doi.isPresent()) { + if (doi.isEmpty()) { return 
Optional.empty(); } + System.setProperty("jcef.logSeverity", "VERBOSE"); + System.setProperty("jcef.logFile", "jcef.log"); + String source = SOURCE.formatted(doi.get().getDOI()); CefAppBuilder builder = new CefAppBuilder(); - builder.setAppHandler(new MavenCefAppHandlerAdapter(){}); + CefApp.addAppHandler(new CefAppHandlerAdapter(null) { + @Override + public void stateHasChanged(org.cef.CefApp.CefAppState state) { + // Shutdown the app if the native CEF part is terminated + if (state == CefApp.CefAppState.TERMINATED) { + // calling System.exit(0) appears to be causing assert errors, + // as its firing before all of the CEF objects shutdown. + //System.exit(0); + } + } + }); + + builder.getCefSettings().windowless_rendering_enabled = false; + CefApp cefApp; try { cefApp = builder.build(); @@ -57,9 +75,28 @@ public Optional findFullText(BibEntry entry) throws IOException { } CefClient client = cefApp.createClient(); - CefBrowser browser = client.createBrowser(source, false, false); + CefBrowser browser = client.createBrowser("about:blank", false, false); + CompletableFuture result = new CompletableFuture<>(); client.addLoadHandler(new CefLoadHandlerAdapter() { + @Override + public void onLoadingStateChange(CefBrowser browser, boolean isLoading, boolean canGoBack, boolean canGoForward) { + super.onLoadingStateChange(browser, isLoading, canGoBack, canGoForward); + System.out.println("Loading state changed is loading " + isLoading); + } + + @Override + public void onLoadStart(CefBrowser browser, CefFrame frame, CefRequest.TransitionType transitionType) { + super.onLoadStart(browser, frame, transitionType); + System.out.println("Load start"); + } + + @Override + public void onLoadError(CefBrowser browser, CefFrame frame, ErrorCode errorCode, String errorText, String failedUrl) { + super.onLoadError(browser, frame, errorCode, errorText, failedUrl); + System.out.println("Load error"); + } + @Override public void onLoadEnd(CefBrowser browser, CefFrame frame, int 
httpStatusCode) { System.out.println("lalala"); @@ -70,6 +107,7 @@ public void onLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode) { 0 ); } + result.complete(null); } }); @@ -82,13 +120,19 @@ public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity leve } }); - browser.loadURL(source); + // browser.loadURL(source); + browser.loadURL("https://www.jabref.org"); + + cefApp.doMessageLoopWork(1000); try { - Thread.sleep(5000); + result.get(); } catch ( InterruptedException e) { throw new RuntimeException(e); + } catch ( + ExecutionException e) { + throw new RuntimeException(e); } return Optional.empty(); diff --git a/src/main/resources/csl-locales b/src/main/resources/csl-locales index 7eeb36257a6..7b5a477f2d9 160000 --- a/src/main/resources/csl-locales +++ b/src/main/resources/csl-locales @@ -1 +1 @@ -Subproject commit 7eeb36257a68cb1907bd04f0eaa08d9ed238cbe6 +Subproject commit 7b5a477f2d9a8882b52bcecdc50f08d4422cc822 diff --git a/src/main/resources/csl-styles b/src/main/resources/csl-styles index 2b15b1fbc19..713bf5738ac 160000 --- a/src/main/resources/csl-styles +++ b/src/main/resources/csl-styles @@ -1 +1 @@ -Subproject commit 2b15b1fbc190e003b555486f46ce1112fd95defc +Subproject commit 713bf5738ac0b13c502e364cded9445c48d18193 From 6bc1db1b96b2eec1958eb995bb38b7bb984f10ad Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 8 Sep 2024 10:03:00 +0200 Subject: [PATCH 4/6] Add some "--exports" --- build.gradle | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build.gradle b/build.gradle index eafd40f02db..56664e653ce 100644 --- a/build.gradle +++ b/build.gradle @@ -77,6 +77,11 @@ application { '--add-exports=javafx.base/com.sun.javafx.event=org.jabref.merged.module', '--add-exports=javafx.controls/com.sun.javafx.scene.control=org.jabref.merged.module', + // Required by https://github.com/jcefmaven/jcefmaven?tab=readme-ov-file#limitations + '--add-exports=java.base/java.lang=org.jabref.merged.module', + 
'--add-exports=java.desktop/sun.awt=org.jabref.merged.module', + '--add-exports=java.desktop/sun.java2d=org.jabref.merged.module', + // Fix for https://github.com/JabRef/jabref/issues/11198 '--add-opens=javafx.graphics/javafx.scene=org.jabref.merged.module', '--add-opens=javafx.controls/javafx.scene.control=org.jabref.merged.module', From 0ac060a839a209ca873286f4488584be5677b642 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 8 Sep 2024 10:03:19 +0200 Subject: [PATCH 5/6] Try some more - does not work --- .../jabref/logic/importer/fetcher/ACS.java | 88 ++++++++++++++----- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index ddd36ecf914..6100e258c04 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -1,11 +1,13 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.net.URL; import java.util.Objects; import java.util.Optional; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; + +import javax.swing.SwingUtilities; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.model.entry.BibEntry; @@ -13,12 +15,13 @@ import org.jabref.model.entry.identifier.DOI; import me.friwi.jcefmaven.CefAppBuilder; +import me.friwi.jcefmaven.MavenCefAppHandlerAdapter; import org.cef.CefApp; import org.cef.CefClient; import org.cef.CefSettings; import org.cef.browser.CefBrowser; import org.cef.browser.CefFrame; -import org.cef.handler.CefAppHandlerAdapter; +import org.cef.browser.CefMessageRouter; import org.cef.handler.CefDisplayHandlerAdapter; import org.cef.handler.CefLoadHandlerAdapter; import org.cef.network.CefRequest; @@ -51,10 +54,44 @@ public Optional findFullText(BibEntry entry) throws IOException { String source = 
SOURCE.formatted(doi.get().getDOI()); + CompletableFuture result = new CompletableFuture<>(); + + System.out.println(Thread.currentThread().getName()); + try { + SwingUtilities.invokeAndWait(() -> { + try { + startBrowser(result); + } catch ( + IOException e) { + throw new RuntimeException(e); + } + }); + } catch ( + InterruptedException e) { + throw new RuntimeException(e); + } catch ( + InvocationTargetException e) { + throw new RuntimeException(e); + } + + try { + Thread.sleep(10000); + } catch ( + InterruptedException e) { + throw new RuntimeException(e); + } + + return Optional.empty(); + } + + private static void startBrowser(CompletableFuture result) throws IOException { CefAppBuilder builder = new CefAppBuilder(); - CefApp.addAppHandler(new CefAppHandlerAdapter(null) { + + // Set an app handler. Do not use CefApp.addAppHandler(...), it will break your code on MacOSX! + builder.setAppHandler(new MavenCefAppHandlerAdapter() { @Override - public void stateHasChanged(org.cef.CefApp.CefAppState state) { + public void stateHasChanged(CefApp.CefAppState state) { + System.out.println(state); // Shutdown the app if the native CEF part is terminated if (state == CefApp.CefAppState.TERMINATED) { // calling System.exit(0) appears to be causing assert errors, @@ -64,7 +101,7 @@ public void stateHasChanged(org.cef.CefApp.CefAppState state) { } }); - builder.getCefSettings().windowless_rendering_enabled = false; + // builder.getCefSettings().windowless_rendering_enabled = true; CefApp cefApp; try { @@ -74,26 +111,39 @@ public void stateHasChanged(org.cef.CefApp.CefAppState state) { throw new IOException(e); } + /* + new Thread(() -> { + while (true) { + try { + cefApp.doMessageLoopWork(100); + Thread.sleep(10); // Sleep for 10ms between calls + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + }).start(); + */ + CefClient client = cefApp.createClient(); - CefBrowser browser = client.createBrowser("about:blank", false, false); + CefMessageRouter msgRouter 
= CefMessageRouter.create(); + client.addMessageRouter(msgRouter); + + CefBrowser browser = client.createBrowser("ftps://lalala.notfound", true, false); + // (3) Create a simple message router to receive messages from CEF. - CompletableFuture result = new CompletableFuture<>(); client.addLoadHandler(new CefLoadHandlerAdapter() { @Override public void onLoadingStateChange(CefBrowser browser, boolean isLoading, boolean canGoBack, boolean canGoForward) { - super.onLoadingStateChange(browser, isLoading, canGoBack, canGoForward); System.out.println("Loading state changed is loading " + isLoading); } @Override public void onLoadStart(CefBrowser browser, CefFrame frame, CefRequest.TransitionType transitionType) { - super.onLoadStart(browser, frame, transitionType); System.out.println("Load start"); } @Override public void onLoadError(CefBrowser browser, CefFrame frame, ErrorCode errorCode, String errorText, String failedUrl) { - super.onLoadError(browser, frame, errorCode, errorText, failedUrl); System.out.println("Load error"); } @@ -120,22 +170,12 @@ public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity leve } }); - // browser.loadURL(source); - browser.loadURL("https://www.jabref.org"); - - cefApp.doMessageLoopWork(1000); + // cefApp.doMessageLoopWork(); - try { - result.get(); - } catch ( - InterruptedException e) { - throw new RuntimeException(e); - } catch ( - ExecutionException e) { - throw new RuntimeException(e); - } + // browser.loadURL(source); + browser.loadURL("ftps://lalala.notfound"); - return Optional.empty(); + // cefApp.doMessageLoopWork(1000); } @Override From 7e84a6015944a1a171edc8b4d92fbf06c40b9416 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 8 Sep 2024 10:56:11 +0200 Subject: [PATCH 6/6] Try jbrowserdriver (WIP) --- build.gradle | 7 +- src/main/java/module-info.java | 6 +- .../jabref/logic/importer/fetcher/ACS.java | 147 ++---------------- 3 files changed, 18 insertions(+), 142 deletions(-) diff --git a/build.gradle 
b/build.gradle index 56664e653ce..a14365f4a56 100644 --- a/build.gradle +++ b/build.gradle @@ -77,11 +77,6 @@ application { '--add-exports=javafx.base/com.sun.javafx.event=org.jabref.merged.module', '--add-exports=javafx.controls/com.sun.javafx.scene.control=org.jabref.merged.module', - // Required by https://github.com/jcefmaven/jcefmaven?tab=readme-ov-file#limitations - '--add-exports=java.base/java.lang=org.jabref.merged.module', - '--add-exports=java.desktop/sun.awt=org.jabref.merged.module', - '--add-exports=java.desktop/sun.java2d=org.jabref.merged.module', - // Fix for https://github.com/JabRef/jabref/issues/11198 '--add-opens=javafx.graphics/javafx.scene=org.jabref.merged.module', '--add-opens=javafx.controls/javafx.scene.control=org.jabref.merged.module', @@ -261,7 +256,7 @@ dependencies { implementation 'org.controlsfx:controlsfx:11.2.1' // region HTTP clients - implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping; https://github.com/jcefmaven/jcefmaven + implementation 'com.machinepublishers:jbrowserdriver:1.1.1' // used for web scraping; https://github.com/MachinePublishers/jBrowserDriver implementation 'org.jsoup:jsoup:1.18.1' implementation 'com.konghq:unirest-java-core:4.4.4' implementation 'com.konghq:unirest-modules-gson:4.4.4' diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index 34224f33dc2..917c976d138 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -90,7 +90,10 @@ requires org.glassfish.hk2.api; // region: http clients - requires jcefmaven; + requires transitive jbrowserdriver; + requires org.openqa.selenium.core; + requires org.openqa.grid.selenium; + requires org.openqa.selenium.remote; requires org.apache.httpcomponents.core5.httpcore5; requires org.jsoup; requires unirest.java.core; @@ -184,6 +187,5 @@ requires mslinks; requires org.antlr.antlr4.runtime; requires org.libreoffice.uno; - requires jcef; // endregion } diff --git 
a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 6100e258c04..266bbda282f 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -1,35 +1,25 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; -import java.lang.reflect.InvocationTargetException; import java.net.URL; import java.util.Objects; import java.util.Optional; -import java.util.concurrent.CompletableFuture; - -import javax.swing.SwingUtilities; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; -import me.friwi.jcefmaven.CefAppBuilder; -import me.friwi.jcefmaven.MavenCefAppHandlerAdapter; -import org.cef.CefApp; -import org.cef.CefClient; -import org.cef.CefSettings; -import org.cef.browser.CefBrowser; -import org.cef.browser.CefFrame; -import org.cef.browser.CefMessageRouter; -import org.cef.handler.CefDisplayHandlerAdapter; -import org.cef.handler.CefLoadHandlerAdapter; -import org.cef.network.CefRequest; +import com.machinepublishers.jbrowserdriver.JBrowserDriver; +import com.machinepublishers.jbrowserdriver.Settings; +import com.machinepublishers.jbrowserdriver.Timezone; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * FulltextFetcher implementation that attempts to find a PDF URL at ACS. 
+ * + * Alternatives considered: https://stackoverflow.com/a/53099311/873282 */ public class ACS implements FulltextFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(ACS.class); @@ -54,130 +44,19 @@ public Optional findFullText(BibEntry entry) throws IOException { String source = SOURCE.formatted(doi.get().getDOI()); - CompletableFuture result = new CompletableFuture<>(); - - System.out.println(Thread.currentThread().getName()); - try { - SwingUtilities.invokeAndWait(() -> { - try { - startBrowser(result); - } catch ( - IOException e) { - throw new RuntimeException(e); - } - }); - } catch ( - InterruptedException e) { - throw new RuntimeException(e); - } catch ( - InvocationTargetException e) { - throw new RuntimeException(e); - } + // You can optionally pass a Settings object here, + // constructed using Settings.Builder + JBrowserDriver driver = new JBrowserDriver(Settings.builder(). + timezone(Timezone.AMERICA_NEWYORK).build()); - try { - Thread.sleep(10000); - } catch ( - InterruptedException e) { - throw new RuntimeException(e); - } + driver.get(source); + System.out.println(driver.getStatusCode()); + System.out.println(driver.getPageSource()); + driver.quit(); return Optional.empty(); } - private static void startBrowser(CompletableFuture result) throws IOException { - CefAppBuilder builder = new CefAppBuilder(); - - // Set an app handler. Do not use CefApp.addAppHandler(...), it will break your code on MacOSX! - builder.setAppHandler(new MavenCefAppHandlerAdapter() { - @Override - public void stateHasChanged(CefApp.CefAppState state) { - System.out.println(state); - // Shutdown the app if the native CEF part is terminated - if (state == CefApp.CefAppState.TERMINATED) { - // calling System.exit(0) appears to be causing assert errors, - // as its firing before all of the CEF objects shutdown. 
- //System.exit(0); - } - } - }); - - // builder.getCefSettings().windowless_rendering_enabled = true; - - CefApp cefApp; - try { - cefApp = builder.build(); - } catch (Exception e) { - LOGGER.error("Could not initialize CEF", e); - throw new IOException(e); - } - - /* - new Thread(() -> { - while (true) { - try { - cefApp.doMessageLoopWork(100); - Thread.sleep(10); // Sleep for 10ms between calls - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - }).start(); - */ - - CefClient client = cefApp.createClient(); - CefMessageRouter msgRouter = CefMessageRouter.create(); - client.addMessageRouter(msgRouter); - - CefBrowser browser = client.createBrowser("ftps://lalala.notfound", true, false); - // (3) Create a simple message router to receive messages from CEF. - - client.addLoadHandler(new CefLoadHandlerAdapter() { - @Override - public void onLoadingStateChange(CefBrowser browser, boolean isLoading, boolean canGoBack, boolean canGoForward) { - System.out.println("Loading state changed is loading " + isLoading); - } - - @Override - public void onLoadStart(CefBrowser browser, CefFrame frame, CefRequest.TransitionType transitionType) { - System.out.println("Load start"); - } - - @Override - public void onLoadError(CefBrowser browser, CefFrame frame, ErrorCode errorCode, String errorText, String failedUrl) { - System.out.println("Load error"); - } - - @Override - public void onLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode) { - System.out.println("lalala"); - if (frame.isMain()) { - frame.executeJavaScript( - "document.documentElement.outerHTML;", - frame.getURL(), - 0 - ); - } - result.complete(null); - } - }); - - client.addDisplayHandler(new CefDisplayHandlerAdapter() { - @Override - public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity level, String message, String source, int line) { - // Capture the result of the JavaScript execution in the console message - System.out.println("Page HTML content:\n" + message); 
- return true; - } - }); - - // cefApp.doMessageLoopWork(); - - // browser.loadURL(source); - browser.loadURL("ftps://lalala.notfound"); - - // cefApp.doMessageLoopWork(1000); - } - @Override public TrustLevel getTrustLevel() { return TrustLevel.PUBLISHER;