From 4c557e3c00690c2ec704d5f789a2290c1bb3b154 Mon Sep 17 00:00:00 2001 From: Siedlerchr Date: Wed, 15 Dec 2021 21:22:31 +0100 Subject: [PATCH] Fix ACM fetcher Fixes #8259 --- .../importer/fileformat/ACMPortalParser.java | 41 ++++++++----------- .../fetcher/ACMPortalFetcherTest.java | 2 +- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/ACMPortalParser.java b/src/main/java/org/jabref/logic/importer/fileformat/ACMPortalParser.java index 86da688f60c..0e46e834134 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/ACMPortalParser.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/ACMPortalParser.java @@ -1,22 +1,17 @@ package org.jabref.logic.importer.fileformat; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.net.CookieHandler; import java.net.CookieManager; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.StringJoiner; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.jabref.logic.importer.FetcherException; @@ -35,13 +30,15 @@ import com.google.gson.JsonObject; import com.google.gson.JsonParser; import org.apache.http.client.utils.URIBuilder; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class ACMPortalParser implements Parser { + private static final String HOST = "https://dl.acm.org"; private static final String DOI_URL = "https://dl.acm.org/action/exportCiteProcCitation"; - private static final Pattern DOI_HTML_PATTERN = Pattern.compile(""; - private static final int MAX_ITEM_CNT_PER_PAGE = 20; /** * Parse the DOI of the ACM Portal search result page and obtain the corresponding BibEntry @@ -68,24 +65,20 @@ public List parseEntries(InputStream stream) throws ParseException { */ public List parseDoiSearchPage(InputStream stream) throws ParseException { List doiList = new ArrayList<>(); - String htmlLine; - try (BufferedReader in = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) { - int cnt = 0; - while ((htmlLine = in.readLine()) != null) { - if (ITEM_HTML.equals(htmlLine)) { - Matcher matcher = DOI_HTML_PATTERN.matcher(in.readLine()); - if (matcher.find()) { - doiList.add(matcher.group(1)); - cnt++; - if (cnt >= MAX_ITEM_CNT_PER_PAGE) { - break; - } - } - } + + try { + Document doc = Jsoup.parse(stream, null, HOST); + Elements doiHrefs = doc.select("div.issue-item__content-right > h5 > span > a"); + + for (Element elem : doiHrefs) { + String fullSegement = elem.attr("href"); + String doi = fullSegement.substring(fullSegement.indexOf("10")); + doiList.add(doi); } - } catch (IOException e) { - throw new ParseException(e); + } catch (IOException ex) { + throw new ParseException(ex); } + return doiList; } diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java index b758f7bcc06..63a3354654f 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ACMPortalFetcherTest.java @@ -4,6 +4,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.util.List; +import java.util.Optional; import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.fileformat.ACMPortalParser; @@ -12,7 +13,6 @@ import org.jabref.model.entry.types.StandardEntryType; import org.jabref.testutils.category.FetcherTest; -import com.google.common.base.Optional; import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException; import org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser; import org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser;