diff --git a/CHANGELOG.md b/CHANGELOG.md index 68228bd83b4..62cb1e5e539 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,18 +43,11 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `# - The field `issue` is now always exported to the corresponding `issue` field in MS-Office XML. - We fixed an issue with repeated escaping of the %-sign when running the LaTeXCleanup more than once. [#2451](https://github.com/JabRef/jabref/issues/2451) - We fixed the import of MS-Office XML files, when the `month` field contained an invalid value. - - - + - ArXiV fetcher now checks similarity of entry when using DOI retrieval to avoid false positives [#2575](https://github.com/JabRef/jabref/issues/2575) - Sciencedirect/Elsevier fetcher is now able to scrape new HTML structure [#2576](https://github.com/JabRef/jabref/issues/2576) - - - - - ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java index 2b37771aade..ef0037a131e 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java @@ -25,6 +25,7 @@ import org.jabref.logic.importer.util.OAI2Handler; import org.jabref.logic.util.DOI; import org.jabref.logic.util.io.XMLUtil; +import org.jabref.logic.util.strings.StringSimilarity; import org.jabref.model.entry.ArXivIdentifier; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibtexEntryTypes; @@ -63,13 +64,14 @@ public ArXiv(ImportFormatPreferences importFormatPreferences) { @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); + Optional pdfUrl = Optional.empty(); // 1. Eprint Optional identifier = entry.getField(FieldName.EPRINT); if (StringUtil.isNotBlank(identifier)) { try { // Get pdf of entry with the specified id - Optional pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl); + pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl); if (pdfUrl.isPresent()) { LOGGER.info("Fulltext PDF found @ arXiv."); return pdfUrl; @@ -85,17 +87,28 @@ public Optional findFullText(BibEntry entry) throws IOException { String doiString = doi.get().getDOI(); // Search for an entry in the ArXiv which is linked to the doi try { - Optional pdfUrl = searchForEntry("doi:" + doiString).flatMap(ArXivEntry::getPdfUrl); - if (pdfUrl.isPresent()) { - LOGGER.info("Fulltext PDF found @ arXiv."); - return pdfUrl; + Optional arxivEntry = searchForEntry("doi:" + doiString); + + if (arxivEntry.isPresent()) { + // Check if entry is a match + StringSimilarity match = new StringSimilarity(); + String arxivTitle = arxivEntry.get().title.orElse(""); + String entryTitle = entry.getField(FieldName.TITLE).orElse(""); + + if (match.isSimilar(arxivTitle, entryTitle)) { + pdfUrl = arxivEntry.get().getPdfUrl(); + if (pdfUrl.isPresent()) { + LOGGER.info("Fulltext PDF found @ arXiv."); + return pdfUrl; + } + } } } catch (FetcherException e) { LOGGER.warn("arXiv DOI API request failed", e); } } - return Optional.empty(); + return pdfUrl; } private Optional searchForEntry(String searchQuery) throws FetcherException { diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CrossRef.java b/src/main/java/org/jabref/logic/importer/fetcher/CrossRef.java index a73361056fc..7f2eb496947 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/CrossRef.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/CrossRef.java @@ -1,11 +1,11 @@ package org.jabref.logic.importer.fetcher; -import java.util.Locale; import java.util.Objects; import java.util.Optional; import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter; import org.jabref.logic.util.DOI; +import org.jabref.logic.util.strings.StringSimilarity; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldName; @@ -13,7 +13,6 @@ import com.mashape.unirest.http.JsonNode; import com.mashape.unirest.http.Unirest; import com.mashape.unirest.http.exceptions.UnirestException; -import info.debatty.java.stringsimilarity.Levenshtein; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.json.JSONArray; @@ -32,10 +31,6 @@ public class CrossRef { // number of results to lookup from crossref API private static final int API_RESULTS = 5; - private static final Levenshtein METRIC_DISTANCE = new Levenshtein(); - // edit distance threshold for entry title comnparison - private static final int METRIC_THRESHOLD = 4; - private static final RemoveBracesFormatter REMOVE_BRACES_FORMATTER = new RemoveBracesFormatter(); public static Optional findDOI(BibEntry entry) { @@ -92,6 +87,7 @@ private static String enhanceQuery(String query, BibEntry entry) { private static Optional findMatchingEntry(BibEntry entry, JSONArray results) { final String entryTitle = REMOVE_BRACES_FORMATTER.format(entry.getLatexFreeField(FieldName.TITLE).orElse("")); + final StringSimilarity stringSimilarity = new StringSimilarity(); for (int i = 0; i < results.length(); i++) { // currently only title-based @@ -102,7 +98,7 @@ private static Optional findMatchingEntry(BibEntry entry, JSONArray resu JSONObject data = results.getJSONObject(i); String dataTitle = data.getJSONArray("title").getString(0); - if (editDistanceIgnoreCase(entryTitle, dataTitle) <= METRIC_THRESHOLD) { + if (stringSimilarity.isSimilar(entryTitle, dataTitle)) { return Optional.of(data.getString("DOI")); } @@ -111,7 +107,7 @@ private static Optional findMatchingEntry(BibEntry entry, JSONArray resu if (data.getJSONArray("subtitle").length() > 0) { String dataWithSubTitle = dataTitle + " " + data.getJSONArray("subtitle").getString(0); - if (editDistanceIgnoreCase(entryTitle, dataWithSubTitle) <= METRIC_THRESHOLD) { + if (stringSimilarity.isSimilar(entryTitle, dataWithSubTitle)) { return Optional.of(data.getString("DOI")); } } @@ -123,9 +119,4 @@ private static Optional findMatchingEntry(BibEntry entry, JSONArray resu return Optional.empty(); } - - private static double editDistanceIgnoreCase(String a, String b) { - // TODO: locale is dependent on the language of the strings?! - return METRIC_DISTANCE.distance(a.toLowerCase(Locale.ENGLISH), b.toLowerCase(Locale.ENGLISH)); - } } diff --git a/src/main/java/org/jabref/logic/util/strings/StringSimilarity.java b/src/main/java/org/jabref/logic/util/strings/StringSimilarity.java new file mode 100644 index 00000000000..c17443f07fb --- /dev/null +++ b/src/main/java/org/jabref/logic/util/strings/StringSimilarity.java @@ -0,0 +1,27 @@ +package org.jabref.logic.util.strings; + +import java.util.Locale; + +import info.debatty.java.stringsimilarity.Levenshtein; + +public class StringSimilarity { + private final Levenshtein METRIC_DISTANCE = new Levenshtein(); + // edit distance threshold for entry title comnparison + private final int METRIC_THRESHOLD = 4; + + /** + * String similarity based on Levenshtein, ignoreCase, and fixed metric threshold of 4. + * + * @param a String to compare + * @param b String to compare + * @return true if Strings are considered as similar by the algorithm + */ + public boolean isSimilar(String a, String b) { + return editDistanceIgnoreCase(a, b) <= METRIC_THRESHOLD; + } + + private double editDistanceIgnoreCase(String a, String b) { + // TODO: locale is dependent on the language of the strings?! + return METRIC_DISTANCE.distance(a.toLowerCase(Locale.ENGLISH), b.toLowerCase(Locale.ENGLISH)); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java index e43528c4537..47af5898eea 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java @@ -9,6 +9,7 @@ import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BiblatexEntryTypes; +import org.jabref.model.entry.FieldName; import org.jabref.testutils.category.FetcherTests; import org.junit.Assert; @@ -51,7 +52,7 @@ public void setUp() { } @Test - public void doiNotPresent() throws IOException { + public void noIdentifierPresent() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } @@ -63,7 +64,8 @@ public void rejectNullParameter() throws IOException { @Test public void findByDOI() throws IOException { - entry.setField("doi", "10.1529/biophysj.104.047340"); + entry.setField(FieldName.DOI, "10.1529/biophysj.104.047340"); + entry.setField(FieldName.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry)); } @@ -103,6 +105,15 @@ public void notFoundByUnknownId() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } + @Test + public void findByDOINotAvailableInCatalog() throws IOException { + entry.setField(FieldName.DOI, "10.1016/0370-2693(77)90015-6"); + entry.setField(FieldName.TITLE, "Superspace formulation of supergravity"); + + + assertEquals(Optional.empty(), finder.findFullText(entry)); + } + @Test public void searchEntryByPartOfTitle() throws Exception { assertEquals(Collections.singletonList(sliceTheoremPaper),