Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check similarity of entry when using DOI retrieval with ArXiV #2575

Merged
merged 6 commits into from
Feb 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,11 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The field `issue` is now always exported to the corresponding `issue` field in MS-Office XML.
- We fixed an issue with repeated escaping of the %-sign when running the LaTeXCleanup more than once. [#2451](https://github.com/JabRef/jabref/issues/2451)
- We fixed the import of MS-Office XML files, when the `month` field contained an invalid value.



- The arXiv fetcher now checks the title similarity of the entry when using DOI retrieval to avoid false positives. [#2575](https://github.com/JabRef/jabref/issues/2575)
- The ScienceDirect/Elsevier fetcher is now able to scrape the new HTML structure. [#2576](https://github.com/JabRef/jabref/issues/2576)








### Removed


Expand Down
25 changes: 19 additions & 6 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.jabref.logic.importer.util.OAI2Handler;
import org.jabref.logic.util.DOI;
import org.jabref.logic.util.io.XMLUtil;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.ArXivIdentifier;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibtexEntryTypes;
Expand Down Expand Up @@ -63,13 +64,14 @@ public ArXiv(ImportFormatPreferences importFormatPreferences) {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfUrl = Optional.empty();

// 1. Eprint
Optional<String> identifier = entry.getField(FieldName.EPRINT);
if (StringUtil.isNotBlank(identifier)) {
try {
// Get pdf of entry with the specified id
Optional<URL> pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl);
pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl);
if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
return pdfUrl;
Expand All @@ -85,17 +87,28 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
String doiString = doi.get().getDOI();
// Search for an entry in the ArXiv which is linked to the doi
try {
Optional<URL> pdfUrl = searchForEntry("doi:" + doiString).flatMap(ArXivEntry::getPdfUrl);
if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
return pdfUrl;
Optional<ArXivEntry> arxivEntry = searchForEntry("doi:" + doiString);

if (arxivEntry.isPresent()) {
// Check if entry is a match
StringSimilarity match = new StringSimilarity();
String arxivTitle = arxivEntry.get().title.orElse("");
String entryTitle = entry.getField(FieldName.TITLE).orElse("");

if (match.isSimilar(arxivTitle, entryTitle)) {
pdfUrl = arxivEntry.get().getPdfUrl();
if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
return pdfUrl;
}
}
}
} catch (FetcherException e) {
LOGGER.warn("arXiv DOI API request failed", e);
}
}

return Optional.empty();
return pdfUrl;
}

private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherException {
Expand Down
17 changes: 4 additions & 13 deletions src/main/java/org/jabref/logic/importer/fetcher/CrossRef.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
package org.jabref.logic.importer.fetcher;

import java.util.Locale;
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
import org.jabref.logic.util.DOI;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;

import com.mashape.unirest.http.HttpResponse;
import com.mashape.unirest.http.JsonNode;
import com.mashape.unirest.http.Unirest;
import com.mashape.unirest.http.exceptions.UnirestException;
import info.debatty.java.stringsimilarity.Levenshtein;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
Expand All @@ -32,10 +31,6 @@ public class CrossRef {
// number of results to lookup from crossref API
private static final int API_RESULTS = 5;

private static final Levenshtein METRIC_DISTANCE = new Levenshtein();
// edit distance threshold for entry title comparison
private static final int METRIC_THRESHOLD = 4;

private static final RemoveBracesFormatter REMOVE_BRACES_FORMATTER = new RemoveBracesFormatter();

public static Optional<DOI> findDOI(BibEntry entry) {
Expand Down Expand Up @@ -92,6 +87,7 @@ private static String enhanceQuery(String query, BibEntry entry) {

private static Optional<String> findMatchingEntry(BibEntry entry, JSONArray results) {
final String entryTitle = REMOVE_BRACES_FORMATTER.format(entry.getLatexFreeField(FieldName.TITLE).orElse(""));
final StringSimilarity stringSimilarity = new StringSimilarity();

for (int i = 0; i < results.length(); i++) {
// currently only title-based
Expand All @@ -102,7 +98,7 @@ private static Optional<String> findMatchingEntry(BibEntry entry, JSONArray resu
JSONObject data = results.getJSONObject(i);
String dataTitle = data.getJSONArray("title").getString(0);

if (editDistanceIgnoreCase(entryTitle, dataTitle) <= METRIC_THRESHOLD) {
if (stringSimilarity.isSimilar(entryTitle, dataTitle)) {
return Optional.of(data.getString("DOI"));
}

Expand All @@ -111,7 +107,7 @@ private static Optional<String> findMatchingEntry(BibEntry entry, JSONArray resu
if (data.getJSONArray("subtitle").length() > 0) {
String dataWithSubTitle = dataTitle + " " + data.getJSONArray("subtitle").getString(0);

if (editDistanceIgnoreCase(entryTitle, dataWithSubTitle) <= METRIC_THRESHOLD) {
if (stringSimilarity.isSimilar(entryTitle, dataWithSubTitle)) {
return Optional.of(data.getString("DOI"));
}
}
Expand All @@ -123,9 +119,4 @@ private static Optional<String> findMatchingEntry(BibEntry entry, JSONArray resu

return Optional.empty();
}

private static double editDistanceIgnoreCase(String a, String b) {
// TODO: locale is dependent on the language of the strings?!
return METRIC_DISTANCE.distance(a.toLowerCase(Locale.ENGLISH), b.toLowerCase(Locale.ENGLISH));
}
}
27 changes: 27 additions & 0 deletions src/main/java/org/jabref/logic/util/strings/StringSimilarity.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.jabref.logic.util.strings;

import java.util.Locale;

import info.debatty.java.stringsimilarity.Levenshtein;

/**
 * Decides whether two strings are similar enough to be treated as the same text,
 * based on their case-insensitive Levenshtein edit distance.
 */
public class StringSimilarity {
    // Constants are static so they are created once instead of once per StringSimilarity instance.
    private static final Levenshtein METRIC_DISTANCE = new Levenshtein();
    // edit distance threshold for entry title comparison
    private static final int METRIC_THRESHOLD = 4;

    /**
     * String similarity based on Levenshtein, ignoreCase, and fixed metric threshold of 4.
     *
     * @param a String to compare, must not be null
     * @param b String to compare, must not be null
     * @return true if the edit distance between the lower-cased strings is at most the threshold
     * @throws NullPointerException if {@code a} or {@code b} is null
     */
    public boolean isSimilar(String a, String b) {
        return editDistanceIgnoreCase(a, b) <= METRIC_THRESHOLD;
    }

    /**
     * Computes the Levenshtein distance between both strings after lower-casing them.
     */
    private static double editDistanceIgnoreCase(String a, String b) {
        // TODO: locale is dependent on the language of the strings?!
        return METRIC_DISTANCE.distance(a.toLowerCase(Locale.ENGLISH), b.toLowerCase(Locale.ENGLISH));
    }
}
15 changes: 13 additions & 2 deletions src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.testutils.category.FetcherTests;

import org.junit.Assert;
Expand Down Expand Up @@ -51,7 +52,7 @@ public void setUp() {
}

@Test
public void doiNotPresent() throws IOException {
public void noIdentifierPresent() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

Expand All @@ -63,7 +64,8 @@ public void rejectNullParameter() throws IOException {

@Test
public void findByDOI() throws IOException {
entry.setField("doi", "10.1529/biophysj.104.047340");
entry.setField(FieldName.DOI, "10.1529/biophysj.104.047340");
entry.setField(FieldName.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}
Expand Down Expand Up @@ -103,6 +105,15 @@ public void notFoundByUnknownId() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

// Sets a DOI and a title on the entry and expects findFullText to return an empty Optional.
// NOTE(review): per the test name, this DOI is presumably not linked to any arXiv record — confirm.
@Test
public void findByDOINotAvailableInCatalog() throws IOException {
entry.setField(FieldName.DOI, "10.1016/0370-2693(77)90015-6");
entry.setField(FieldName.TITLE, "Superspace formulation of supergravity");


// No full-text URL must be reported for this entry.
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void searchEntryByPartOfTitle() throws Exception {
assertEquals(Collections.singletonList(sliceTheoremPaper),
Expand Down