Percent-decode DOIs and resolve LaTeX markup in them during parsing and cleanup.

Review notes for this revision of the patch:
* Only functional hunks are kept. The hunks that merely re-wrapped Javadoc to one
  word per line, split "catch (" over several lines or changed indentation are dropped.
* DoiCleanup now calls the decodeDoi helper (it was added but never used) and no longer
  lets a malformed percent escape abort the cleanup with an IllegalArgumentException.
* DOI.parse falls back to the undecoded text on a malformed percent escape instead of
  reporting "no DOI", which is what happened before decoding was introduced.
* NOTE(review): DOI.parse decodes as well, so the decode in DoiCleanup looks redundant.
  Confirm against the part of cleanup() that is outside this patch before removing it.
* NOTE(review): context-line indentation, blank context lines and generic type arguments
  were reconstructed, and the "index" lines are omitted because the blob hashes no longer
  match. Re-generate the patch with "git diff" before merging.

diff --git a/src/main/java/org/jabref/logic/cleanup/DoiCleanup.java b/src/main/java/org/jabref/logic/cleanup/DoiCleanup.java
--- a/src/main/java/org/jabref/logic/cleanup/DoiCleanup.java
+++ b/src/main/java/org/jabref/logic/cleanup/DoiCleanup.java
@@ -1,5 +1,7 @@
 package org.jabref.logic.cleanup;
 
+import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -31,6 +33,9 @@ public List<FieldChange> cleanup(BibEntry entry) {
         if (entry.hasField(StandardField.DOI)) {
             String doiFieldValue = entry.getField(StandardField.DOI).orElse(null);
 
+            // The field may hold a percent-encoded DOI (e.g. copied from a URL); decode it before parsing
+            doiFieldValue = decodeDoi(doiFieldValue);
+
             Optional<DOI> doi = DOI.parse(doiFieldValue);
 
             if (doi.isPresent()) {
@@ -68,4 +73,22 @@ private void removeFieldValue(BibEntry entry, Field field, List<FieldChange> cha
         CleanupJob eraser = new FieldFormatterCleanup(field, new ClearFormatter());
         changes.addAll(eraser.cleanup(entry));
     }
+
+    /**
+     * Percent-decodes a DOI field value using UTF-8.
+     *
+     * @param doiValue the raw field content, may be null
+     * @return the decoded value; the input itself if it is null or contains a malformed percent escape
+     */
+    private static String decodeDoi(String doiValue) {
+        if (doiValue == null) {
+            return null;
+        }
+        try {
+            return URLDecoder.decode(doiValue, StandardCharsets.UTF_8);
+        } catch (IllegalArgumentException e) {
+            // A stray '%' that is not a valid escape must not abort the whole cleanup; keep the value as entered
+            return doiValue;
+        }
+    }
 }
diff --git a/src/main/java/org/jabref/model/entry/identifier/DOI.java b/src/main/java/org/jabref/model/entry/identifier/DOI.java
--- a/src/main/java/org/jabref/model/entry/identifier/DOI.java
+++ b/src/main/java/org/jabref/model/entry/identifier/DOI.java
@@ -2,12 +2,15 @@
 
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.jabref.logic.layout.format.LatexToUnicodeFormatter;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.StandardField;
 
@@ -159,7 +162,16 @@ public DOI(String doi) {
      */
     public static Optional<DOI> parse(String doi) {
         try {
+            LatexToUnicodeFormatter formatter = new LatexToUnicodeFormatter();
             String cleanedDOI = doi;
+            try {
+                // DOIs copied from URLs are often percent-encoded (e.g. %7B for '{')
+                cleanedDOI = URLDecoder.decode(cleanedDOI, StandardCharsets.UTF_8);
+            } catch (IllegalArgumentException ignored) {
+                // A stray '%' is not an escape sequence: continue with the text as given
+            }
+            // Resolve LaTeX commands such as {\textunderscore} to the characters they stand for
+            cleanedDOI = formatter.format(cleanedDOI);
             cleanedDOI = cleanedDOI.replaceAll(CHARS_TO_REMOVE, "");
             return Optional.of(new DOI(cleanedDOI));
         } catch (IllegalArgumentException | NullPointerException e) {
diff --git a/src/test/java/org/jabref/logic/cleanup/DoiDecodeCleanupTest.java b/src/test/java/org/jabref/logic/cleanup/DoiDecodeCleanupTest.java
new file mode 100644
--- /dev/null
+++ b/src/test/java/org/jabref/logic/cleanup/DoiDecodeCleanupTest.java
@@ -0,0 +1,79 @@
+package org.jabref.logic.cleanup;
+
+import java.util.stream.Stream;
+
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.field.UnknownField;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class DoiDecodeCleanupTest {
+
+    private static final String ENCODED_DOI = "10.18726/2018%7B%5Ctextunderscore%7D3";
+    private static final String ENCODED_DOI_URL = "https://doi.org/" + ENCODED_DOI;
+
+    @ParameterizedTest
+    @MethodSource("provideEncodedDoiEntries")
+    public void testChangeDoi(BibEntry expected, BibEntry doiInfoField) {
+        DoiCleanup cleanUp = new DoiCleanup();
+        cleanUp.cleanup(doiInfoField);
+
+        assertEquals(expected, doiInfoField);
+    }
+
+    @Test
+    public void malformedPercentEscapeDoesNotAbortCleanup() {
+        // "%zz" is not a valid percent escape and makes URLDecoder throw an IllegalArgumentException
+        BibEntry entry = new BibEntry().withField(StandardField.DOI, "10.18726/2018%zz3");
+
+        assertDoesNotThrow(() -> new DoiCleanup().cleanup(entry));
+    }
+
+    private static Stream<Arguments> provideEncodedDoiEntries() {
+        UnknownField unknownField = new UnknownField("ee");
+        BibEntry doiResult = new BibEntry().withField(StandardField.DOI, "10.18726/2018_3");
+
+        return Stream.of(
+                // encoded DOI URL in the URL field only
+                Arguments.of(doiResult, new BibEntry()
+                        .withField(StandardField.URL, ENCODED_DOI_URL)),
+
+                // encoded DOI in the DOI field plus encoded DOI URLs in the URL, note and ee fields
+                Arguments.of(doiResult, new BibEntry()
+                        .withField(StandardField.DOI, ENCODED_DOI)
+                        .withField(StandardField.URL, ENCODED_DOI_URL)
+                        .withField(StandardField.NOTE, ENCODED_DOI_URL)
+                        .withField(unknownField, ENCODED_DOI_URL)),
+
+                // already clean DOI with unrelated note and ee content: nothing must change
+                Arguments.of(
+                        new BibEntry()
+                                .withField(StandardField.DOI, "10.18726/2018_3")
+                                .withField(StandardField.NOTE, "This is a random note to this Doi")
+                                .withField(unknownField, "This is a random ee field for this Doi"),
+                        new BibEntry()
+                                .withField(StandardField.DOI, "10.18726/2018_3")
+                                .withField(StandardField.NOTE, "This is a random note to this Doi")
+                                .withField(unknownField, "This is a random ee field for this Doi")),
+
+                // encoded DOI in the DOI field only
+                Arguments.of(doiResult, new BibEntry()
+                        .withField(StandardField.DOI, ENCODED_DOI)),
+
+                // encoded DOI URL in the note field only
+                Arguments.of(doiResult, new BibEntry()
+                        .withField(StandardField.NOTE, ENCODED_DOI_URL)),
+
+                // encoded DOI URL in the ee field only
+                Arguments.of(doiResult, new BibEntry()
+                        .withField(unknownField, ENCODED_DOI_URL))
+        );
+    }
+}