JabRef · harsh1898 · Jan 23, 2024 · Jan 23, 2024 · Jan 23, 2024 · Jan 23, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
 
 - We added a fetcher for [ISIDORE](https://isidore.science/), simply paste in the link into the text field or the last 6 digits in the link that identify that paper. [#10423](https://github.com/JabRef/jabref/issues/10423)
 - When importing entries form the "Citation relations" tab, the field [cites](https://docs.jabref.org/advanced/entryeditor/entrylinks) is now filled according to the relationship between the entries. [#10572](https://github.com/JabRef/jabref/pull/10752)
+- We added a new integrity check and clean-up option for field values that are not in Unicode Normalization Form C (NFC). [#10506](https://github.com/JabRef/jabref/issues/10506)
 
 ### Changed
 

diff --git a/src/main/java/org/jabref/logic/formatter/Formatters.java b/src/main/java/org/jabref/logic/formatter/Formatters.java
@@ -22,6 +22,7 @@
 import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
 import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter;
 import org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter;
+import org.jabref.logic.formatter.bibtexfields.NormalizeUnicodeFormatter;
 import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter;
 import org.jabref.logic.formatter.bibtexfields.RegexFormatter;
 import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
@@ -87,6 +88,7 @@ public static List<Formatter> getOthers() {
                 new EscapeAmpersandsFormatter(),
                 new EscapeDollarSignFormatter(),
                 new ShortenDOIFormatter(),
+                new NormalizeUnicodeFormatter(),
                 new ReplaceUnicodeLigaturesFormatter(),
                 new UnprotectTermsFormatter()
         );

diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/NormalizeUnicodeFormatter.java
@@ -0,0 +1,41 @@
+package org.jabref.logic.formatter.bibtexfields;
+
+import java.text.Normalizer;
+import java.util.Objects;
+
+import org.jabref.logic.cleanup.Formatter;
+
+/**
+ * Formatter that rewrites a field value into Unicode Normalization Form C (NFC), i.e., composed characters.
+ */
+public class NormalizeUnicodeFormatter extends Formatter {
+
+    @Override
+    public String getName() {
+        return "Normalize Unicode";
+    }
+
+    @Override
+    public String getKey() {
+        return "NORMALIZE_UNICODE";
+    }
+
+    @Override
+    public String getDescription() {
+        return "Normalize Unicode characters in BibTeX fields.";
+    }
+
+    @Override
+    public String getExampleInput() {
+        // Decomposed (non-NFC) input: base letters followed by combining marks, so format() actually changes it.
+        return "He\u0301llo\u0302 Wo\u0308rld";
+    }
+
+    /** Returns {@code value} normalized to NFC; throws NullPointerException if {@code value} is null. */
+    @Override
+    public String format(String value) {
+        Objects.requireNonNull(value);
+
+        return Normalizer.normalize(value, Normalizer.Form.NFC);
+    }
+}
diff --git a/src/main/java/org/jabref/logic/integrity/FieldCheckers.java b/src/main/java/org/jabref/logic/integrity/FieldCheckers.java
@@ -53,6 +53,8 @@ private static Multimap<Field, ValueChecker> getAllMap(BibDatabaseContext databa
             fieldCheckers.put(StandardField.URLDATE, new DateChecker());
             fieldCheckers.put(StandardField.EVENTDATE, new DateChecker());
             fieldCheckers.put(StandardField.ORIGDATE, new DateChecker());
+            // NFC normalization is not registered per field here: UnicodeNormalFormCCheck is an EntryChecker
+            // that is added in IntegrityCheck and inspects every field of an entry.
         }
 
         return fieldCheckers;

diff --git a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
@@ -52,6 +52,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
             entryCheckers.addAll(List.of(
                     new ASCIICharacterChecker(),
                     new NoBibtexFieldChecker(),
+                    new UnicodeNormalFormCCheck(),
                     new BibTeXEntryTypeChecker())
             );
         }

diff --git a/src/main/java/org/jabref/logic/integrity/PersonNamesChecker.java b/src/main/java/org/jabref/logic/integrity/PersonNamesChecker.java
@@ -41,7 +41,6 @@ public Optional<String> checkValue(String value) {
                 && !authorList.getAsFirstLastNamesWithAnd().equals(value)) {
             return Optional.of(Localization.lang("Names are not in the standard %0 format.", bibMode.getFormattedName()));
         }
-
         return Optional.empty();
     }
 }
diff --git a/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCCheck.java b/src/main/java/org/jabref/logic/integrity/UnicodeNormalFormCCheck.java
@@ -0,0 +1,29 @@
+package org.jabref.logic.integrity;
+
+import java.text.Normalizer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.jabref.logic.l10n.Localization;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.Field;
+
+/**
+ * Reports every field of an entry whose value is not in Unicode Normalization Form C (NFC).
+ */
+public class UnicodeNormalFormCCheck implements EntryChecker {
+
+    /** Returns one message per field whose value is not NFC-normalized; the list is empty if all fields are. */
+    @Override
+    public List<IntegrityMessage> check(BibEntry entry) {
+        List<IntegrityMessage> results = new ArrayList<>();
+        for (Map.Entry<Field, String> field : entry.getFieldMap().entrySet()) {
+            if (!Normalizer.isNormalized(field.getValue(), Normalizer.Form.NFC)) {
+                results.add(new IntegrityMessage(Localization.lang("Value is not in Normal Form C (NFC) format"), entry,
+                        field.getKey()));
+            }
+        }
+        return results;
+    }
+}
diff --git a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java
@@ -40,7 +40,7 @@ public class HTMLUnicodeConversionMaps {
             {"168", "uml", "{\\\"{}}"}, // diaeresis = spacing diaeresis,
             //                                 U+00A8 ISOdia
             {"169", "copy", "{\\copyright}"}, // copyright sign, U+00A9 ISOnum
-            {"170", "ordf", "{\\textordfeminine}"}, // feminine ordinal indicator, U+00AA ISOnum
+            {"170", "ordf", "{\\textordfeminine}"}, // feminine ordinal indicator, U+00AA ISOnum
             {"171", "laquo", "{\\guillemotleft}"}, // left-pointing double angle quotation mark
             //                                 = left pointing guillemet, U+00AB ISOnum
             {"172", "not", "$\\neg$"}, // not sign, U+00AC ISOnum
@@ -134,6 +134,8 @@ public class HTMLUnicodeConversionMaps {
             //                                 U+00D8 ISOlat1
             {"217", "Ugrave", "{{\\`{U}}}"}, // latin capital letter U with grave,
             //                                 U+00D9 ISOlat1
+
+
             {"218", "Uacute", "{{\\'{U}}}"}, // latin capital letter U with acute,
             //                                 U+00DA ISOlat1
             {"219", "Ucirc", "{{\\^{U}}}"}, // latin capital letter U with circumflex,

diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
@@ -2633,4 +2633,6 @@ Show\ user\ comments\ field=Show user comments field
 
 More\ options...=More options...
 Treat\ all\ duplicates\ entries\ the\ same\ way=Treat all duplicates entries the same way
-Ask\ every\ time=Ask every time
+Ask\ every\ time=Ask every time
+
+Value\ is\ not\ in\ Normal\ Form\ C\ (NFC)\ format=Value is not in Normal Form C (NFC) format
diff --git a/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCCheckTest.java b/src/test/java/org/jabref/logic/integrity/UnicodeNormalFormCCheckTest.java
@@ -0,0 +1,34 @@
+package org.jabref.logic.integrity;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.StandardField;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+public class UnicodeNormalFormCCheckTest {
+    UnicodeNormalFormCCheck checker = new UnicodeNormalFormCCheck();
+    BibEntry entry = new BibEntry();
+
+    @Test
+    void checkWithNormalizedStringShouldReturnEmptyList() {
+        entry.setField(StandardField.TITLE, "Some Title");
+        entry.setField(StandardField.AUTHOR, "John Doe");
+
+        assertEquals(Collections.emptyList(), checker.check(entry));
+    }
+
+    @Test
+    void checkWithNonNormalizedStringShouldReturnIntegrityMessage() {
+        entry.setField(StandardField.TITLE, "Cafe\u0301"); // "e" + combining acute accent (U+0301): decomposed, hence not NFC
+        entry.setField(StandardField.AUTHOR, "John Doe");
+
+        assertFalse(checker.check(entry).isEmpty());
+        assertEquals(List.of(new IntegrityMessage("Value is not in Normal Form C (NFC) format", entry, StandardField.TITLE)), checker.check(entry));
+    }
+}