From 037a6874014524a42ed7c3f09b75434696af1488 Mon Sep 17 00:00:00 2001 From: GregoryTravis Date: Thu, 3 Aug 2023 14:07:00 -0400 Subject: [PATCH] Expose Unicode normalization methods on Texts (#7425) Exposes Text_Utils.normalize(). --- CHANGELOG.md | 2 + .../Base/0.0.0-dev/src/Data/Text.enso | 18 ++++++++ .../src/Data/Text/Normalization.enso | 41 +++++++++++++++++++ .../lib/Standard/Base/0.0.0-dev/src/Main.enso | 2 + .../main/java/org/enso/base/Text_Utils.java | 14 +++++++ test/Tests/src/Data/Text_Spec.enso | 17 ++++++++ 6 files changed, 94 insertions(+) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Normalization.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index c1034101c382..b4ac4d776b08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -546,6 +546,7 @@ - [Implemented `replace` on database columns.][7275] - [Retire `Column_Selector` and allow regex based selection of columns.][7295] - [`Text.parse_to_table` can take a `Regex`.][7297] +- [Expose `Text.normalize`.][7425] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -778,6 +779,7 @@ [7275]: https://github.com/enso-org/enso/pull/7275 [7295]: https://github.com/enso-org/enso/pull/7295 [7297]: https://github.com/enso-org/enso/pull/7297 +[7425]: https://github.com/enso-org/enso/pull/7425 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso index acef479c2035..47973bc04eff 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso @@ -2,6 +2,7 @@ import project.Any.Any import project.Data.Locale.Locale import project.Data.Numbers.Integer import project.Data.Ordering.Ordering +import project.Data.Text.Normalization.Normalization import project.Error.Error import project.Errors.Common.Type_Error import project.Meta @@ -142,3 +143,20 @@ type Text 
"14.95€".is_normalized is_normalized : Boolean is_normalized self = @Builtin_Method "Text.is_normalized" + + ## ALIAS normalise + + Perform Unicode normalization on the string, using the specified method. + + Arguments: + - normalization: The `Normalization` method to use. Defaults to `Normalization.NFD`. + + > Example + Normalize a string. + + 'aśb'.normalize == 'as\u0301b' + normalize : Normalization -> Text + normalize self normalization=Normalization.NFD = + mode = normalization.get_java_mode + name = normalization.get_mode_name + Text_Utils.normalizeWithMode self name mode diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Normalization.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Normalization.enso new file mode 100644 index 000000000000..7e921b1210be --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Normalization.enso @@ -0,0 +1,41 @@ +import project.Any.Any + +import project.Data.Vector.Vector + +polyglot java import com.ibm.icu.text.Normalizer2.Mode + +## Unicode normalization forms. Each form is handed to ICU as a data name plus a + `com.ibm.icu.text.Normalizer2.Mode`, so it is not a one-to-one copy of `Mode`. +type Normalization + ## Unicode NFC normalization mode. + NFC + + ## Unicode NFD normalization mode. + NFD + + ## Unicode NFKC normalization mode. + NFKC + + ## Unicode NFKD normalization mode. + NFKD + + ## Unicode NFKC_CF normalization mode. 
+ NFKCCasefold + + ## PRIVATE + get_java_mode : Any + get_java_mode self = case self of + Normalization.NFC -> Mode.COMPOSE + Normalization.NFD -> Mode.DECOMPOSE + Normalization.NFKC -> Mode.COMPOSE + Normalization.NFKD -> Mode.DECOMPOSE + Normalization.NFKCCasefold -> Mode.COMPOSE + + ## PRIVATE + get_mode_name : Any + get_mode_name self = case self of + Normalization.NFC -> "nfc" + Normalization.NFD -> "nfc" + Normalization.NFKC -> "nfkc" + Normalization.NFKD -> "nfkc" + Normalization.NFKCCasefold -> "nfkc_cf" diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso index 0f23a5386d05..0bd199a21271 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso @@ -31,6 +31,7 @@ import project.Data.Text.Encoding.Encoding import project.Data.Text.Line_Ending_Style.Line_Ending_Style import project.Data.Text.Location.Location import project.Data.Text.Matching_Mode.Matching_Mode +import project.Data.Text.Normalization.Normalization import project.Data.Text.Regex.Regex import project.Data.Text.Text import project.Data.Text.Text_Ordering.Text_Ordering @@ -118,6 +119,7 @@ export project.Data.Text.Encoding.Encoding export project.Data.Text.Line_Ending_Style.Line_Ending_Style export project.Data.Text.Location.Location export project.Data.Text.Matching_Mode.Matching_Mode +export project.Data.Text.Normalization.Normalization export project.Data.Text.Regex.Regex export project.Data.Text.Text export project.Data.Text.Text_Ordering.Text_Ordering diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index c686abef3e31..60aea905fc92 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -5,6 +5,7 @@ import com.ibm.icu.text.CaseMap.Fold; import com.ibm.icu.text.Normalizer; import 
com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.Normalizer2.Mode; import com.ibm.icu.text.StringSearch; import java.util.ArrayList; import java.util.List; @@ -611,6 +612,19 @@ public static String normalize(String str) { return Normalizer2.getNFDInstance().normalize(str); } + /** + * Normalizes the string using the ICU normalizer selected by the given name and mode. + * @param str the string to normalize + * @param name the normalization name, must be "nfc", "nfkc", or "nfkc_cf" + * @param mode the normalization mode, e.g. {@code Mode.COMPOSE} or {@code Mode.DECOMPOSE} + * @return the normalized string + * @see <a href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.html">Normalizer2</a> + * @see <a href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.Mode.html">Normalizer2.Mode</a> + */ + public static String normalizeWithMode(String str, String name, Mode mode) { + return Normalizer2.getInstance(null, name, mode).normalize(str); + } + /** * Checks if the given string consists only of whitespace characters. * diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 6cb577e887c9..d4845531398b 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -134,6 +134,23 @@ spec = Ordering.compare common_prefix+complex_letter_3+later_suffix common_prefix+complex_letter_1+earlier_suffix . should_equal Ordering.Greater Ordering.compare common_prefix+complex_letter_1+later_suffix common_prefix+complex_letter_2+earlier_suffix . should_equal Ordering.Greater + Test.specify "normalizes correctly" <| + 'aśb'.codepoints . should_equal [97, 347, 98] + 'as\u0301b'.codepoints . should_equal [97, 115, 769, 98] + 'aśb'.normalize.codepoints . should_equal [97, 115, 769, 98] + 'as\u0301b'.normalize.codepoints . should_equal [97, 115, 769, 98] + + Test.specify "normalizes correctly using different standard Unicode normalization modes" <| + ## GREEK UPSILON WITH ACUTE AND HOOK SYMBOL behaves differently with each mode. + See https://unicode.org/faq/normalization.html + s = 'ϓ' + s.normalize . codepoints . 
should_equal [978, 769] + s.normalize Normalization.NFC . codepoints . should_equal [979] + s.normalize Normalization.NFD . codepoints . should_equal [978, 769] + s.normalize Normalization.NFKC . codepoints . should_equal [910] + s.normalize Normalization.NFKD . codepoints . should_equal [933, 769] + s.normalize Normalization.NFKCCasefold . codepoints . should_equal [973] + Test.specify "should correctly handle case-insensitive equality" <| "aBc" . equals_ignore_case "Abc" . should_be_true "abc" . equals_ignore_case "abd" . should_be_false