Skip to content

Commit

Permalink
Expose Unicode normalization methods on Texts (#7425)
Browse files Browse the repository at this point in the history
Exposes Text_Utils.normalize().
  • Loading branch information
GregoryTravis authored Aug 3, 2023
1 parent 9e9cf0b commit 037a687
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@
- [Implemented `replace` on database columns.][7275]
- [Retire `Column_Selector` and allow regex based selection of columns.][7295]
- [`Text.parse_to_table` can take a `Regex`.][7297]
- [Expose `Text.normalize`.][7425]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -778,6 +779,7 @@
[7275]: https://github.com/enso-org/enso/pull/7275
[7295]: https://github.com/enso-org/enso/pull/7295
[7297]: https://github.com/enso-org/enso/pull/7297
[7425]: https://github.com/enso-org/enso/pull/7425

#### Enso Compiler

Expand Down
18 changes: 18 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Numbers.Integer
import project.Data.Ordering.Ordering
import project.Data.Text.Normalization.Normalization
import project.Error.Error
import project.Errors.Common.Type_Error
import project.Meta
Expand Down Expand Up @@ -142,3 +143,20 @@ type Text
"14.95€".is_normalized
is_normalized : Boolean
is_normalized self = @Builtin_Method "Text.is_normalized"

## ALIAS normalise

   Perform Unicode normalization on the string, using the specified method.

   Arguments:
   - normalization: The `Normalization` method to use. Defaults to
     `Normalization.NFD`.

   > Example
     Normalize a string using the default (NFD) method.

         'aśb'.normalize == 'as\u0301b'
normalize : Normalization -> Text
normalize self normalization=Normalization.NFD =
    mode = normalization.get_java_mode
    name = normalization.get_mode_name
    Text_Utils.normalizeWithMode self name mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import project.Any.Any

import project.Data.Vector.Vector

polyglot java import com.ibm.icu.text.Normalizer2.Mode

## The Unicode normalization forms that can be requested from Enso. Each
   constructor is translated into a `com.ibm.icu.text.Normalizer2.Mode` and a
   normalization name; not every value of `Mode` has a counterpart here.
type Normalization
    ## The Unicode NFC normalization form.
    NFC

    ## The Unicode NFD normalization form.
    NFD

    ## The Unicode NFKC normalization form.
    NFKC

    ## The Unicode NFKD normalization form.
    NFKD

    ## The Unicode NFKC_CF normalization form.
    NFKCCasefold

    ## PRIVATE

       The `Mode` value that corresponds to this normalization form. The
       decomposing forms (NFD, NFKD) map to `Mode.DECOMPOSE`; every other form
       maps to `Mode.COMPOSE`.
    get_java_mode : Any
    get_java_mode self = case self of
        Normalization.NFD -> Mode.DECOMPOSE
        Normalization.NFKD -> Mode.DECOMPOSE
        Normalization.NFC -> Mode.COMPOSE
        Normalization.NFKC -> Mode.COMPOSE
        Normalization.NFKCCasefold -> Mode.COMPOSE

    ## PRIVATE

       The normalization name that corresponds to this normalization form.
       NFD and NFKD intentionally share the names of NFC and NFKC; the two
       are told apart by the value returned from `get_java_mode`.
    get_mode_name : Any
    get_mode_name self = case self of
        Normalization.NFKCCasefold -> "nfkc_cf"
        Normalization.NFKC -> "nfkc"
        Normalization.NFKD -> "nfkc"
        Normalization.NFC -> "nfc"
        Normalization.NFD -> "nfc"
2 changes: 2 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import project.Data.Text.Encoding.Encoding
import project.Data.Text.Line_Ending_Style.Line_Ending_Style
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Normalization.Normalization
import project.Data.Text.Regex.Regex
import project.Data.Text.Text
import project.Data.Text.Text_Ordering.Text_Ordering
Expand Down Expand Up @@ -118,6 +119,7 @@ export project.Data.Text.Encoding.Encoding
export project.Data.Text.Line_Ending_Style.Line_Ending_Style
export project.Data.Text.Location.Location
export project.Data.Text.Matching_Mode.Matching_Mode
export project.Data.Text.Normalization.Normalization
export project.Data.Text.Regex.Regex
export project.Data.Text.Text
export project.Data.Text.Text_Ordering.Text_Ordering
Expand Down
14 changes: 14 additions & 0 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import com.ibm.icu.text.CaseMap.Fold;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Normalizer2.Mode;
import com.ibm.icu.text.StringSearch;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -611,6 +612,19 @@ public static String normalize(String str) {
return Normalizer2.getNFDInstance().normalize(str);
}

/**
 * Applies Unicode normalization to a string, using the normalizer selected by the given name and
 * mode.
 *
 * @param str the string to normalize
 * @param name the normalization name, must be "nfc", "nfkc", or "nfkc_cf"
 * @param mode the normalization mode
 * @return the result of normalizing {@code str} with the selected normalizer
 * @see <a
 *     href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.html">Normalizer2</a>
 * @see <a
 *     href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.Mode.html">Normalizer2.Mode</a>
 */
public static String normalizeWithMode(String str, String name, Mode mode) {
  // No custom data stream is supplied (null); the normalizer is looked up by name and mode only.
  Normalizer2 normalizer = Normalizer2.getInstance(null, name, mode);
  return normalizer.normalize(str);
}

/**
* Checks if the given string consists only of whitespace characters.
*
Expand Down
17 changes: 17 additions & 0 deletions test/Tests/src/Data/Text_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,23 @@ spec =
Ordering.compare common_prefix+complex_letter_3+later_suffix common_prefix+complex_letter_1+earlier_suffix . should_equal Ordering.Greater
Ordering.compare common_prefix+complex_letter_1+later_suffix common_prefix+complex_letter_2+earlier_suffix . should_equal Ordering.Greater

Test.specify "normalizes correctly" <|
    # The same text written with a single precomposed letter and with a
    # base letter followed by a combining acute accent.
    precomposed = 'aśb'
    decomposed = 'as\u0301b'
    decomposed_codepoints = [97, 115, 769, 98]
    precomposed.codepoints . should_equal [97, 347, 98]
    decomposed.codepoints . should_equal decomposed_codepoints
    # With no argument, both spellings normalize to the decomposed sequence.
    precomposed.normalize.codepoints . should_equal decomposed_codepoints
    decomposed.normalize.codepoints . should_equal decomposed_codepoints

Test.specify "normalizes correctly using different standard Unicode normalization modes" <|
    # GREEK UPSILON WITH ACUTE AND HOOK SYMBOL gives a different result under
    # each normalization mode.
    # See https://unicode.org/faq/normalization.html
    upsilon = 'ϓ'
    codepoints_with mode = upsilon.normalize mode . codepoints
    # Calling without an argument must match the explicit NFD case below.
    upsilon.normalize.codepoints . should_equal [978, 769]
    codepoints_with Normalization.NFC . should_equal [979]
    codepoints_with Normalization.NFD . should_equal [978, 769]
    codepoints_with Normalization.NFKC . should_equal [910]
    codepoints_with Normalization.NFKD . should_equal [933, 769]
    codepoints_with Normalization.NFKCCasefold . should_equal [973]

Test.specify "should correctly handle case-insensitive equality" <|
"aBc" . equals_ignore_case "Abc" . should_be_true
"abc" . equals_ignore_case "abd" . should_be_false
Expand Down

0 comments on commit 037a687

Please sign in to comment.