From 03e6e123fc55fd8c949c53f28a88b8e9f2be4105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 18 Feb 2022 11:17:22 +0100 Subject: [PATCH] Fix tests, implement exact matching --- .../0.0.0-dev/src/Data/Text/Extensions.enso | 18 +++++++++++++----- .../main/java/org/enso/base/Text_Utils.java | 4 ++++ test/Tests/src/Data/Text_Spec.enso | 8 ++++++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 01c2c0e37f37..3e09579512f7 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -780,12 +780,20 @@ Text.ends_with suffix = Text_Utils.ends_with this suffix > Example See if the text "Hello!" contains the text 'LO', ignoring case sensitivity. - "Hello!".contains 'LO' (Text_Matcher Case_Insensitive.new) + "Hello!".contains "LO" (Text_Matcher Case_Insensitive.new) + + > Example + See if the text "Hello!" contains any lowercase letters, using a regex. + + "Hello!".contains "[a-z]" Regex_Matcher.new Text.contains : Text -> (Text_Matcher | Regex_Matcher) -> Boolean -Text.contains term="" matcher=Text_Matcher.new = - # TODO - _ = matcher - Text_Utils.contains this term +Text.contains term="" matcher=Text_Matcher.new = case matcher of + Text_Matcher case_sensitivity -> case case_sensitivity of + True -> Text_Utils.contains this term + Case_Insensitive locale -> + Text_Utils.contains (this.to_lower_case locale) (term.to_lower_case locale) + Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments -> + Error.throw "TODO" ## Text to JSON conversion. diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 8ca32a7e087f..687610393bca 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -1,6 +1,7 @@ package org.enso.base; import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Normalizer2; import java.nio.charset.StandardCharsets; import java.util.regex.Pattern; @@ -210,6 +211,9 @@ public static int compare_normalized(String a, String b) { * @return whether {@code substring} is a substring of {@code string}. */ public static boolean contains(String string, String substring) { + Normalizer2 normalizer = Normalizer2.getNFDInstance(); + string = normalizer.normalize(string); + substring = normalizer.normalize(substring); return string.contains(substring); } diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 4bce3ceeb1d8..f11d767c6708 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -206,11 +206,15 @@ spec = Test.specify "should check for contains using Unicode normalization" <| "Hello".contains "ell" . should_be_true + "Cześć".contains 's\u{301}' . should_be_true + "Cześć".contains 'c\u{301}' . should_be_true "Cześć".contains 'ść' . should_be_true - "Czes\u{301}c\u{301}".contains 'ść' . should_be_true + 'Czes\u{301}c\u{301}'.contains 'ś' . should_be_true + 'Czes\u{301}c\u{301}'.contains 'ć' . should_be_true + 'Czes\u{301}c\u{301}'.contains 'ść' . should_be_true "Cześć".contains 'sc' . should_be_false - "Czes\u{301}c\u{301}".contains 'sc' . should_be_false + 'Czes\u{301}c\u{301}'.contains 'sc' . should_be_false "ABC" . contains "a" . should_be_false "" . contains "foo" . should_be_false