Skip to content

Commit

Permalink
Change implementation to use take, add some doc and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Feb 23, 2022
1 parent d4bde3c commit 52c108c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -722,16 +722,30 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
ensures that different ways of expressing the same character in the
underlying binary representation are considered equal.

This, however, is not always handled well by the regex engine. The behaviour
is as follows:

'ś' . starts_with 's' == False
's\u{301}' . starts_with 's' == False
's\u{301}' . starts_with 'ś' == True
'ś' . starts_with 's\u{301}' == True

'ś' . starts_with 's' (Regex_Matcher.new) == True
's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True

> Example
See if the text "Hello" starts with the prefix "hi".

"Hello".starts_with "hi"
Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
Text_Matcher case_sensitivity -> case case_sensitivity of
True -> Text_Utils.starts_with this prefix
True ->
this.take (Text_Sub_Range.First prefix.length) == prefix
Case_Insensitive locale ->
Text_Utils.starts_with (this.to_case_insensitive_key locale) (prefix.to_case_insensitive_key locale)
this.take (Text_Sub_Range.First prefix.length) . equals_ignore_case prefix locale=locale
Regex_Matcher _ _ _ _ _ ->
preprocessed_pattern = "\A(?:" + prefix + ")"
compiled_pattern = here.prepare_regex preprocessed_pattern matcher
Expand Down Expand Up @@ -761,9 +775,10 @@ Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
Text_Matcher case_sensitivity -> case case_sensitivity of
True -> Text_Utils.ends_with this suffix
True ->
this.take (Text_Sub_Range.Last suffix.length) == suffix
Case_Insensitive locale ->
Text_Utils.ends_with (this.to_case_insensitive_key locale) (suffix.to_case_insensitive_key locale)
this.take (Text_Sub_Range.Last suffix.length) . equals_ignore_case suffix locale=locale
Regex_Matcher _ _ _ _ _ ->
preprocessed_pattern = "(?:" + suffix + ")\z"
compiled_pattern = here.prepare_regex preprocessed_pattern matcher
Expand Down
34 changes: 0 additions & 34 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -157,40 +157,6 @@ public static String from_chars(char[] chars) {
return String.valueOf(chars);
}

/**
 * Tells whether {@code str} begins with {@code prefix}.
 *
 * <p>The check is delegated to {@code StringSearch}: {@code prefix} is searched for inside
 * {@code str}, and the result is true only when the first match starts at index 0.
 *
 * @param str the string to check
 * @param prefix the potential prefix
 * @return {@code true} if {@code prefix} is a prefix of {@code str}; an empty {@code prefix}
 *     is a prefix of every string
 */
public static boolean starts_with(String str, String prefix) {
  // {@code StringSearch} does not treat empty strings the way we want, so both empty cases are
  // answered up front: an empty prefix always matches, and a non-empty prefix never matches an
  // empty string.
  if (prefix.isEmpty()) {
    return true;
  }
  if (str.isEmpty()) {
    return false;
  }

  int firstMatchStart = new StringSearch(prefix, str).first();
  return firstMatchStart == 0;
}

/**
 * Tells whether {@code str} finishes with {@code suffix}.
 *
 * <p>The check is delegated to {@code StringSearch}: {@code suffix} is searched for inside
 * {@code str}, and the result is true only when the last match ends exactly at the end of
 * {@code str}.
 *
 * @param str the string to check
 * @param suffix the potential suffix
 * @return {@code true} if {@code suffix} is a suffix of {@code str}; an empty {@code suffix}
 *     is a suffix of every string
 */
public static boolean ends_with(String str, String suffix) {
  // {@code StringSearch} does not treat empty strings the way we want, so both empty cases are
  // answered up front: an empty suffix always matches, and a non-empty suffix never matches an
  // empty string.
  if (suffix.isEmpty()) {
    return true;
  }
  if (str.isEmpty()) {
    return false;
  }

  StringSearch search = new StringSearch(suffix, str);
  // Locate the last match first; the match length is read only after the search has run.
  int lastMatchStart = search.last();
  int lastMatchEnd = lastMatchStart + search.getMatchLength();
  return lastMatchEnd == str.length();
}

/**
* Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
* normalization.
Expand Down
21 changes: 19 additions & 2 deletions test/Tests/src/Data/Text_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -472,12 +472,29 @@ spec =
"123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
"foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false

# Correct non-regex behaviour for reference.
'ś' . starts_with 's' == False
's\u{301}' . starts_with 's' == False
's\u{301}' . starts_with 'ś' == True
'ś' . starts_with 's\u{301}' == True

# These two behave as expected.
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True

## These two are included to document the current behaviour
(even though ideally, we would want them to return False).
'ś' . starts_with 's' (Regex_Matcher.new) == True
's\u{301}' . starts_with 's' (Regex_Matcher.new) == True

"ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
"ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
# TODO ugh...

## The two tests below are disabled because of how the regex engine handles
letters with accents. See the tests above for an explanation.
#"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
# 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false

"fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
Expand Down

0 comments on commit 52c108c

Please sign in to comment.