From 247b2843163c0e9838839250472f0807f984d7cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Sat, 12 Mar 2022 20:42:00 +0100 Subject: [PATCH] Data analysts should be able to use `Text.location_of` to find indexes within string using various matchers (#3324) Implements https://www.pivotaltracker.com/n/projects/2539304/stories/181266029 --- CHANGELOG.md | 4 +- .../0.0.0-dev/src/Data/Text/Extensions.enso | 235 ++++++++++++++-- .../src/Data/Text/Matching_Mode.enso | 5 + .../src/Data/Text/Regex/Engine/Default.enso | 48 ++-- .../0.0.0-dev/src/Data/Text/Regex/Mode.enso | 4 +- .../Base/0.0.0-dev/src/Data/Text/Span.enso | 137 ++++++++-- .../src/Data/Text/Text_Sub_Range.enso | 24 +- .../src/main/resources/application.conf | 2 +- .../main/java/org/enso/base/Text_Utils.java | 257 +++++++++++++++++- .../org/enso/base/text/CaseFoldedString.java | 135 +++++++++ .../java/org/enso/base/text/GraphemeSpan.java | 28 ++ .../org/enso/base/text/IntArrayBuilder.java | 65 +++++ .../java/org/enso/base/text/StringSlice.java | 34 +++ .../java/org/enso/base/text/Utf16Span.java | 18 ++ .../Data/Text/Default_Regex_Engine_Spec.enso | 34 ++- test/Tests/src/Data/Text/Regex_Spec.enso | 1 + test/Tests/src/Data/Text/Span_Spec.enso | 30 +- test/Tests/src/Data/Text/Utils_Spec.enso | 61 +++++ test/Tests/src/Data/Text_Spec.enso | 220 ++++++++++++++- test/Tests/src/Examples_Spec.enso | 1 + test/Tests/src/Main.enso | 2 + 21 files changed, 1236 insertions(+), 109 deletions(-) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso create mode 100644 std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/StringSlice.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java create mode 100644 test/Tests/src/Data/Text/Utils_Spec.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cb8119fe28f..12ea6b1a9c5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ - [Implemented `Bool.compare_to` method][3317] - [Implemented `Map.first`, `Map.last` functions. 
Expanded `Table.group_by` to also compute mode, percentile, minimum, maximum.][3318] +- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -100,7 +101,8 @@ [3236]: https://github.com/enso-org/enso/pull/3236 [3311]: https://github.com/enso-org/enso/pull/3311 [3317]: https://github.com/enso-org/enso/pull/3317 -[3317]: https://github.com/enso-org/enso/pull/3318 +[3318]: https://github.com/enso-org/enso/pull/3318 +[3324]: https://github.com/enso-org/enso/pull/3324 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 930e5caf1a2a..a7e53d5c6199 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -5,9 +5,11 @@ from Standard.Builtins import Text, Prim_Text_Helpers import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex.Mode +import Standard.Base.Data.Text.Matching_Mode import Standard.Base.Data.Text.Case import Standard.Base.Data.Text.Location import Standard.Base.Data.Text.Line_Ending_Style +from Standard.Base.Data.Text.Span as Span_Module import Span import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Text.Text_Sub_Range import Standard.Base.Data.Locale @@ -15,6 +17,7 @@ import Standard.Base.Meta from Standard.Builtins export Text +export Standard.Base.Data.Text.Matching_Mode export Standard.Base.Data.Text.Case export Standard.Base.Data.Text.Location export Standard.Base.Data.Text.Split_Kind @@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te (('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True Text.equals_ignore_case : Text -> Locale -> Boolean Text.equals_ignore_case that locale=Locale.default = - (this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale) + Text_Utils.equals_ignore_case this that locale.java_locale ## ADVANCED PRIVATE @@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default = used to perform case-insensitive comparisons. Text.to_case_insensitive_key : Locale -> Text Text.to_case_insensitive_key locale=Locale.default = - this.to_case Case.Lower locale . to_case Case.Upper locale + Text_Utils.case_insensitive_key this locale.java_locale ## Compare two texts to discover their ordering. @@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of Text_Matcher case_sensitivity -> case case_sensitivity of True -> Text_Utils.contains this term Case_Insensitive locale -> - Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale) + Text_Utils.contains_case_insensitive this term locale.java_locale Regex_Matcher _ _ _ _ _ -> compiled_pattern = matcher.compile term match = compiled_pattern.match this Mode.First @@ -952,27 +955,6 @@ Text.repeat count=1 = https://www.pivotaltracker.com/story/show/181435598 0.up_to (count.max 0) . fold "" acc-> _-> acc + this -## PRIVATE - Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points -range_to_char_indices : Text -> Range -> Range ! 
Index_Out_Of_Bounds_Error -range_to_char_indices text range = - len = text.length - start = if range.start < 0 then range.start + len else range.start - end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end) - is_valid = (Range 0 len+1).contains - - case (Pair (is_valid start) (is_valid end)) of - Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len) - Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len) - Pair True True -> - if start>=end then (Range 0 0) else - iterator = BreakIterator.getCharacterInstance - iterator.setText text - - start_index = iterator.next start - end_index = iterator.next (end - start) - Range start_index end_index - ## ALIAS first, last, left, right, mid, substring Creates a new Text by selecting the specified range of the input. @@ -1009,7 +991,7 @@ range_to_char_indices text range = Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.take range = char_range = case range of - Range _ _ -> here.range_to_char_indices this range + Range _ _ -> Span_Module.range_to_char_indices this range _ -> range.to_char_range this Text_Utils.substring this char_range.start char_range.end @@ -1049,7 +1031,7 @@ Text.take range = Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.drop range = char_range = case range of - Range _ _ -> here.range_to_char_indices this range + Range _ _ -> Span_Module.range_to_char_indices this range _ -> range.to_char_range this if char_range.start == 0 then Text_Utils.drop_first this char_range.end else prefix = Text_Utils.substring this 0 char_range.start @@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace = loop current break_iterator.previous if start_index >= end_index then "" else Text_Utils.substring this start_index end_index + +## ALIAS find, index_of, position_of, span_of + Find the location of the `term` in the input. + Returns a Span representing the location at which the term was found, or + `Nothing` if the term was not found in the input. + + Arguments: + - term: The term to find. + - mode: Specifies if the first or last occurrence of the term should be + returned if there are multiple occurrences within the input. The first + occurrence is returned by default. + - matcher: Specifies how the term is matched against the input: + - If a `Text_Matcher`, the text is compared using case-sensitively rules + specified in the matcher. + - If a `Regex_Matcher`, the `term` is used as a regular expression and + matched using the associated options. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Finding location of a substring. + + "Hello World!".location_of "J" == Nothing + "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!" + "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!" + + ! Match Length + The function returns not only the index of the match but a `Span` instance + which contains both the start and end indices, allowing to determine the + length of the match. This is useful not only with regex matches (where a + regular expression can have matches of various lengths) but also for case + insensitive matching. 
In case insensitive mode, a single character can + match multiple characters, for example `ß` will match `ss` and `SS`, and + the ligature `ffi` will match `ffi` or `f` etc. Thus in case insensitive + mode, the length of the match can be shorter or longer than the term that + was being matched, so it is extremely important to not rely on the length + of the matched term when analysing the matches as they may have different + lengths. + + > Example + Match length differences in case insensitive matching. + + term = "straße" + text = "MONUMENTENSTRASSE 42" + match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new) + term.length == 6 + match.length == 7 + + ! Matching Grapheme Clusters + In case insensitive mode, a single character can match multiple characters, + for example `ß` will match `ss` and `SS`, and the ligature `ffi` will match + `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to + match only a part of some single grapheme cluster, for example in the text + `ffia` the term `ia` will match just one-third of the first grapheme `ffi`. + Since we do not have the resolution to distinguish such partial matches + (as that would require non-integer indices), so a match which matched just + a part of some grapheme cluster is extended and treated as if it matched + the whole grapheme cluster. + + > Example + Extending matches to full grapheme clusters. + + ligatures = "ffiffl" + ligatures.length == 2 + term_1 = "IFF" + match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new) + term_1.length == 3 + match_1.length == 2 + term_2 = "ffiffl" + match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new) + term_2.length == 6 + match_2.length == 2 + # After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters. + match_1 == match_2 +Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing +Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of + Text_Matcher case_sensitive -> case case_sensitive of + True -> + codepoint_span = case mode of + Matching_Mode.First -> Text_Utils.span_of this term + Matching_Mode.Last -> Text_Utils.last_span_of this term + if codepoint_span.is_nothing then Nothing else + start = Text_Utils.utf16_index_to_grapheme_index this codepoint_span.start + ## While the codepoint_span may have different code unit length + from our term, the `length` counted in grapheme clusters is + guaranteed to be the same. + end = start + term.length + Span (Range start end) this + Case_Insensitive locale -> case term.is_empty of + True -> case mode of + Matching_Mode.First -> Span (Range 0 0) this + Matching_Mode.Last -> + end = this.length + Span (Range end end) this + False -> + search_for_last = case mode of + Matching_Mode.First -> False + Matching_Mode.Last -> True + case Text_Utils.span_of_case_insensitive this term locale.java_locale search_for_last of + Nothing -> Nothing + grapheme_span -> + Span (Range grapheme_span.start grapheme_span.end) this + Regex_Matcher _ _ _ _ _ -> case mode of + Matching_Mode.First -> + case matcher.compile term . match this Mode.First of + Nothing -> Nothing + match -> match.span 0 . to_grapheme_span + Matching_Mode.Last -> + case matcher.compile term . match this Mode.All of + Nothing -> Nothing + matches -> matches.last.span 0 . 
to_grapheme_span + +## ALIAS find_all, index_of_all, position_of_all, span_of_all + Finds all the locations of the `term` in the input. + If not found, the function returns an empty Vector. + + Arguments: + - term: The term to find. + - matcher: Specifies how the term is matched against the input: + - If a `Text_Matcher`, the text is compared using case-sensitively rules + specified in the matcher. + - If a `Regex_Matcher`, the `term` is used as a regular expression and + matched using the associated options. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Finding locations of all occurrences of a substring. + + "Hello World!".location_of_all "J" == [] + "Hello World!".location_of_all "o" . map .start == [4, 7] + + ! Match Length + The function returns not only the index of the match but a `Span` instance + which contains both the start and end indices, allowing to determine the + length of the match. This is useful not only with regex matches (where a + regular expression can have matches of various lengths) but also for case + insensitive matching. In case insensitive mode, a single character can + match multiple characters, for example `ß` will match `ss` and `SS`, and + the ligature `ffi` will match `ffi` or `f` etc. Thus in case insensitive + mode, the length of the match can be shorter or longer than the term that + was being matched, so it is extremely important to not rely on the length + of the matched term when analysing the matches as they may have different + lengths. + + > Example + Match length differences in case insensitive matching. + + term = "strasse" + text = "MONUMENTENSTRASSE ist eine große Straße." + match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new) + term.length == 7 + match . map .length == [7, 6] + + ! Matching Grapheme Clusters + In case insensitive mode, a single character can match multiple characters, + for example `ß` will match `ss` and `SS`, and the ligature `ffi` will match + `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to + match only a part of some single grapheme cluster, for example in the text + `ffia` the term `ia` will match just one-third of the first grapheme `ffi`. + Since we do not have the resolution to distinguish such partial matches + (as that would require non-integer indices), so a match which matched just + a part of some grapheme cluster is extended and treated as if it matched + the whole grapheme cluster. + + > Example + Extending matches to full grapheme clusters. + + ligatures = "ffifflFFIFF" + ligatures.length == 7 + match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new) + match_1 . map .length == [2, 3] + match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new) + match_2 . 
map .length == [2, 5] +Text.location_of_all : Text -> Matcher -> [Span] +Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of + Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of + True -> + codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term + grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array + ## While the codepoint_spans may have different code unit lengths + from our term, the `length` counted in grapheme clusters is + guaranteed to be the same. + offset = term.length + grahpeme_ixes . map start-> + end = start+offset + Span (Range start end) this + Case_Insensitive locale -> + grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale + grapheme_spans.map grapheme_span-> + Span (Range grapheme_span.start grapheme_span.end) this + Regex_Matcher _ _ _ _ _ -> + case matcher.compile term . match this Mode.All of + Nothing -> [] + matches -> matches.map m-> m.span 0 . to_grapheme_span diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso new file mode 100644 index 000000000000..d6b0a31b50b4 --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso @@ -0,0 +1,5 @@ +## Matches the first found instance. +type First + +## Matches the last found instance. +type Last diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso index e4b1c1ff65b1..102318973833 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso @@ -40,7 +40,7 @@ import Standard.Base.Data.Text.Regex.Engine import Standard.Base.Data.Text.Regex.Option as Global_Option import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Polyglot.Java as Java_Ext -import Standard.Base.Data.Text.Span +from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span from Standard.Builtins import Java @@ -183,8 +183,13 @@ type Pattern on the encoding, we normalize all input. build_matcher : Text -> Integer -> Integer -> Java_Matcher build_matcher input start end = - normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else - Text_Utils.normalize input + ## TODO [RW] Normalization had to be disabled - since start and end are + in code unit space, normalization could shift these indices! + This should be addressed when reviewing + See: https://www.pivotaltracker.com/story/show/181524498 + #normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else + # Text_Utils.normalize input + normalized_input = input internal_matcher = this.internal_pattern.matcher normalized_input . region start end if this.options.contains No_Anchoring_Bounds then @@ -262,7 +267,7 @@ type Pattern internal_matcher = this.build_matcher input start end if internal_matcher . find start . not then Nothing else - Match internal_matcher start end + Match internal_matcher start end input Integer -> if mode < 0 then Panic.throw <| Mode_Error "Cannot match a negative number of times." 
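The hunks that follow also make the matching loops robust against zero-width matches: when the end of a match is not past the current offset, the offset is bumped by one so the search always makes progress. A minimal, self-contained illustration of the underlying hazard using plain `java.util.regex` (not the Enso engine itself; the class and variable names are illustrative only):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class EmptyMatchProgress {
  public static void main(String[] args) {
    Matcher m = Pattern.compile("a*").matcher("bb");
    int offset = 0;
    int matches = 0;
    while (offset <= 2 && m.find(offset)) {
      matches++;
      int matchEnd = m.end();
      // Without this guard a zero-width match leaves `offset` unchanged and
      // the loop never terminates; bumping by one guarantees progress.
      offset = matchEnd > offset ? matchEnd : offset + 1;
    }
    System.out.println(matches); // 3 zero-width matches, at offsets 0, 1 and 2
  }
}
```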
@@ -272,13 +277,16 @@ type Pattern go : Integer -> Integer -> Nothing go offset remaining_count = should_continue = remaining_count > 0 - if should_continue.not || (offset > end) then Nothing else + if should_continue.not || (offset >= end) then Nothing else internal_matcher = this.build_matcher input start end found = internal_matcher.find offset if found.not then Nothing else - builder.append (Match internal_matcher start end) - @Tail_Call go (internal_matcher.end 0) remaining_count-1 + builder.append (Match internal_matcher start end input) + match_end = internal_matcher.end 0 + # Ensure progress even if the match is an empty string. + new_offset = if match_end > offset then match_end else offset+1 + @Tail_Call go new_offset remaining_count-1 go start mode vector = builder.to_vector @@ -294,8 +302,11 @@ type Pattern found = internal_matcher.find offset if found.not then Nothing else - builder.append (Match internal_matcher start end) - @Tail_Call go (internal_matcher.end 0) + builder.append (Match internal_matcher start end input) + match_end = internal_matcher.end 0 + # Ensure progress even if the match is an empty string. + new_offset = if match_end > offset then match_end else offset+1 + @Tail_Call go new_offset go start vector = builder.to_vector @@ -304,7 +315,7 @@ type Pattern Mode.Full -> internal_matcher = this.build_matcher input start end if internal_matcher.matches.not then Nothing else - Match internal_matcher start end + Match internal_matcher start end input Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Modes cannot be recursive." @@ -312,7 +323,7 @@ type Pattern Mode.Bounded start end sub_mode -> if start < end then do_match_mode sub_mode start end else Panic.throw Invalid_Bounds_Error - _ -> do_match_mode mode 0 input.length + _ -> do_match_mode mode 0 (Text_Utils.char_length input) ## ADVANCED @@ -334,7 +345,7 @@ type Pattern pattern.matches input matches : Text -> Boolean matches input = case this.match input mode=Mode.Full of - Match _ _ _ -> True + Match _ _ _ _ -> True Vector.Vector _ -> True _ -> False @@ -405,7 +416,7 @@ type Pattern find input mode=Mode.All = matches = this.match input mode case matches of - Match _ _ _ -> matches.group 0 + Match _ _ _ _ -> matches.group 0 Vector.Vector _ -> matches.map (_.group 0) _ -> matches @@ -548,7 +559,7 @@ type Pattern internal_matcher.replaceAll replacement Mode.Full -> case this.match input mode=Mode.Full of - Match _ _ _ -> replacement + Match _ _ _ _ -> replacement Nothing -> input Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Modes cannot be recursive." @@ -556,7 +567,7 @@ type Pattern case mode of Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Bounded replacements are not well-formed." - _ -> do_replace_mode mode 0 input.length + _ -> do_replace_mode mode 0 (Text_Utils.char_length input) ## The default implementation of the `Data.Text.Regex.Engine.Match` interface. type Match @@ -570,7 +581,8 @@ type Match match. - region_start: The start of the region over which the match was made. - region_end: The end of the region over which the match was made. - type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) + - input: The input text that was being matched. + type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text) ## Gets the text matched by the group with the provided identifier, or `Nothing` if the group did not participate in the match. 
If no such group @@ -743,10 +755,10 @@ type Match example_Span = match = Examples.match match.span 0 - span : Integer | Text -> Span | Nothing ! Regex.No_Such_Group_Error + span : Integer | Text -> Utf_16_Span | Nothing ! Regex.No_Such_Group_Error span id = case this.group id of Nothing -> Nothing - _ -> Span.new (this.start id) (this.end id) (this.group 0) + _ -> Utf_16_Span (Range (this.start id) (this.end id)) this.input ## Returns the start character index of the match's region. diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso index 342eb2223920..1db9cf80eaab 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso @@ -4,11 +4,13 @@ to matching on the `Full` content of the input text. from Standard.Base import all +from Standard.Base.Data.Text.Matching_Mode import First +from Standard.Base.Data.Text.Matching_Mode export First type Mode ## The regex will only match the first instance it finds. - type First + First ## The regex will match up to some `Integer` number of instances. Integer diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso index 4875a7eeb8fc..f357d139719e 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso @@ -7,30 +7,14 @@ example_span = text = "Hello!" - Span.new 0 3 text + Span 0 3 text from Standard.Base import all -import Standard.Base.Data.Range +from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error -## Construct a new `Span`. - - Arguments: - - start: The index of the first character included in the span. - - end: The index of the first character after `start` that is _not_ included - in the span. - - text: The `Text` over which the span exists. This is _optional_. - - > Example - Creating a span over the first three characters of the text "hello!". - - import Standard.Base.Data.Text.Span - - example_span = - text = "Hello!" - Span.new 0 3 text -new : Integer -> Integer -> Text | Nothing -> Span -new start end text=Nothing = Span (start.up_to end) text +polyglot java import org.enso.base.Text_Utils +polyglot java import com.ibm.icu.text.BreakIterator type Span @@ -38,7 +22,7 @@ type Span Arguments: - range: The range of characters over which the span exists. - - text: The text over which the span exists. This is _optional_. + - text: The text over which the span exists. ! What is a Character? A character is defined as an Extended Grapheme Cluster, see Unicode @@ -54,7 +38,7 @@ type Span text = "Hello!" range = 0.up_to 3 Span.Span range text - type Span (range : Range.Range) (text : (Text | Nothing) = Nothing) + type Span (range : Range.Range) (text : Text) ## The index of the first character included in the span. @@ -74,3 +58,112 @@ type Span meaning in most text-processing applications. end : Integer end = this.range.end + + ## The length of the span in extended grapheme clusters. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + length : Integer + length = this.range.length + + ## Converts the span of extended grapheme clusters to a corresponding span + of UTF-16 code units. 
+ + > Example + Find the span of code units corresponding to the span of extended grapheme clusters. + + text = 'ae\u{301}fz' + (Span (Range 1 3) text).to_utf_16_span == (Utf_16_Span (Range 1 4) text) + to_utf_16_span : Utf_16_Span + to_utf_16_span = + Utf_16_Span (here.range_to_char_indices this.text this.range) this.text + +type Utf_16_Span + + ## A representation of a span of UTF-16 code units in Enso's `Text` type. + + Arguments: + - range: The range of code units over which the span exists. + - text: The text over which the span exists. + + > Example + Creating a span over the first three code units of the text 'a\u{301}bc'. + + import Standard.Base.Data.Text.Span + + example_span = + text = 'a\u{301}bc' + Span.Utf_16_Span (Range 0 3) text + type Utf_16_Span (range : Range.Range) (text : Text) + + ## The index of the first code unit included in the span. + start : Integer + start = this.range.start + + ## The index of the first code unit after `start` that is _not_ included in + the span. + end : Integer + end = this.range.end + + ## The length of the span in UTF-16 code units. + length : Integer + length = this.range.length + + ## Returns a span of extended grapheme clusters which is the closest + approximation of this span of code units. + + The resulting span is extended in such a way that every code unit that + was contained by the original span is also contained in a new span. Since + some grapheme clusters consist of multiple code units, after the span was + extended it may also contain code units which were not contained inside + of the original span. + + > Example + Convert a codepoint span to graphemes and back. + + text = 'a\u{301}e\u{302}o\u{303}' + span = Utf_16_Span (Range 1 5) text # The span contains the units [\u{301}, e, \u{302}, o]. + extended = span.to_grapheme_span + extended == Span (Range 0 3) text # The span is extended to the whole string since it contained code units from every grapheme cluster. + extended.to_utf_16_span == Utf_16_Span (Range 0 6) text + to_grapheme_span : Span + to_grapheme_span = if (this.start < 0) || (this.end > Text_Utils.char_length this.text) then Error.throw (Illegal_State_Error "Utf_16_Span indices are out of range of the associated text.") else + if this.end < this.start then Error.throw (Illegal_State_Error "Utf_16_Span invariant violation: start <= end") else + case this.start == this.end of + True -> + grapheme_ix = Text_Utils.utf16_index_to_grapheme_index this.text this.start + Span (Range grapheme_ix grapheme_ix) this.text + False -> + grapheme_ixes = Text_Utils.utf16_indices_to_grapheme_indices this.text [this.start, this.end - 1].to_array + grapheme_first = grapheme_ixes.at 0 + grapheme_last = grapheme_ixes.at 1 + ## We find the grapheme index of the last code unit actually contained within our span and set the + end grapheme to the first grapheme after that. This ensures that if code units associated with + only a part of a grapheme were contained in our original span, the resulting span will be + extended to contain this whole grapheme. + grapheme_end = grapheme_last + 1 + Span (Range grapheme_first grapheme_end) this.text + +## PRIVATE + Utility function taking a range pointing at grapheme clusters and converting + to a range on the underlying code units. +range_to_char_indices : Text -> Range -> Range ! 
Index_Out_Of_Bounds_Error +range_to_char_indices text range = + len = text.length + start = if range.start < 0 then range.start + len else range.start + end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end) + is_valid = (Range 0 len+1).contains + + case (Pair (is_valid start) (is_valid end)) of + Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len) + Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len) + Pair True True -> + if start>=end then (Range 0 0) else + iterator = BreakIterator.getCharacterInstance + iterator.setText text + + start_index = iterator.next start + end_index = iterator.next (end - start) + Range start_index end_index diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso index 6db8ddbfbf96..2f22d84d9343 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso @@ -79,24 +79,24 @@ type Text_Sub_Range Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text) Before delimiter -> if delimiter.is_empty then (Range 0 0) else - index = Text_Utils.index_of text delimiter - if index == -1 then (Range 0 (Text_Utils.char_length text)) else - (Range 0 index) + span = Text_Utils.span_of text delimiter + if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else + (Range 0 span.start) Before_Last delimiter -> if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else - index = Text_Utils.last_index_of text delimiter - if index == -1 then (Range 0 (Text_Utils.char_length text)) else - (Range 0 index) + span = Text_Utils.last_span_of text delimiter + if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else + (Range 0 span.start) After delimiter -> if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else - index = Text_Utils.index_of text delimiter - if index == -1 then (Range 0 0) else - (Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) + span = Text_Utils.span_of text delimiter + if span.is_nothing then (Range 0 0) else + (Range span.end (Text_Utils.char_length text)) After_Last delimiter -> if delimiter.is_empty then (Range 0 0) else - index = Text_Utils.last_index_of text delimiter - if index == -1 then (Range 0 0) else - (Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) + span = Text_Utils.last_span_of text delimiter + if span.is_nothing then (Range 0 0) else + (Range span.end (Text_Utils.char_length text)) While predicate -> indices = find_sub_range_end text _-> start-> end-> predicate (Text_Utils.substring text start end) . 
not diff --git a/engine/launcher/src/main/resources/application.conf b/engine/launcher/src/main/resources/application.conf index ef9a0daca38b..bf3de5e69207 100644 --- a/engine/launcher/src/main/resources/application.conf +++ b/engine/launcher/src/main/resources/application.conf @@ -1,7 +1,7 @@ akka { loggers = ["akka.event.slf4j.Slf4jLogger"] logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" - version = "2.6.6" + version = "2.6.18" stdout-loglevel = "ERROR" } diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 8f7b1a858fc0..cab662fb181d 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -1,11 +1,19 @@ package org.enso.base; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.CaseMap.Fold; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.StringSearch; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; import java.util.regex.Pattern; +import org.enso.base.text.CaseFoldedString; +import org.enso.base.text.GraphemeSpan; +import org.enso.base.text.Utf16Span; /** Utils for standard library operations on Text. */ public class Text_Utils { @@ -117,6 +125,23 @@ public static boolean equals(String str1, Object str2) { } } + /** + * Checks whether two strings are equal up to Unicode canonicalization and ignoring case. + * + * @param str1 the first string + * @param str2 the second string + * @param locale the locale to use for case folding + * @return the result of comparison + */ + public static boolean equals_ignore_case(String str1, Object str2, Locale locale) { + if (str2 instanceof String) { + Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale); + return compare_normalized(fold.apply(str1), fold.apply((String) str2)) == 0; + } else { + return false; + } + } + /** * Converts an array of codepoints into a string. * @@ -176,6 +201,36 @@ public static boolean contains(String string, String substring) { return searcher.first() != StringSearch.DONE; } + /** + * Checks if {@code substring} is a substring of {@code string}. + * + * @param string the containing string. + * @param substring the contained string. + * @return whether {@code substring} is a substring of {@code string}. + */ + public static boolean contains_case_insensitive(String string, String substring, Locale locale) { + // {@code StringSearch} does not handle empty strings as we would want, so we need these special + // cases. + if (substring.isEmpty()) return true; + if (string.isEmpty()) return false; + + Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale); + StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string)); + return searcher.first() != StringSearch.DONE; + } + + /** + * Transforms the provided string into a form which can be used for case insensitive comparisons. + * + * @param string the string to transform + * @param locale the locale to use - needed to distinguish a special case when handling Turkish + * 'i' characters + * @return a transformed string that can be used for case insensitive comparisons + */ + public static String case_insensitive_key(String string, Locale locale) { + return CaseFoldedString.simpleFold(string, locale); + } + /** * Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}. 
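The case-insensitive helpers above (`contains_case_insensitive`, `case_insensitive_key`) and `span_of_case_insensitive` further down all follow the same recipe: case fold both strings with the locale-appropriate algorithm and then run ICU's `StringSearch` over the folded text. A hedged sketch of that step, mirroring the `straße` / `MONUMENTENSTRASSE` example from the `location_of` documentation (the class name is illustrative):

```java
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.StringSearch;

public class FoldThenSearch {
  public static void main(String[] args) {
    String haystack = "MONUMENTENSTRASSE 42";
    String needle = "straße";

    // Fold both sides with the same algorithm, then search the folded text.
    String foldedHaystack = CaseMap.fold().apply(haystack); // "monumentenstrasse 42"
    String foldedNeedle = CaseMap.fold().apply(needle);     // "strasse" (ß folds to "ss")

    StringSearch search = new StringSearch(foldedNeedle, foldedHaystack);
    int position = search.first();        // 10
    int length = search.getMatchLength(); // 7, one longer than the 6-character needle
    System.out.println(position + ", " + length);
  }
}
```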
* @@ -200,37 +255,215 @@ public static long char_length(String str) { } /** - * Find the first index of needle in the haystack + * Find the first occurrence of needle in the haystack * * @param haystack the string to search * @param needle the substring that is searched for - * @return index of the first needle or -1 if not found. + * @return a UTF-16 code unit span of the first needle or null if not found. */ - public static long index_of(String haystack, String needle) { + public static Utf16Span span_of(String haystack, String needle) { + if (needle.isEmpty()) return new Utf16Span(0, 0); + if (haystack.isEmpty()) return null; + StringSearch search = new StringSearch(needle, haystack); int pos = search.first(); - return pos == StringSearch.DONE ? -1 : pos; + if (pos == StringSearch.DONE) return null; + return new Utf16Span(pos, pos + search.getMatchLength()); } /** - * Find the last index of needle in the haystack + * Find the last occurrence of needle in the haystack * * @param haystack the string to search * @param needle the substring that is searched for - * @return index of the last needle or -1 if not found. + * @return a UTF-16 code unit span of the last needle or null if not found. */ - public static long last_index_of(String haystack, String needle) { + public static Utf16Span last_span_of(String haystack, String needle) { + if (needle.isEmpty()) { + int afterLast = haystack.length(); + return new Utf16Span(afterLast, afterLast); + } + if (haystack.isEmpty()) return null; + StringSearch search = new StringSearch(needle, haystack); - int pos = search.first(); + int pos = search.last(); + if (pos == StringSearch.DONE) return null; + return new Utf16Span(pos, pos + search.getMatchLength()); + } + + /** + * Find spans of all occurrences of the needle within the haystack. + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack + */ + public static List span_of_all(String haystack, String needle) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `index_of_all` does not support searching for an empty term."); + if (haystack.isEmpty()) return List.of(); + + StringSearch search = new StringSearch(needle, haystack); + ArrayList occurrences = new ArrayList<>(); + long ix; + while ((ix = search.next()) != StringSearch.DONE) { + occurrences.add(new Utf16Span(ix, ix + search.getMatchLength())); + } + return occurrences; + } + + /** + * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to. + * + * @param text the text associated with the index + * @param codeunit_index the UTF-16 index + * @return an index of an extended grapheme cluster that contains the code unit from the input + */ + public static long utf16_index_to_grapheme_index(String text, long codeunit_index) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(text); + if (codeunit_index < 0 || codeunit_index > text.length()) { + throw new IndexOutOfBoundsException( + "Index " + codeunit_index + " is outside of the provided text."); + } + + int grapheme_end = breakIterator.next(); + long grapheme_index = 0; + + while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) { + grapheme_index++; + grapheme_end = breakIterator.next(); + } + return grapheme_index; + } + + /** + * Converts a series of UTF-16 code unit indices to indices of graphemes that these code units + * belong to. 
+ * + *

For performance, it assumes that the provided indices are sorted in a non-decreasing order + * (duplicate entries are permitted). Behaviour is unspecified if an unsorted list is provided. + * + *
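A hedged usage sketch of this batch translation (the element type of the list is assumed to be `Long`, as suggested by the loop in the implementation; the expected values match the `Utils_Spec` tests added later in this patch):

```java
import java.util.Arrays;
import java.util.List;
import org.enso.base.Text_Utils;

public class IndexMappingExample {
  public static void main(String[] args) {
    // "a" followed by a combining acute accent is one grapheme spanning two
    // UTF-16 code units, so units 0 and 1 both map to grapheme 0 and the end
    // index 2 maps to the end grapheme index 1.
    long[] graphemes =
        Text_Utils.utf16_indices_to_grapheme_indices("a\u0301", List.of(0L, 1L, 2L));
    System.out.println(Arrays.toString(graphemes)); // [0, 0, 1]
  }
}
```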

The behaviour is unspecified if indices provided on the input are outside of the range [0, + * text.length()]. + * + * @param text the text associated with the indices + * @param codeunit_indices the array of UTF-16 code unit indices, sorted in non-decreasing order + * @return an array of grapheme indices corresponding to the UTF-16 units from the input + */ + public static long[] utf16_indices_to_grapheme_indices(String text, List codeunit_indices) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(text); + + int grapheme_end = breakIterator.next(); + long grapheme_index = 0; + + long[] result = new long[codeunit_indices.size()]; + int result_ix = 0; + + for (long codeunit_index : codeunit_indices) { + while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) { + grapheme_index++; + grapheme_end = breakIterator.next(); + } + result[result_ix++] = grapheme_index; + } + + return result; + } + + /** + * Find the first or last occurrence of needle in the haystack. + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @param locale the locale used for case-insensitive comparisons + * @param searchForLast if set to true, will search for the last occurrence; otherwise searches + * for the first one + * @return an extended-grapheme-cluster span of the first or last needle, or null if none found. + */ + public static GraphemeSpan span_of_case_insensitive( + String haystack, String needle, Locale locale, boolean searchForLast) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `span_of_case_insensitive` does not support searching for an empty term."); + if (haystack.isEmpty()) return null; + + CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); + String foldedNeedle = CaseFoldedString.simpleFold(needle, locale); + StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString()); + int pos; + if (searchForLast) { + pos = search.last(); + } else { + pos = search.first(); + } if (pos == StringSearch.DONE) { - return -1; + return null; + } else { + return findExtendedSpan(foldedHaystack, pos, search.getMatchLength()); } + } + + /** + * Find all occurrences of needle in the haystack + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @param locale the locale used for case-insensitive comparisons + * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack + */ + public static List span_of_all_case_insensitive( + String haystack, String needle, Locale locale) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `span_of_all_case_insensitive` does not support searching for an empty term."); + if (haystack.isEmpty()) return List.of(); + + CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); + String foldedNeedle = CaseFoldedString.simpleFold(needle, locale); + + StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString()); + ArrayList result = new ArrayList<>(); - for (int next = search.next(); next != StringSearch.DONE; next = search.next()) { - pos = next; + int pos; + while ((pos = search.next()) != StringSearch.DONE) { + result.add(findExtendedSpan(foldedHaystack, pos, search.getMatchLength())); } - return pos; + return result; + } + + /** + * Finds the grapheme span corresponding to the found match indexed with code units. + * + *

It extends the found span to ensure that graphemes associated with all found code units are + * included in the resulting span. Thus, some additional code units which were not present in the + * original match may also be present due to the extension. + * + *
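A concrete instance of this extension, exercised through `span_of_case_insensitive` defined above (a sketch; the values follow from the case folding of the ffi ligature):

```java
import java.util.Locale;
import org.enso.base.Text_Utils;
import org.enso.base.text.GraphemeSpan;

public class PartialGraphemeMatch {
  public static void main(String[] args) {
    // "\uFB03" is the ffi ligature. Case folding expands it to the three code
    // units "ffi", so the needle "ia" matches code units that straddle the
    // ligature grapheme and the following "a".
    GraphemeSpan span = Text_Utils.span_of_case_insensitive("\uFB03a", "ia", Locale.ROOT, false);
    // The match is widened to whole graphemes, so it covers both of them.
    System.out.println(span.start + ".." + span.end); // 0..2
  }
}
```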

The extension to the left is trivial - we just find the grapheme associated with the first + * code unit and even if that code unit is not the first one of that grapheme, by returning it we + * correctly extend to the left. The extension to the right works by finding the index of the + * grapheme associated with the last code unit actually present in the span, then the end of the + * returned span is set to the next grapheme after it. This correctly handles the edge case where + * only a part of some grapheme was matched. + * + * @param string the folded string with which the positions are associated, containing a cache of + * position mappings + * @param position the position of the match (in code units) + * @param length the length of the match (in code units) + * @return a minimal {@code GraphemeSpan} which contains all code units from the match + */ + private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) { + int firstGrapheme = string.codeUnitToGraphemeIndex(position); + if (length == 0) { + return new GraphemeSpan(firstGrapheme, firstGrapheme); + } else { + int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1); + int endGrapheme = lastGrapheme + 1; + return new GraphemeSpan(firstGrapheme, endGrapheme); + } } /** diff --git a/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java b/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java new file mode 100644 index 000000000000..75a9aa101a44 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java @@ -0,0 +1,135 @@ +package org.enso.base.text; + +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.CaseMap; +import com.ibm.icu.text.CaseMap.Fold; +import java.util.Locale; + +/** + * Represents a string transformed using Unicode Case Folding which can be used for case insensitive + * comparisons. + * + *
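A brief usage sketch of this class (the printed values follow from ICU's full case folding, under which `ß` expands to "ss"):

```java
import java.util.Locale;
import org.enso.base.text.CaseFoldedString;

public class CaseFoldedStringExample {
  public static void main(String[] args) {
    CaseFoldedString folded = CaseFoldedString.fold("Straße", Locale.ROOT);
    System.out.println(folded.getFoldedString()); // "strasse"

    // Code units 4 and 5 of the folded text both come from the single
    // grapheme "ß" at index 4 of the original string.
    System.out.println(folded.codeUnitToGraphemeIndex(4)); // 4
    System.out.println(folded.codeUnitToGraphemeIndex(5)); // 4
    System.out.println(folded.codeUnitToGraphemeIndex(7)); // 6, the end position
  }
}
```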

It contains facilities for converting indices in the transformed string to corresponding + * indices back in the original string. + */ +public class CaseFoldedString { + private final String foldedString; + + /** + * A mapping from code units in the transformed string to their corresponding graphemes in the + * original string. + * + *

The mapping must be valid from indices from 0 to @{code foldedString.length()+1} + * (inclusive). + */ + private final int[] graphemeIndexMapping; + + /** + * Constructs a new instance of the folded string. + * + * @param foldeString the string after applying the case folding transformation + * @param graphemeIndexMapping a mapping created during the transformation which maps code units + * in the transformed string to their corresponding graphemes in the original string + */ + private CaseFoldedString(String foldeString, int[] graphemeIndexMapping) { + this.foldedString = foldeString; + this.graphemeIndexMapping = graphemeIndexMapping; + } + + /** + * Maps a code unit in the folded string to the corresponding grapheme in the original string. + * + * @param codeunitIndex the index of the code unit in the folded string, valid indices range from + * 0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the + * position of the end code unit which is located right after the end of the string - which + * should always map to the analogous end grapheme. + * @return the index of the grapheme from the original string that after applying the + * transformation contains the requested code unit + */ + public int codeUnitToGraphemeIndex(int codeunitIndex) { + if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) { + throw new IndexOutOfBoundsException(codeunitIndex); + } + return graphemeIndexMapping[codeunitIndex]; + } + + /** Returns the transformed string. */ + public String getFoldedString() { + return foldedString; + } + + /** + * Folds a string remembering the mapping from code units to its original grapheme cluster + * indices. + * + * @param charSequence a sequence of UTF-16 characters to transform + * @param locale the locale to use as a reference for case folding; it is needed because Turkish + * and Azerbaijani locales handle casing of the letter `i` in a different way than other + * locales + * @return a {@code CaseFoldedString} instance which contains the transformed string and allows to + * map its code units to original grapheme clusters + */ + public static CaseFoldedString fold(CharSequence charSequence, Locale locale) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(charSequence); + StringBuilder stringBuilder = new StringBuilder(charSequence.length()); + Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale); + IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1); + + // We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of + // each grapheme cluster is independent of surrounding ones. Regular casing is + // context-sensitive. + int current = breakIterator.current(); + int next; + int grapheme_index = 0; + while ((next = breakIterator.next()) != BreakIterator.DONE) { + CharSequence grapheme = new StringSlice(charSequence, current, next); + String foldedGrapheme = foldAlgorithm.apply(grapheme); + stringBuilder.append(foldedGrapheme); + for (int i = 0; i < foldedGrapheme.length(); ++i) { + index_mapping.add(grapheme_index); + } + + grapheme_index++; + current = next; + } + + // The mapping should also be able to handle a {@code str.length()} query, so we add one more + // element to the mapping pointing to a non-existent grapheme after the end of the text. 
+ index_mapping.add(grapheme_index); + + return new CaseFoldedString( + stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder()); + } + + /** + * A helper function which folds the string without remembering the index mapping. + * + *

It should be used when the index mapping is not needed, as its implementation is much more + * efficient. + * + * @param string a sequence of UTF-16 characters to transform + * @param locale the locale to use as a reference for case folding; it is needed because Turkish + * and Azerbaijani locales handle casing of the letter `i` in a different way than the others + * @return the folded string + */ + public static String simpleFold(CharSequence string, Locale locale) { + return caseFoldAlgorithmForLocale(locale).apply(string); + } + + private static final Locale AZ_LOCALE = new Locale("az"); + private static final Locale TR_LOCALE = new Locale("tr"); + + /** + * Returns a case folding algorithm appropriate for the given locale. + * + *
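The practical difference is visible directly in ICU (a sketch; the outputs follow from the Unicode case-folding data, and the behaviour matches the Turkish-locale cases in the `Text_Spec` changes later in this patch):

```java
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.CaseMap.Fold;

public class TurkicFolding {
  public static void main(String[] args) {
    Fold plain = CaseMap.fold();
    Fold turkic = CaseMap.fold().turkic();

    // Default folding maps capital I to "i"; Turkic folding maps it to the
    // dotless "ı" instead.
    System.out.println(plain.apply("I"));  // "i"
    System.out.println(turkic.apply("I")); // "ı"

    // The dotted capital İ folds to a plain "i" only under the Turkic rules.
    System.out.println(turkic.apply("İ")); // "i"
  }
}
```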

The algorithm is locale-dependent because Turkish and Azerbaijani locales handle casing of + * the letter `i` in a different way than other locales. + */ + public static Fold caseFoldAlgorithmForLocale(Locale locale) { + if (locale.equals(AZ_LOCALE) || locale.equals(TR_LOCALE)) { + return CaseMap.fold().turkic(); + } + return CaseMap.fold(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java b/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java new file mode 100644 index 000000000000..8ba21e802415 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java @@ -0,0 +1,28 @@ +package org.enso.base.text; + +/** + * Represents a span of characters (understood as extended grapheme clusters) within a Text. + * + *

The start index indicates the first grapheme of the span and the end index indicates the first + * grapheme after the end of the span. + * + *

Represents an empty span if start and end indices are equal. Such an empty span refers to the + * space just before the grapheme corresponding to index start. + */ +public class GraphemeSpan { + + public final long start, end; + + /** + * Constructs a span of characters (understood as extended grapheme clusters). + * + * @param start index of the first extended grapheme cluster contained within the span (or + * location of the span if it is empty) + * @param end index of the first extended grapheme cluster after start that is not contained + * within the span + */ + public GraphemeSpan(long start, long end) { + this.start = start; + this.end = end; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java b/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java new file mode 100644 index 000000000000..23b56fdaac0d --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java @@ -0,0 +1,65 @@ +package org.enso.base.text; + +/** A helper to efficiently build an array of unboxed integers of arbitrary length. */ +public class IntArrayBuilder { + private int[] storage; + private int length; + + /** + * Constructs an empty builder with a given initial capacity. + * + * @param initialCapacity the initial capacity of the builder, can be used to avoid expanding the + * storage if the amount of elements can be estimated in advance. + */ + public IntArrayBuilder(int initialCapacity) { + length = 0; + storage = new int[initialCapacity]; + } + + /** Adds a new element to the array, expanding it if necessary. */ + public void add(int x) { + if (length >= storage.length) { + grow(); + } + + storage[length++] = x; + } + + /** + * Expands the storage to fit more elements. + * + *
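A short usage sketch of the builder (the capacities in the comment follow the growth rule described in the next paragraph; the class name is illustrative):

```java
import org.enso.base.text.IntArrayBuilder;

public class IntArrayBuilderExample {
  public static void main(String[] args) {
    IntArrayBuilder builder = new IntArrayBuilder(4);
    for (int i = 0; i < 10; i++) {
      builder.add(i * i); // internal capacity re-grows 4 -> 6 -> 9 -> 13 as needed
    }

    int count = builder.getLength(); // 10
    int[] storage = builder.unsafeGetStorageAndInvalidateTheBuilder();
    // `storage.length` may exceed `count`; only the first `count` entries are valid.
    for (int i = 0; i < count; i++) {
      System.out.print(storage[i] + " "); // 0 1 4 9 16 25 36 49 64 81
    }
  }
}
```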

The storage grows by 50% and is always increased by at least one. The 50% growth is chosen + * so that the amortized cost of adding a new element to the array stays constant. + */ + private void grow() { + int newCapacity = storage.length + (storage.length / 2); + if (newCapacity <= storage.length) { + newCapacity = storage.length + 1; + } + + int[] newStorage = new int[newCapacity]; + System.arraycopy(this.storage, 0, newStorage, 0, length); + this.storage = newStorage; + } + + /** Returns the amount of elements already added to the storage. */ + public int getLength() { + return length; + } + + /** + * Returns the underlying storage of the builder. + * + *

This method avoids copying for performance, so it should be used with care. The storage can + * actually have more elements than were added, so the caller should only query the + * first {@code getLength()} elements; querying any elements beyond that yields unspecified values. + * + *

After calling this method, the builder is invalidated and cannot be used anymore. Any usage + * of the builder afterwards will result in a {@code NullPointerException}. + */ + public int[] unsafeGetStorageAndInvalidateTheBuilder() { + int[] tmp = storage; + this.storage = null; + return tmp; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java b/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java new file mode 100644 index 000000000000..5374e3ff1129 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java @@ -0,0 +1,34 @@ +package org.enso.base.text; + +/** A char sequence which allows to access a slice of another char sequence without copying. */ +class StringSlice implements CharSequence { + private final CharSequence text; + private final int subStart, subEnd; + + /** Constructs a slice of the given text. */ + public StringSlice(CharSequence text, int start, int end) { + this.text = text; + this.subStart = start; + this.subEnd = end; + } + + @Override + public int length() { + return subEnd - subStart; + } + + @Override + public char charAt(int index) { + return text.charAt(subStart + index); + } + + @Override + public CharSequence subSequence(int start, int end) { + return new StringSlice(text, subStart + start, subStart + end); + } + + @Override + public String toString() { + return text.subSequence(subStart, subEnd).toString(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java b/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java new file mode 100644 index 000000000000..a4a3b31419fa --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java @@ -0,0 +1,18 @@ +package org.enso.base.text; + +/** + * Represents a span of UTF-16 code units within a String. + * + *

The start index indicates the first code unit of the span and the end index indicates the + * first code unit after the end of the span. + */ +public class Utf16Span { + + public final long start, end; + + /** Constructs a span of UTF-16 code units. */ + public Utf16Span(long start, long end) { + this.start = start; + this.end = end; + } +} diff --git a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso b/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso index 9cd711138329..1116d350254c 100644 --- a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso +++ b/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso @@ -6,7 +6,7 @@ import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Option as Global_Option -import Standard.Base.Data.Text.Span +from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span polyglot java import java.util.regex.Pattern as Java_Pattern @@ -182,6 +182,22 @@ spec = match.at 1 . group 0 . should_equal "ef" match.at 2 . group 0 . should_equal "gh" + Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <| + pattern = engine.compile "" [] + match_1 = pattern.match "" mode=Mode.All + match_1.length . should_equal 1 + match_1.at 0 . start 0 . should_equal 0 + match_1.at 0 . end 0 . should_equal 0 + + match_2 = pattern.match "ABC" mode=Mode.All + match_2.length . should_equal 4 + match_2.at 0 . start 0 . should_equal 0 + match_2.at 0 . end 0 . should_equal 0 + match_2.at 1 . start 0 . should_equal 1 + match_2.at 1 . end 0 . should_equal 1 + match_2.at 3 . start 0 . should_equal 3 + match_2.at 3 . end 0 . should_equal 3 + Test.group "The default regex engine's Pattern.find" <| engine = Default_Engine.new @@ -261,11 +277,23 @@ spec = match.at 1 . should_equal "ef" match.at 2 . should_equal "gh" + match_2 = pattern.find input mode=(Mode.Bounded 2 8 mode=10) + match_2.length . should_equal 3 + match_2.at 0 . should_equal "cd" + match_2.at 1 . should_equal "ef" + match_2.at 2 . should_equal "gh" + + match_3 = pattern.find input mode=(Mode.Bounded 2 8 mode=2) + match_3.length . should_equal 2 + match_3.at 0 . should_equal "cd" + match_3.at 1 . should_equal "ef" + Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"] + engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"] @@ -501,10 +529,10 @@ spec = match . should_be_a Default_Engine.Match Test.specify "should get the span of a group by index" <| - match.span 1 . should_equal (Span.new 0 6 input) + match.span 1 . should_equal (Utf_16_Span (Range 0 6) input) Test.specify "should get the span of a group by name" <| - match.span "letters" . should_equal (Span.new 6 18 input) + match.span "letters" . 
should_equal (Utf_16_Span (Range 6 18) input)
 
     Test.specify "should return Nothing if the group didn't match" <|
         match.span 3 . should_equal Nothing
 
diff --git a/test/Tests/src/Data/Text/Regex_Spec.enso b/test/Tests/src/Data/Text/Regex_Spec.enso
index 55aa8a6a93fb..4d7b77cf2c45 100644
--- a/test/Tests/src/Data/Text/Regex_Spec.enso
+++ b/test/Tests/src/Data/Text/Regex_Spec.enso
@@ -26,3 +26,4 @@ spec =
         pattern = "http://example.com"
         Regex.escape pattern . should_equal "\Qhttp://example.com\E"
 
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text/Span_Spec.enso b/test/Tests/src/Data/Text/Span_Spec.enso
index e8ae45a9c48e..2de0ac5096e8 100644
--- a/test/Tests/src/Data/Text/Span_Spec.enso
+++ b/test/Tests/src/Data/Text/Span_Spec.enso
@@ -2,20 +2,36 @@ from Standard.Base import all
 
 import Standard.Test
 
-import Standard.Base.Data.Text.Span
+from Standard.Base.Data.Text.Span as Span_Module import Span, Utf_16_Span
 
 spec = Test.group "Text.Span" <|
     Test.specify "should be able to be created over a text" <|
         text = "Hello!"
-        span = Span.new 0 3 text
+        span = Span (Range 0 3) text
         span.start . should_equal 0
         span.end . should_equal 3
         span.text . should_equal text
 
-    Test.specify "should be able to be created without a text" <|
-        span = Span.new 5 8
-        span.start . should_equal 5
-        span.end . should_equal 8
-        span.text . should_equal Nothing
+    Test.specify "should be able to be converted to code units" <|
+        text = 'ae\u{301}fz'
+        (Span (Range 1 3) text).to_utf_16_span . should_equal (Utf_16_Span (Range 1 4) text)
 
+    Test.specify "should expand to the associated grapheme clusters" <|
+        text = 'a\u{301}e\u{302}o\u{303}'
+        span = Utf_16_Span (Range 1 5) text
+        extended = span.to_grapheme_span
+        extended . should_equal (Span (Range 0 3) text)
+        extended.to_utf_16_span . should_equal (Utf_16_Span (Range 0 6) text)
+
+        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+        Utf_16_Span (Range 0 1) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+        Utf_16_Span (Range 0 0) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
+        Utf_16_Span (Range 1 1) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
+        Utf_16_Span (Range 2 2) text . to_grapheme_span . should_equal (Span (Range 1 1) text)
+
+        Utf_16_Span (Range 0 4) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
+        Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
+        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text/Utils_Spec.enso b/test/Tests/src/Data/Text/Utils_Spec.enso
new file mode 100644
index 000000000000..b15d7ced5177
--- /dev/null
+++ b/test/Tests/src/Data/Text/Utils_Spec.enso
@@ -0,0 +1,61 @@
+from Standard.Base import all
+
+polyglot java import org.enso.base.Text_Utils
+polyglot java import org.enso.base.text.CaseFoldedString
+
+import Standard.Test
+
+polyglot java import com.ibm.icu.text.BreakIterator
+
+spec =
+    Test.group "Text_Utils" <|
+        kshi = '\u0915\u094D\u0937\u093F'
+        facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
+        text = "a"+kshi+facepalm+'e\u{301}Z'
+        codepoints_to_graphemes = _.flatten <| text.characters.map_with_index ix-> grapheme->
+            codepoints_count = grapheme.utf_16.length
+            Vector.new codepoints_count _->ix
+
+        Test.specify "should correctly translate a codepoint index to a grapheme index" <|
+            codepoints_to_graphemes . 
each_with_index codepoint_ix-> grapheme_ix->
+                found_grapheme_ix = Text_Utils.utf16_index_to_grapheme_index text codepoint_ix
+                found_grapheme_ix.should_equal grapheme_ix
+
+            Text_Utils.utf16_index_to_grapheme_index text text.utf_16.length . should_equal text.length
+            Text_Utils.utf16_index_to_grapheme_index "" 0 . should_equal 0
+
+            Text_Utils.utf16_index_to_grapheme_index 'ą' 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'ą' 1 . should_equal 1
+
+            Text_Utils.utf16_index_to_grapheme_index "aB" 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index "aB" 1 . should_equal 1
+            Text_Utils.utf16_index_to_grapheme_index "aB" 2 . should_equal 2
+
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 1 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 2 . should_equal 1
+
+        Test.specify "should correctly translate a series of codepoint indices to grapheme indices in a batch" <|
+            translate_indices text ixes =
+                Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
+            codepoint_indices = Vector.new text.utf_16.length ix->ix
+            translate_indices text codepoint_indices . should_equal codepoints_to_graphemes
+
+            translate_indices "" [0] . should_equal [0]
+            translate_indices 'ą' [0, 1] . should_equal [0, 1]
+            translate_indices "aB" [0, 1, 2] . should_equal [0, 1, 2]
+            translate_indices 'a\u{301}' [0, 1, 2] . should_equal [0, 0, 1]
+
+        Test.specify "should correctly case-fold a string and translate code units to graphemes" <|
+            text = 'a\u{301}AZßﬃą'
+            folded = CaseFoldedString.fold text Locale.default.java_locale
+            folded.getFoldedString . should_equal 'a\u{301}azssffią'
+
+            codeunits = Vector.new folded.getFoldedString.utf_16.length+1 ix->ix
+            grapheme_ixes = codeunits.map ix->
+                folded.codeUnitToGraphemeIndex ix
+            grapheme_ixes . should_equal [0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6]
+
+            Test.expect_panic_with (folded.codeUnitToGraphemeIndex -1) Polyglot_Error
+            Test.expect_panic_with (folded.codeUnitToGraphemeIndex folded.getFoldedString.utf_16.length+1) Polyglot_Error
+
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso
index 86a1fb209ba5..63ddb2514a33 100644
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@@ -4,7 +4,10 @@ from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
 import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
 import Standard.Base.Data.Locale
 import Standard.Base.Data.Text.Split_Kind
+from Standard.Base.Data.Text.Span as Span_Module import Span
 from Standard.Base.Data.Text.Text_Sub_Range import all
+import Standard.Base.Data.Text.Regex.Mode
+import Standard.Base.Data.Text.Matching_Mode
 import Standard.Test
 
 type Auto a
@@ -87,9 +90,8 @@ spec =
         'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false
 
         "I" . equals_ignore_case "i" . should_be_true
-        "I" . equals_ignore_case "ı" . should_be_true
-        "İ" . equals_ignore_case "i" . should_be_false
         "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
+        "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
         "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
 
         "Kongressstraße"=="Kongressstrasse" . should_be_false
@@ -199,15 +201,20 @@ spec =
         'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!'
         'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!' 
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'ê') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'ö') . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'ê') . should_equal 'llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'ö') . should_equal 'rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='ê') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld' @@ -232,6 +239,30 @@ spec = '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺' '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉' + Test.specify "take should correctly handle edge cases" <| + "".take First.new . should_equal "" + "".take Last.new . should_equal "" + + "".take (After "a") . should_equal "" + "".take (After_Last "a") . should_equal "" + "".take (Before "a") . should_equal "" + "".take (Before_Last "a") . should_equal "" + + "".take (After "") . should_equal "" + "".take (After_Last "") . should_equal "" + "".take (Before "") . should_equal "" + "".take (Before_Last "") . should_equal "" + + "".take (While _->True) . should_equal "" + + "".take (Range 0 0) . should_equal "" + 'ABC\u{301}'.take (Range 0 0) . should_equal "" + + 'ABC\u{301}'.take (After "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.take (After_Last "") . should_equal "" + 'ABC\u{301}'.take (Before "") . should_equal "" + 'ABC\u{301}'.take (Before_Last "") . should_equal 'ABC\u{301}' + Test.specify "drop should work as in the examples" <| "Hello World!".drop First.new . should_equal "ello World!" "Hello World!".drop (First 5) . should_equal " World!" @@ -269,15 +300,20 @@ spec = 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} ' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'ê') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'ö') . should_equal 'o\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . 
should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'ê') . should_equal 'He\u{302}' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'ö') . should_equal 'He\u{302}llo\u{308} Wo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='ê') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!' @@ -301,6 +337,30 @@ spec = '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎' '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺' + Test.specify "drop should correctly handle edge cases" <| + "".drop First.new . should_equal "" + "".drop Last.new . should_equal "" + + "".drop (After "a") . should_equal "" + "".drop (After_Last "a") . should_equal "" + "".drop (Before "a") . should_equal "" + "".drop (Before_Last "a") . should_equal "" + + "".drop (After "") . should_equal "" + "".drop (After_Last "") . should_equal "" + "".drop (Before "") . should_equal "" + "".drop (Before_Last "") . should_equal "" + + "".drop (While _->True) . should_equal "" + + "".drop (Range 0 0) . should_equal "" + 'ABC\u{301}'.drop (Range 0 0) . should_equal 'ABC\u{301}' + + 'ABC\u{301}'.drop (After "") . should_equal '' + 'ABC\u{301}'.drop (After_Last "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.drop (Before "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.drop (Before_Last "") . should_equal '' + Test.specify "should correctly convert character case" <| "FooBar Baz".to_case Case.Lower . should_equal "foobar baz" "FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ" @@ -465,10 +525,7 @@ spec = ## This shows what regex is doing by default and we cannot easily fix that. 's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true - ## This would normally be false, but we perform input normalization - to get results that are consistent regardless of if the input was - normalized or not. - 'ś' . contains 's' (Regex_Matcher.new) . should_be_true + 'ś' . contains 's' (Regex_Matcher.new) . should_be_false 's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true 'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true @@ -767,6 +824,157 @@ spec = '✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧' + Test.specify "location_of should work as shown in examples" <| + example_1 = + "Hello World!".location_of "J" == Nothing + "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!" + "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 4 5) "Hello World!" + + example_2 = + term = "straße" + text = "MONUMENTENSTRASSE 42" + match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new) + term.length . should_equal 6 + match.length . 
should_equal 7 + + example_3 = + ligatures = "ffiffl" + ligatures.length . should_equal 2 + term_1 = "IFF" + match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new) + term_1.length . should_equal 3 + match_1.length . should_equal 2 + term_2 = "ffiffl" + match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new) + term_2.length . should_equal 6 + match_2.length . should_equal 2 + match_1 . should_equal match_2 + + example_4 = + "Hello World!".location_of_all "J" . should_equal [] + "Hello World!".location_of_all "o" . map .start . should_equal [4, 7] + + example_5 = + term = "strasse" + text = "MONUMENTENSTRASSE ist eine große Straße." + match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new) + term.length . should_equal 7 + match . map .length . should_equal [7, 6] + + example_6 = + ligatures = "ffifflFFIFF" + ligatures.length . should_equal 7 + match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new) + match_1 . map .length . should_equal [2, 3] + match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new) + match_2 . map .length . should_equal [2, 5] + + # Put them in blocks to avoid name clashes. + example_1 + example_2 + example_3 + example_4 + example_5 + example_6 + + Test.specify "should allow to find location_of occurrences within a text" <| + "Hello World!".location_of_all "J" . should_equal [] + "Hello World!".location_of_all "o" . map .start . should_equal [4, 7] + + accents = 'a\u{301}e\u{301}o\u{301}' + accents.location_of accent_1 . should_equal (Span (Range 1 2) accents) + + "".location_of "foo" . should_equal Nothing + "".location_of "foo" mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" . should_equal [] + "".location_of "" . should_equal (Span (Range 0 0) "") + "".location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + "".location_of_all "" . should_equal [Span (Range 0 0) ""] + abc = 'A\u{301}ßC' + abc.location_of "" . should_equal (Span (Range 0 0) abc) + abc.location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + abc.location_of_all "" . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + + Test.specify "should allow case insensitive matching in location_of" <| + hello = "Hello WORLD!" + case_insensitive = Text_Matcher Case_Insensitive.new + hello.location_of "world" . should_equal Nothing + hello.location_of "world" matcher=case_insensitive . should_equal (Span (Range 6 11) hello) + + hello.location_of "o" mode=Mode.First matcher=case_insensitive . should_equal (Span (Range 4 5) hello) + hello.location_of "o" mode=Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 7 8) hello) + + accents = 'A\u{301}E\u{301}O\u{301}' + accents.location_of accent_1 matcher=case_insensitive . should_equal (Span (Range 1 2) accents) + + "Strasse".location_of "ß" matcher=case_insensitive . should_equal (Span (Range 4 6) "Strasse") + "Monumentenstraße 42".location_of "STRASSE" matcher=case_insensitive . should_equal (Span (Range 10 16) "Monumentenstraße 42") + + '\u0390'.location_of '\u03B9\u0308\u0301' matcher=case_insensitive . should_equal (Span (Range 0 1) '\u0390') + 'ԵՒ'.location_of 'և' . should_equal Nothing + 'ԵՒ'.location_of 'և' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ԵՒ') + 'և'.location_of 'ԵՒ' matcher=case_insensitive . 
should_equal (Span (Range 0 1) 'և') + + ligatures = 'ffafffiflffifflſtstZ' + ligatures.location_of 'FFI' matcher=case_insensitive . should_equal (Span (Range 3 5) ligatures) + ligatures.location_of 'FF' matcher=case_insensitive . should_equal (Span (Range 0 2) ligatures) + ligatures.location_of 'ff' matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 7 8) ligatures) + ligatures.location_of_all 'ff' . should_equal [Span (Range 0 2) ligatures] + ligatures.location_of_all 'FF' matcher=case_insensitive . should_equal [Span (Range 0 2) ligatures, Span (Range 3 4) ligatures, Span (Range 6 7) ligatures, Span (Range 7 8) ligatures] + ligatures.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 3 5) ligatures, Span (Range 6 7) ligatures] + 'fffi'.location_of_all 'ff' matcher=case_insensitive . should_equal [Span (Range 0 2) 'fffi'] + 'fffi'.location_of_all 'ffi' . should_equal [] + 'fffi'.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 1 4) 'fffi'] + 'FFFI'.location_of 'ffi' matcher=case_insensitive . should_equal (Span (Range 1 4) 'FFFI') + + 'ffiffl'.location_of 'IF' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ffiffl') + 'ffiffl'.location_of 'F' Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 1 2) 'ffiffl') + 'ffiffl'.location_of_all 'F' matcher=case_insensitive . should_equal [Span (Range 0 1) 'ffiffl', Span (Range 0 1) 'ffiffl', Span (Range 1 2) 'ffiffl', Span (Range 1 2) 'ffiffl'] + 'aaffibb'.location_of_all 'af' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb'] + 'aaffibb'.location_of_all 'affi' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb'] + 'aaffibb'.location_of_all 'ib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb'] + 'aaffibb'.location_of_all 'ffib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb'] + + "".location_of "foo" matcher=case_insensitive . should_equal Nothing + "".location_of "foo" matcher=case_insensitive mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" matcher=case_insensitive . should_equal [] + "".location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) "") + "".location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + "".location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) ""] + abc = 'A\u{301}ßC' + abc.location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) abc) + abc.location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + abc.location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + + Test.specify "should allow regexes in location_of" <| + hello = "Hello World!" + regex = Regex_Matcher.new + regex_insensitive = Regex_Matcher.new case_sensitive=Case_Insensitive.new + hello.location_of ".o" Matching_Mode.First matcher=regex . should_equal (Span (Range 3 5) hello) + hello.location_of ".o" Matching_Mode.Last matcher=regex . should_equal (Span (Range 6 8) hello) + hello.location_of_all ".o" matcher=regex . map .start . should_equal [3, 6] + + "foobar".location_of "BAR" Mode.First matcher=regex_insensitive . should_equal (Span (Range 3 6) "foobar") + + ## Regex matching does not do case folding + "Strasse".location_of "ß" Mode.First matcher=regex_insensitive . 
should_equal Nothing + + ## But it should handle the Unicode normalization + accents = 'a\u{301}e\u{301}o\u{301}' + accents.location_of accent_1 Mode.First matcher=regex . should_equal (Span (Range 1 2) accents) + Test.specify "should correctly handle regex edge cases in location_of" pending="Figure out how to make Regex correctly handle empty patterns." <| + regex = Regex_Matcher.new + "".location_of "foo" matcher=regex . should_equal Nothing + "".location_of "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" matcher=regex . should_equal [] + "".location_of "" matcher=regex . should_equal (Span (Range 0 0) "") + "".location_of_all "" matcher=regex . should_equal [Span (Range 0 0) ""] + "".location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + abc = 'A\u{301}ßC' + abc.location_of "" matcher=regex . should_equal (Span (Range 0 0) abc) + abc.location_of_all "" matcher=regex . should_equal [Span (Range 0 0) abc, Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + abc.location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + Test.group "Regex matching" <| Test.specify "should be possible on text" <| match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First diff --git a/test/Tests/src/Examples_Spec.enso b/test/Tests/src/Examples_Spec.enso index 43820cebf97b..34acbc2125bd 100644 --- a/test/Tests/src/Examples_Spec.enso +++ b/test/Tests/src/Examples_Spec.enso @@ -128,3 +128,4 @@ spec = Test.group "Examples" <| match.groups.length . should_equal 5 match.named_groups.size . should_equal 2 +main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index b8558f158eac..0060b1872ce4 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -34,6 +34,7 @@ import project.Data.Text_Spec import project.Data.Time.Spec as Time_Spec import project.Data.Vector_Spec import project.Data.Text.Regex_Spec +import project.Data.Text.Utils_Spec import project.Data.Text.Default_Regex_Engine_Spec import project.Data.Text.Matching_Spec import project.Data.Text.Span_Spec @@ -87,6 +88,7 @@ main = Test.Suite.run_main <| Runtime_Spec.spec Span_Spec.spec Stack_Traces_Spec.spec + Utils_Spec.spec Text_Spec.spec Time_Spec.spec Uri_Spec.spec
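
For readers going through the new tests: `Text.location_of` reports positions in terms of grapheme clusters, while the underlying Java string search works in UTF-16 code units, so every match has to be translated between the two indexings. That is what `Text_Utils.utf16_index_to_grapheme_index` and the batched `utf16_indices_to_grapheme_indices`, exercised by `Utils_Spec`, are for. The standalone Java sketch below shows the general technique using ICU's `BreakIterator` (the same segmenter `Utils_Spec` imports). It is an illustration only, not the code added in `std-bits`; the class name `GraphemeIndexSketch` is hypothetical.

import com.ibm.icu.text.BreakIterator;

// Illustrative sketch: map a UTF-16 code unit offset to the index of the grapheme
// cluster containing it. An offset equal to the string's code unit length maps to
// the total number of grapheme clusters, mirroring the behaviour checked in Utils_Spec.
public final class GraphemeIndexSketch {
  static int utf16IndexToGraphemeIndex(String text, int codeUnitIndex) {
    BreakIterator it = BreakIterator.getCharacterInstance();
    it.setText(text);
    it.first();                  // position the iterator at the first boundary (offset 0)
    int graphemeIndex = 0;
    int boundary = it.next();    // first boundary after the start
    // Count the grapheme cluster boundaries that lie at or before the requested offset.
    while (boundary != BreakIterator.DONE && boundary <= codeUnitIndex) {
      graphemeIndex++;
      boundary = it.next();
    }
    return graphemeIndex;
  }

  public static void main(String[] args) {
    String s = "a\u0301Z";       // 'a' + combining acute + 'Z': 3 code units, 2 clusters
    System.out.println(utf16IndexToGraphemeIndex(s, 0)); // 0
    System.out.println(utf16IndexToGraphemeIndex(s, 1)); // 0, still inside the first cluster
    System.out.println(utf16IndexToGraphemeIndex(s, 2)); // 1
    System.out.println(utf16IndexToGraphemeIndex(s, 3)); // 2, end of the string
  }
}

Note that this uses com.ibm.icu.text.BreakIterator (ICU4J), not java.text.BreakIterator; walking the boundaries one offset at a time is fine for a single lookup, which is why the tests also cover a batched variant for translating many indices in one pass.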
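
The case-insensitive examples above, where locating "straße" inside "MONUMENTENSTRASSE 42" yields a 7-character span for a 6-character term, rely on Unicode case folding rather than plain lowercasing. A minimal sketch of that idea follows, using ICU4J's `UCharacter.foldCase`; the class `CaseFoldSketch` is hypothetical and is not the `CaseFoldedString` added by this PR, which additionally records a code-unit-to-grapheme mapping (see `codeUnitToGraphemeIndex` in `Utils_Spec`) so that a match found in the folded text can be projected back onto the original.

import com.ibm.icu.lang.UCharacter;

// Hypothetical illustration: searching on case-folded text shows why a 6-character
// term can legitimately match a 7-character span of the original haystack.
public final class CaseFoldSketch {
  public static void main(String[] args) {
    String haystack = "MONUMENTENSTRASSE 42";
    String needle = "stra\u00DFe";                               // "straße", 6 characters

    // Full case folding maps the sharp s to "ss", making the two sides comparable.
    String foldedHaystack = UCharacter.foldCase(haystack, true); // "monumentenstrasse 42"
    String foldedNeedle = UCharacter.foldCase(needle, true);     // "strasse", 7 characters

    int start = foldedHaystack.indexOf(foldedNeedle);
    System.out.println(start);                                   // 10
    System.out.println(foldedNeedle.length());                   // 7
    // Mapping the folded range [10, 17) back onto the original haystack covers
    // "STRASSE": a 7-character span, even though the search term has only 6.
  }
}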