From 247b2843163c0e9838839250472f0807f984d7cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Sat, 12 Mar 2022 20:42:00 +0100 Subject: [PATCH] Data analysts should be able to use `Text.location_of` to find indexes within string using various matchers (#3324) Implements https://www.pivotaltracker.com/n/projects/2539304/stories/181266029 --- CHANGELOG.md | 4 +- .../0.0.0-dev/src/Data/Text/Extensions.enso | 235 ++++++++++++++-- .../src/Data/Text/Matching_Mode.enso | 5 + .../src/Data/Text/Regex/Engine/Default.enso | 48 ++-- .../0.0.0-dev/src/Data/Text/Regex/Mode.enso | 4 +- .../Base/0.0.0-dev/src/Data/Text/Span.enso | 137 ++++++++-- .../src/Data/Text/Text_Sub_Range.enso | 24 +- .../src/main/resources/application.conf | 2 +- .../main/java/org/enso/base/Text_Utils.java | 257 +++++++++++++++++- .../org/enso/base/text/CaseFoldedString.java | 135 +++++++++ .../java/org/enso/base/text/GraphemeSpan.java | 28 ++ .../org/enso/base/text/IntArrayBuilder.java | 65 +++++ .../java/org/enso/base/text/StringSlice.java | 34 +++ .../java/org/enso/base/text/Utf16Span.java | 18 ++ .../Data/Text/Default_Regex_Engine_Spec.enso | 34 ++- test/Tests/src/Data/Text/Regex_Spec.enso | 1 + test/Tests/src/Data/Text/Span_Spec.enso | 30 +- test/Tests/src/Data/Text/Utils_Spec.enso | 61 +++++ test/Tests/src/Data/Text_Spec.enso | 220 ++++++++++++++- test/Tests/src/Examples_Spec.enso | 1 + test/Tests/src/Main.enso | 2 + 21 files changed, 1236 insertions(+), 109 deletions(-) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso create mode 100644 std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/StringSlice.java create mode 100644 std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java create mode 100644 test/Tests/src/Data/Text/Utils_Spec.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cb8119fe28f..12ea6b1a9c5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ - [Implemented `Bool.compare_to` method][3317] - [Implemented `Map.first`, `Map.last` functions. 
Expanded `Table.group_by` to also compute mode, percentile, minimum, maximum.][3318] +- [Implemented `Text.location_of` and `Text.location_of_all` methods.][3324] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -100,7 +101,8 @@ [3236]: https://github.com/enso-org/enso/pull/3236 [3311]: https://github.com/enso-org/enso/pull/3311 [3317]: https://github.com/enso-org/enso/pull/3317 -[3317]: https://github.com/enso-org/enso/pull/3318 +[3318]: https://github.com/enso-org/enso/pull/3318 +[3324]: https://github.com/enso-org/enso/pull/3324 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 930e5caf1a2a..a7e53d5c6199 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -5,9 +5,11 @@ from Standard.Builtins import Text, Prim_Text_Helpers import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex.Mode +import Standard.Base.Data.Text.Matching_Mode import Standard.Base.Data.Text.Case import Standard.Base.Data.Text.Location import Standard.Base.Data.Text.Line_Ending_Style +from Standard.Base.Data.Text.Span as Span_Module import Span import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Text.Text_Sub_Range import Standard.Base.Data.Locale @@ -15,6 +17,7 @@ import Standard.Base.Meta from Standard.Builtins export Text +export Standard.Base.Data.Text.Matching_Mode export Standard.Base.Data.Text.Case export Standard.Base.Data.Text.Location export Standard.Base.Data.Text.Split_Kind @@ -546,7 +549,7 @@ Text.== that = if Meta.is_same_object this Text then Meta.is_same_object that Te (('É' . equals_ignore_case 'é') && ('é' . equals_ignore_case 'e\u0301')) == True Text.equals_ignore_case : Text -> Locale -> Boolean Text.equals_ignore_case that locale=Locale.default = - (this.to_case_insensitive_key locale) == (that.to_case_insensitive_key locale) + Text_Utils.equals_ignore_case this that locale.java_locale ## ADVANCED PRIVATE @@ -555,7 +558,7 @@ Text.equals_ignore_case that locale=Locale.default = used to perform case-insensitive comparisons. Text.to_case_insensitive_key : Locale -> Text Text.to_case_insensitive_key locale=Locale.default = - this.to_case Case.Lower locale . to_case Case.Upper locale + Text_Utils.case_insensitive_key this locale.java_locale ## Compare two texts to discover their ordering. @@ -895,7 +898,7 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of Text_Matcher case_sensitivity -> case case_sensitivity of True -> Text_Utils.contains this term Case_Insensitive locale -> - Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale) + Text_Utils.contains_case_insensitive this term locale.java_locale Regex_Matcher _ _ _ _ _ -> compiled_pattern = matcher.compile term match = compiled_pattern.match this Mode.First @@ -952,27 +955,6 @@ Text.repeat count=1 = https://www.pivotaltracker.com/story/show/181435598 0.up_to (count.max 0) . fold "" acc-> _-> acc + this -## PRIVATE - Utility function taking a range pointing at grapheme clusters and converting to a range on the underlying code points -range_to_char_indices : Text -> Range -> Range ! 
Index_Out_Of_Bounds_Error -range_to_char_indices text range = - len = text.length - start = if range.start < 0 then range.start + len else range.start - end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end) - is_valid = (Range 0 len+1).contains - - case (Pair (is_valid start) (is_valid end)) of - Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len) - Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len) - Pair True True -> - if start>=end then (Range 0 0) else - iterator = BreakIterator.getCharacterInstance - iterator.setText text - - start_index = iterator.next start - end_index = iterator.next (end - start) - Range start_index end_index - ## ALIAS first, last, left, right, mid, substring Creates a new Text by selecting the specified range of the input. @@ -1009,7 +991,7 @@ range_to_char_indices text range = Text.take : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.take range = char_range = case range of - Range _ _ -> here.range_to_char_indices this range + Range _ _ -> Span_Module.range_to_char_indices this range _ -> range.to_char_range this Text_Utils.substring this char_range.start char_range.end @@ -1049,7 +1031,7 @@ Text.take range = Text.drop : (Text_Sub_Range | Range) -> Text ! Index_Out_Of_Bounds_Error Text.drop range = char_range = case range of - Range _ _ -> here.range_to_char_indices this range + Range _ _ -> Span_Module.range_to_char_indices this range _ -> range.to_char_range this if char_range.start == 0 then Text_Utils.drop_first this char_range.end else prefix = Text_Utils.substring this 0 char_range.start @@ -1184,3 +1166,204 @@ Text.trim where=Location.Both what=_.is_whitespace = loop current break_iterator.previous if start_index >= end_index then "" else Text_Utils.substring this start_index end_index + +## ALIAS find, index_of, position_of, span_of + Find the location of the `term` in the input. + Returns a Span representing the location at which the term was found, or + `Nothing` if the term was not found in the input. + + Arguments: + - term: The term to find. + - mode: Specifies if the first or last occurrence of the term should be + returned if there are multiple occurrences within the input. The first + occurrence is returned by default. + - matcher: Specifies how the term is matched against the input: + - If a `Text_Matcher`, the text is compared using case-sensitively rules + specified in the matcher. + - If a `Regex_Matcher`, the `term` is used as a regular expression and + matched using the associated options. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Finding location of a substring. + + "Hello World!".location_of "J" == Nothing + "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!" + "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 7 8) "Hello World!" + + ! Match Length + The function returns not only the index of the match but a `Span` instance + which contains both the start and end indices, allowing to determine the + length of the match. This is useful not only with regex matches (where a + regular expression can have matches of various lengths) but also for case + insensitive matching. 
In case insensitive mode, a single character can + match multiple characters, for example `ß` will match `ss` and `SS`, and + the ligature `ffi` will match `ffi` or `f` etc. Thus in case insensitive + mode, the length of the match can be shorter or longer than the term that + was being matched, so it is extremely important to not rely on the length + of the matched term when analysing the matches as they may have different + lengths. + + > Example + Match length differences in case insensitive matching. + + term = "straße" + text = "MONUMENTENSTRASSE 42" + match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new) + term.length == 6 + match.length == 7 + + ! Matching Grapheme Clusters + In case insensitive mode, a single character can match multiple characters, + for example `ß` will match `ss` and `SS`, and the ligature `ffi` will match + `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to + match only a part of some single grapheme cluster, for example in the text + `ffia` the term `ia` will match just one-third of the first grapheme `ffi`. + Since we do not have the resolution to distinguish such partial matches + (as that would require non-integer indices), so a match which matched just + a part of some grapheme cluster is extended and treated as if it matched + the whole grapheme cluster. + + > Example + Extending matches to full grapheme clusters. + + ligatures = "ffiffl" + ligatures.length == 2 + term_1 = "IFF" + match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new) + term_1.length == 3 + match_1.length == 2 + term_2 = "ffiffl" + match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new) + term_2.length == 6 + match_2.length == 2 + # After being extended to full grapheme clusters, both terms "IFF" and "ffiffl" match the same span of grapheme clusters. + match_1 == match_2 +Text.location_of : Text -> (Matching_Mode.First | Matching_Mode.Last) -> Matcher -> Span | Nothing +Text.location_of term="" mode=Matching_Mode.First matcher=Text_Matcher.new = case matcher of + Text_Matcher case_sensitive -> case case_sensitive of + True -> + codepoint_span = case mode of + Matching_Mode.First -> Text_Utils.span_of this term + Matching_Mode.Last -> Text_Utils.last_span_of this term + if codepoint_span.is_nothing then Nothing else + start = Text_Utils.utf16_index_to_grapheme_index this codepoint_span.start + ## While the codepoint_span may have different code unit length + from our term, the `length` counted in grapheme clusters is + guaranteed to be the same. + end = start + term.length + Span (Range start end) this + Case_Insensitive locale -> case term.is_empty of + True -> case mode of + Matching_Mode.First -> Span (Range 0 0) this + Matching_Mode.Last -> + end = this.length + Span (Range end end) this + False -> + search_for_last = case mode of + Matching_Mode.First -> False + Matching_Mode.Last -> True + case Text_Utils.span_of_case_insensitive this term locale.java_locale search_for_last of + Nothing -> Nothing + grapheme_span -> + Span (Range grapheme_span.start grapheme_span.end) this + Regex_Matcher _ _ _ _ _ -> case mode of + Matching_Mode.First -> + case matcher.compile term . match this Mode.First of + Nothing -> Nothing + match -> match.span 0 . to_grapheme_span + Matching_Mode.Last -> + case matcher.compile term . match this Mode.All of + Nothing -> Nothing + matches -> matches.last.span 0 . 
to_grapheme_span + +## ALIAS find_all, index_of_all, position_of_all, span_of_all + Finds all the locations of the `term` in the input. + If not found, the function returns an empty Vector. + + Arguments: + - term: The term to find. + - matcher: Specifies how the term is matched against the input: + - If a `Text_Matcher`, the text is compared using case-sensitively rules + specified in the matcher. + - If a `Regex_Matcher`, the `term` is used as a regular expression and + matched using the associated options. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + > Example + Finding locations of all occurrences of a substring. + + "Hello World!".location_of_all "J" == [] + "Hello World!".location_of_all "o" . map .start == [4, 7] + + ! Match Length + The function returns not only the index of the match but a `Span` instance + which contains both the start and end indices, allowing to determine the + length of the match. This is useful not only with regex matches (where a + regular expression can have matches of various lengths) but also for case + insensitive matching. In case insensitive mode, a single character can + match multiple characters, for example `ß` will match `ss` and `SS`, and + the ligature `ffi` will match `ffi` or `f` etc. Thus in case insensitive + mode, the length of the match can be shorter or longer than the term that + was being matched, so it is extremely important to not rely on the length + of the matched term when analysing the matches as they may have different + lengths. + + > Example + Match length differences in case insensitive matching. + + term = "strasse" + text = "MONUMENTENSTRASSE ist eine große Straße." + match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new) + term.length == 7 + match . map .length == [7, 6] + + ! Matching Grapheme Clusters + In case insensitive mode, a single character can match multiple characters, + for example `ß` will match `ss` and `SS`, and the ligature `ffi` will match + `ffi` or `f` etc. Thus in this mode, it is sometimes possible for a term to + match only a part of some single grapheme cluster, for example in the text + `ffia` the term `ia` will match just one-third of the first grapheme `ffi`. + Since we do not have the resolution to distinguish such partial matches + (as that would require non-integer indices), so a match which matched just + a part of some grapheme cluster is extended and treated as if it matched + the whole grapheme cluster. + + > Example + Extending matches to full grapheme clusters. + + ligatures = "ffifflFFIFF" + ligatures.length == 7 + match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new) + match_1 . map .length == [2, 3] + match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new) + match_2 . 
map .length == [2, 5] +Text.location_of_all : Text -> Matcher -> [Span] +Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of + Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of + True -> + codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term + grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array + ## While the codepoint_spans may have different code unit lengths + from our term, the `length` counted in grapheme clusters is + guaranteed to be the same. + offset = term.length + grahpeme_ixes . map start-> + end = start+offset + Span (Range start end) this + Case_Insensitive locale -> + grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale + grapheme_spans.map grapheme_span-> + Span (Range grapheme_span.start grapheme_span.end) this + Regex_Matcher _ _ _ _ _ -> + case matcher.compile term . match this Mode.All of + Nothing -> [] + matches -> matches.map m-> m.span 0 . to_grapheme_span diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso new file mode 100644 index 000000000000..d6b0a31b50b4 --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching_Mode.enso @@ -0,0 +1,5 @@ +## Matches the first found instance. +type First + +## Matches the last found instance. +type Last diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso index e4b1c1ff65b1..102318973833 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso @@ -40,7 +40,7 @@ import Standard.Base.Data.Text.Regex.Engine import Standard.Base.Data.Text.Regex.Option as Global_Option import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Polyglot.Java as Java_Ext -import Standard.Base.Data.Text.Span +from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span from Standard.Builtins import Java @@ -183,8 +183,13 @@ type Pattern on the encoding, we normalize all input. build_matcher : Text -> Integer -> Integer -> Java_Matcher build_matcher input start end = - normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else - Text_Utils.normalize input + ## TODO [RW] Normalization had to be disabled - since start and end are + in code unit space, normalization could shift these indices! + This should be addressed when reviewing + See: https://www.pivotaltracker.com/story/show/181524498 + #normalized_input = if this.options.contains Global_Option.Ascii_Matching then input else + # Text_Utils.normalize input + normalized_input = input internal_matcher = this.internal_pattern.matcher normalized_input . region start end if this.options.contains No_Anchoring_Bounds then @@ -262,7 +267,7 @@ type Pattern internal_matcher = this.build_matcher input start end if internal_matcher . find start . not then Nothing else - Match internal_matcher start end + Match internal_matcher start end input Integer -> if mode < 0 then Panic.throw <| Mode_Error "Cannot match a negative number of times." 
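The hunks that follow also make the matching loops robust against zero-width matches: when the end of a match is not past the current offset, the offset is bumped by one so the search always makes progress. A minimal, self-contained illustration of the underlying hazard using plain `java.util.regex` (not the Enso engine itself; the class and variable names are illustrative only):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class EmptyMatchProgress {
  public static void main(String[] args) {
    Matcher m = Pattern.compile("a*").matcher("bb");
    int offset = 0;
    int matches = 0;
    while (offset <= 2 && m.find(offset)) {
      matches++;
      int matchEnd = m.end();
      // Without this guard a zero-width match leaves `offset` unchanged and
      // the loop never terminates; bumping by one guarantees progress.
      offset = matchEnd > offset ? matchEnd : offset + 1;
    }
    System.out.println(matches); // 3 zero-width matches, at offsets 0, 1 and 2
  }
}
```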
@@ -272,13 +277,16 @@ type Pattern go : Integer -> Integer -> Nothing go offset remaining_count = should_continue = remaining_count > 0 - if should_continue.not || (offset > end) then Nothing else + if should_continue.not || (offset >= end) then Nothing else internal_matcher = this.build_matcher input start end found = internal_matcher.find offset if found.not then Nothing else - builder.append (Match internal_matcher start end) - @Tail_Call go (internal_matcher.end 0) remaining_count-1 + builder.append (Match internal_matcher start end input) + match_end = internal_matcher.end 0 + # Ensure progress even if the match is an empty string. + new_offset = if match_end > offset then match_end else offset+1 + @Tail_Call go new_offset remaining_count-1 go start mode vector = builder.to_vector @@ -294,8 +302,11 @@ type Pattern found = internal_matcher.find offset if found.not then Nothing else - builder.append (Match internal_matcher start end) - @Tail_Call go (internal_matcher.end 0) + builder.append (Match internal_matcher start end input) + match_end = internal_matcher.end 0 + # Ensure progress even if the match is an empty string. + new_offset = if match_end > offset then match_end else offset+1 + @Tail_Call go new_offset go start vector = builder.to_vector @@ -304,7 +315,7 @@ type Pattern Mode.Full -> internal_matcher = this.build_matcher input start end if internal_matcher.matches.not then Nothing else - Match internal_matcher start end + Match internal_matcher start end input Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Modes cannot be recursive." @@ -312,7 +323,7 @@ type Pattern Mode.Bounded start end sub_mode -> if start < end then do_match_mode sub_mode start end else Panic.throw Invalid_Bounds_Error - _ -> do_match_mode mode 0 input.length + _ -> do_match_mode mode 0 (Text_Utils.char_length input) ## ADVANCED @@ -334,7 +345,7 @@ type Pattern pattern.matches input matches : Text -> Boolean matches input = case this.match input mode=Mode.Full of - Match _ _ _ -> True + Match _ _ _ _ -> True Vector.Vector _ -> True _ -> False @@ -405,7 +416,7 @@ type Pattern find input mode=Mode.All = matches = this.match input mode case matches of - Match _ _ _ -> matches.group 0 + Match _ _ _ _ -> matches.group 0 Vector.Vector _ -> matches.map (_.group 0) _ -> matches @@ -548,7 +559,7 @@ type Pattern internal_matcher.replaceAll replacement Mode.Full -> case this.match input mode=Mode.Full of - Match _ _ _ -> replacement + Match _ _ _ _ -> replacement Nothing -> input Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Modes cannot be recursive." @@ -556,7 +567,7 @@ type Pattern case mode of Mode.Bounded _ _ _ -> Panic.throw <| Mode_Error "Bounded replacements are not well-formed." - _ -> do_replace_mode mode 0 input.length + _ -> do_replace_mode mode 0 (Text_Utils.char_length input) ## The default implementation of the `Data.Text.Regex.Engine.Match` interface. type Match @@ -570,7 +581,8 @@ type Match match. - region_start: The start of the region over which the match was made. - region_end: The end of the region over which the match was made. - type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) + - input: The input text that was being matched. + type Match (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text) ## Gets the text matched by the group with the provided identifier, or `Nothing` if the group did not participate in the match. 
If no such group @@ -743,10 +755,10 @@ type Match example_Span = match = Examples.match match.span 0 - span : Integer | Text -> Span | Nothing ! Regex.No_Such_Group_Error + span : Integer | Text -> Utf_16_Span | Nothing ! Regex.No_Such_Group_Error span id = case this.group id of Nothing -> Nothing - _ -> Span.new (this.start id) (this.end id) (this.group 0) + _ -> Utf_16_Span (Range (this.start id) (this.end id)) this.input ## Returns the start character index of the match's region. diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso index 342eb2223920..1db9cf80eaab 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Mode.enso @@ -4,11 +4,13 @@ to matching on the `Full` content of the input text. from Standard.Base import all +from Standard.Base.Data.Text.Matching_Mode import First +from Standard.Base.Data.Text.Matching_Mode export First type Mode ## The regex will only match the first instance it finds. - type First + First ## The regex will match up to some `Integer` number of instances. Integer diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso index 4875a7eeb8fc..f357d139719e 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso @@ -7,30 +7,14 @@ example_span = text = "Hello!" - Span.new 0 3 text + Span 0 3 text from Standard.Base import all -import Standard.Base.Data.Range +from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error -## Construct a new `Span`. - - Arguments: - - start: The index of the first character included in the span. - - end: The index of the first character after `start` that is _not_ included - in the span. - - text: The `Text` over which the span exists. This is _optional_. - - > Example - Creating a span over the first three characters of the text "hello!". - - import Standard.Base.Data.Text.Span - - example_span = - text = "Hello!" - Span.new 0 3 text -new : Integer -> Integer -> Text | Nothing -> Span -new start end text=Nothing = Span (start.up_to end) text +polyglot java import org.enso.base.Text_Utils +polyglot java import com.ibm.icu.text.BreakIterator type Span @@ -38,7 +22,7 @@ type Span Arguments: - range: The range of characters over which the span exists. - - text: The text over which the span exists. This is _optional_. + - text: The text over which the span exists. ! What is a Character? A character is defined as an Extended Grapheme Cluster, see Unicode @@ -54,7 +38,7 @@ type Span text = "Hello!" range = 0.up_to 3 Span.Span range text - type Span (range : Range.Range) (text : (Text | Nothing) = Nothing) + type Span (range : Range.Range) (text : Text) ## The index of the first character included in the span. @@ -74,3 +58,112 @@ type Span meaning in most text-processing applications. end : Integer end = this.range.end + + ## The length of the span in extended grapheme clusters. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + length : Integer + length = this.range.length + + ## Converts the span of extended grapheme clusters to a corresponding span + of UTF-16 code units. 
+ + > Example + Find the span of code units corresponding to the span of extended grapheme clusters. + + text = 'ae\u{301}fz' + (Span (Range 1 3) text).to_utf_16_span == (Utf_16_Span (Range 1 4) text) + to_utf_16_span : Utf_16_Span + to_utf_16_span = + Utf_16_Span (here.range_to_char_indices this.text this.range) this.text + +type Utf_16_Span + + ## A representation of a span of UTF-16 code units in Enso's `Text` type. + + Arguments: + - range: The range of code units over which the span exists. + - text: The text over which the span exists. + + > Example + Creating a span over the first three code units of the text 'a\u{301}bc'. + + import Standard.Base.Data.Text.Span + + example_span = + text = 'a\u{301}bc' + Span.Utf_16_Span (Range 0 3) text + type Utf_16_Span (range : Range.Range) (text : Text) + + ## The index of the first code unit included in the span. + start : Integer + start = this.range.start + + ## The index of the first code unit after `start` that is _not_ included in + the span. + end : Integer + end = this.range.end + + ## The length of the span in UTF-16 code units. + length : Integer + length = this.range.length + + ## Returns a span of extended grapheme clusters which is the closest + approximation of this span of code units. + + The resulting span is extended in such a way that every code unit that + was contained by the original span is also contained in a new span. Since + some grapheme clusters consist of multiple code units, after the span was + extended it may also contain code units which were not contained inside + of the original span. + + > Example + Convert a codepoint span to graphemes and back. + + text = 'a\u{301}e\u{302}o\u{303}' + span = Utf_16_Span (Range 1 5) text # The span contains the units [\u{301}, e, \u{302}, o]. + extended = span.to_grapheme_span + extended == Span (Range 0 3) text # The span is extended to the whole string since it contained code units from every grapheme cluster. + extended.to_utf_16_span == Utf_16_Span (Range 0 6) text + to_grapheme_span : Span + to_grapheme_span = if (this.start < 0) || (this.end > Text_Utils.char_length this.text) then Error.throw (Illegal_State_Error "Utf_16_Span indices are out of range of the associated text.") else + if this.end < this.start then Error.throw (Illegal_State_Error "Utf_16_Span invariant violation: start <= end") else + case this.start == this.end of + True -> + grapheme_ix = Text_Utils.utf16_index_to_grapheme_index this.text this.start + Span (Range grapheme_ix grapheme_ix) this.text + False -> + grapheme_ixes = Text_Utils.utf16_indices_to_grapheme_indices this.text [this.start, this.end - 1].to_array + grapheme_first = grapheme_ixes.at 0 + grapheme_last = grapheme_ixes.at 1 + ## We find the grapheme index of the last code unit actually contained within our span and set the + end grapheme to the first grapheme after that. This ensures that if code units associated with + only a part of a grapheme were contained in our original span, the resulting span will be + extended to contain this whole grapheme. + grapheme_end = grapheme_last + 1 + Span (Range grapheme_first grapheme_end) this.text + +## PRIVATE + Utility function taking a range pointing at grapheme clusters and converting + to a range on the underlying code units. +range_to_char_indices : Text -> Range -> Range ! 
Index_Out_Of_Bounds_Error +range_to_char_indices text range = + len = text.length + start = if range.start < 0 then range.start + len else range.start + end = if range.end == Nothing then len else (if range.end < 0 then range.end + len else range.end) + is_valid = (Range 0 len+1).contains + + case (Pair (is_valid start) (is_valid end)) of + Pair False _ -> Error.throw (Index_Out_Of_Bounds_Error range.start len) + Pair True False -> Error.throw (Index_Out_Of_Bounds_Error range.end len) + Pair True True -> + if start>=end then (Range 0 0) else + iterator = BreakIterator.getCharacterInstance + iterator.setText text + + start_index = iterator.next start + end_index = iterator.next (end - start) + Range start_index end_index diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso index 6db8ddbfbf96..2f22d84d9343 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso @@ -79,24 +79,24 @@ type Text_Sub_Range Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text) Before delimiter -> if delimiter.is_empty then (Range 0 0) else - index = Text_Utils.index_of text delimiter - if index == -1 then (Range 0 (Text_Utils.char_length text)) else - (Range 0 index) + span = Text_Utils.span_of text delimiter + if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else + (Range 0 span.start) Before_Last delimiter -> if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else - index = Text_Utils.last_index_of text delimiter - if index == -1 then (Range 0 (Text_Utils.char_length text)) else - (Range 0 index) + span = Text_Utils.last_span_of text delimiter + if span.is_nothing then (Range 0 (Text_Utils.char_length text)) else + (Range 0 span.start) After delimiter -> if delimiter.is_empty then (Range 0 (Text_Utils.char_length text)) else - index = Text_Utils.index_of text delimiter - if index == -1 then (Range 0 0) else - (Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) + span = Text_Utils.span_of text delimiter + if span.is_nothing then (Range 0 0) else + (Range span.end (Text_Utils.char_length text)) After_Last delimiter -> if delimiter.is_empty then (Range 0 0) else - index = Text_Utils.last_index_of text delimiter - if index == -1 then (Range 0 0) else - (Range (index + Text_Utils.char_length delimiter) (Text_Utils.char_length text)) + span = Text_Utils.last_span_of text delimiter + if span.is_nothing then (Range 0 0) else + (Range span.end (Text_Utils.char_length text)) While predicate -> indices = find_sub_range_end text _-> start-> end-> predicate (Text_Utils.substring text start end) . 
not diff --git a/engine/launcher/src/main/resources/application.conf b/engine/launcher/src/main/resources/application.conf index ef9a0daca38b..bf3de5e69207 100644 --- a/engine/launcher/src/main/resources/application.conf +++ b/engine/launcher/src/main/resources/application.conf @@ -1,7 +1,7 @@ akka { loggers = ["akka.event.slf4j.Slf4jLogger"] logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" - version = "2.6.6" + version = "2.6.18" stdout-loglevel = "ERROR" } diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 8f7b1a858fc0..cab662fb181d 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -1,11 +1,19 @@ package org.enso.base; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.CaseMap.Fold; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.StringSearch; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; import java.util.regex.Pattern; +import org.enso.base.text.CaseFoldedString; +import org.enso.base.text.GraphemeSpan; +import org.enso.base.text.Utf16Span; /** Utils for standard library operations on Text. */ public class Text_Utils { @@ -117,6 +125,23 @@ public static boolean equals(String str1, Object str2) { } } + /** + * Checks whether two strings are equal up to Unicode canonicalization and ignoring case. + * + * @param str1 the first string + * @param str2 the second string + * @param locale the locale to use for case folding + * @return the result of comparison + */ + public static boolean equals_ignore_case(String str1, Object str2, Locale locale) { + if (str2 instanceof String) { + Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale); + return compare_normalized(fold.apply(str1), fold.apply((String) str2)) == 0; + } else { + return false; + } + } + /** * Converts an array of codepoints into a string. * @@ -176,6 +201,36 @@ public static boolean contains(String string, String substring) { return searcher.first() != StringSearch.DONE; } + /** + * Checks if {@code substring} is a substring of {@code string}. + * + * @param string the containing string. + * @param substring the contained string. + * @return whether {@code substring} is a substring of {@code string}. + */ + public static boolean contains_case_insensitive(String string, String substring, Locale locale) { + // {@code StringSearch} does not handle empty strings as we would want, so we need these special + // cases. + if (substring.isEmpty()) return true; + if (string.isEmpty()) return false; + + Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale); + StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string)); + return searcher.first() != StringSearch.DONE; + } + + /** + * Transforms the provided string into a form which can be used for case insensitive comparisons. + * + * @param string the string to transform + * @param locale the locale to use - needed to distinguish a special case when handling Turkish + * 'i' characters + * @return a transformed string that can be used for case insensitive comparisons + */ + public static String case_insensitive_key(String string, Locale locale) { + return CaseFoldedString.simpleFold(string, locale); + } + /** * Replaces all occurrences of {@code oldSequence} within {@code str} with {@code newSequence}. 
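The case-insensitive helpers above (`contains_case_insensitive`, `case_insensitive_key`) and `span_of_case_insensitive` further down all follow the same recipe: case fold both strings with the locale-appropriate algorithm and then run ICU's `StringSearch` over the folded text. A hedged sketch of that step, mirroring the `straße` / `MONUMENTENSTRASSE` example from the `location_of` documentation (the class name is illustrative):

```java
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.StringSearch;

public class FoldThenSearch {
  public static void main(String[] args) {
    String haystack = "MONUMENTENSTRASSE 42";
    String needle = "straße";

    // Fold both sides with the same algorithm, then search the folded text.
    String foldedHaystack = CaseMap.fold().apply(haystack); // "monumentenstrasse 42"
    String foldedNeedle = CaseMap.fold().apply(needle);     // "strasse" (ß folds to "ss")

    StringSearch search = new StringSearch(foldedNeedle, foldedHaystack);
    int position = search.first();        // 10
    int length = search.getMatchLength(); // 7, one longer than the 6-character needle
    System.out.println(position + ", " + length);
  }
}
```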
* @@ -200,37 +255,215 @@ public static long char_length(String str) { } /** - * Find the first index of needle in the haystack + * Find the first occurrence of needle in the haystack * * @param haystack the string to search * @param needle the substring that is searched for - * @return index of the first needle or -1 if not found. + * @return a UTF-16 code unit span of the first needle or null if not found. */ - public static long index_of(String haystack, String needle) { + public static Utf16Span span_of(String haystack, String needle) { + if (needle.isEmpty()) return new Utf16Span(0, 0); + if (haystack.isEmpty()) return null; + StringSearch search = new StringSearch(needle, haystack); int pos = search.first(); - return pos == StringSearch.DONE ? -1 : pos; + if (pos == StringSearch.DONE) return null; + return new Utf16Span(pos, pos + search.getMatchLength()); } /** - * Find the last index of needle in the haystack + * Find the last occurrence of needle in the haystack * * @param haystack the string to search * @param needle the substring that is searched for - * @return index of the last needle or -1 if not found. + * @return a UTF-16 code unit span of the last needle or null if not found. */ - public static long last_index_of(String haystack, String needle) { + public static Utf16Span last_span_of(String haystack, String needle) { + if (needle.isEmpty()) { + int afterLast = haystack.length(); + return new Utf16Span(afterLast, afterLast); + } + if (haystack.isEmpty()) return null; + StringSearch search = new StringSearch(needle, haystack); - int pos = search.first(); + int pos = search.last(); + if (pos == StringSearch.DONE) return null; + return new Utf16Span(pos, pos + search.getMatchLength()); + } + + /** + * Find spans of all occurrences of the needle within the haystack. + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack + */ + public static List span_of_all(String haystack, String needle) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `index_of_all` does not support searching for an empty term."); + if (haystack.isEmpty()) return List.of(); + + StringSearch search = new StringSearch(needle, haystack); + ArrayList occurrences = new ArrayList<>(); + long ix; + while ((ix = search.next()) != StringSearch.DONE) { + occurrences.add(new Utf16Span(ix, ix + search.getMatchLength())); + } + return occurrences; + } + + /** + * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to. + * + * @param text the text associated with the index + * @param codeunit_index the UTF-16 index + * @return an index of an extended grapheme cluster that contains the code unit from the input + */ + public static long utf16_index_to_grapheme_index(String text, long codeunit_index) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(text); + if (codeunit_index < 0 || codeunit_index > text.length()) { + throw new IndexOutOfBoundsException( + "Index " + codeunit_index + " is outside of the provided text."); + } + + int grapheme_end = breakIterator.next(); + long grapheme_index = 0; + + while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) { + grapheme_index++; + grapheme_end = breakIterator.next(); + } + return grapheme_index; + } + + /** + * Converts a series of UTF-16 code unit indices to indices of graphemes that these code units + * belong to. 
+ * + *

For performance, it assumes that the provided indices are sorted in a non-decreasing order + * (duplicate entries are permitted). Behaviour is unspecified if an unsorted list is provided. + * + *
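A hedged usage sketch of this batch translation (the element type of the list is assumed to be `Long`, as suggested by the loop in the implementation; the expected values match the `Utils_Spec` tests added later in this patch):

```java
import java.util.Arrays;
import java.util.List;
import org.enso.base.Text_Utils;

public class IndexMappingExample {
  public static void main(String[] args) {
    // "a" followed by a combining acute accent is one grapheme spanning two
    // UTF-16 code units, so units 0 and 1 both map to grapheme 0 and the end
    // index 2 maps to the end grapheme index 1.
    long[] graphemes =
        Text_Utils.utf16_indices_to_grapheme_indices("a\u0301", List.of(0L, 1L, 2L));
    System.out.println(Arrays.toString(graphemes)); // [0, 0, 1]
  }
}
```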

The behaviour is unspecified if indices provided on the input are outside of the range [0, + * text.length()]. + * + * @param text the text associated with the indices + * @param codeunit_indices the array of UTF-16 code unit indices, sorted in non-decreasing order + * @return an array of grapheme indices corresponding to the UTF-16 units from the input + */ + public static long[] utf16_indices_to_grapheme_indices(String text, List codeunit_indices) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(text); + + int grapheme_end = breakIterator.next(); + long grapheme_index = 0; + + long[] result = new long[codeunit_indices.size()]; + int result_ix = 0; + + for (long codeunit_index : codeunit_indices) { + while (grapheme_end <= codeunit_index && grapheme_end != BreakIterator.DONE) { + grapheme_index++; + grapheme_end = breakIterator.next(); + } + result[result_ix++] = grapheme_index; + } + + return result; + } + + /** + * Find the first or last occurrence of needle in the haystack. + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @param locale the locale used for case-insensitive comparisons + * @param searchForLast if set to true, will search for the last occurrence; otherwise searches + * for the first one + * @return an extended-grapheme-cluster span of the first or last needle, or null if none found. + */ + public static GraphemeSpan span_of_case_insensitive( + String haystack, String needle, Locale locale, boolean searchForLast) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `span_of_case_insensitive` does not support searching for an empty term."); + if (haystack.isEmpty()) return null; + + CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); + String foldedNeedle = CaseFoldedString.simpleFold(needle, locale); + StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString()); + int pos; + if (searchForLast) { + pos = search.last(); + } else { + pos = search.first(); + } if (pos == StringSearch.DONE) { - return -1; + return null; + } else { + return findExtendedSpan(foldedHaystack, pos, search.getMatchLength()); } + } + + /** + * Find all occurrences of needle in the haystack + * + * @param haystack the string to search + * @param needle the substring that is searched for + * @param locale the locale used for case-insensitive comparisons + * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack + */ + public static List span_of_all_case_insensitive( + String haystack, String needle, Locale locale) { + if (needle.isEmpty()) + throw new IllegalArgumentException( + "The operation `span_of_all_case_insensitive` does not support searching for an empty term."); + if (haystack.isEmpty()) return List.of(); + + CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); + String foldedNeedle = CaseFoldedString.simpleFold(needle, locale); + + StringSearch search = new StringSearch(foldedNeedle, foldedHaystack.getFoldedString()); + ArrayList result = new ArrayList<>(); - for (int next = search.next(); next != StringSearch.DONE; next = search.next()) { - pos = next; + int pos; + while ((pos = search.next()) != StringSearch.DONE) { + result.add(findExtendedSpan(foldedHaystack, pos, search.getMatchLength())); } - return pos; + return result; + } + + /** + * Finds the grapheme span corresponding to the found match indexed with code units. + * + *

It extends the found span to ensure that graphemes associated with all found code units are + * included in the resulting span. Thus, some additional code units which were not present in the + * original match may also be present due to the extension. + * + *
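A concrete instance of this extension, exercised through `span_of_case_insensitive` defined above (a sketch; the values follow from the case folding of the ffi ligature):

```java
import java.util.Locale;
import org.enso.base.Text_Utils;
import org.enso.base.text.GraphemeSpan;

public class PartialGraphemeMatch {
  public static void main(String[] args) {
    // "\uFB03" is the ffi ligature. Case folding expands it to the three code
    // units "ffi", so the needle "ia" matches code units that straddle the
    // ligature grapheme and the following "a".
    GraphemeSpan span = Text_Utils.span_of_case_insensitive("\uFB03a", "ia", Locale.ROOT, false);
    // The match is widened to whole graphemes, so it covers both of them.
    System.out.println(span.start + ".." + span.end); // 0..2
  }
}
```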

The extension to the left is trivial - we just find the grapheme associated with the first + * code unit and even if that code unit is not the first one of that grapheme, by returning it we + * correctly extend to the left. The extension to the right works by finding the index of the + * grapheme associated with the last code unit actually present in the span, then the end of the + * returned span is set to the next grapheme after it. This correctly handles the edge case where + * only a part of some grapheme was matched. + * + * @param string the folded string with which the positions are associated, containing a cache of + * position mappings + * @param position the position of the match (in code units) + * @param length the length of the match (in code units) + * @return a minimal {@code GraphemeSpan} which contains all code units from the match + */ + private static GraphemeSpan findExtendedSpan(CaseFoldedString string, int position, int length) { + int firstGrapheme = string.codeUnitToGraphemeIndex(position); + if (length == 0) { + return new GraphemeSpan(firstGrapheme, firstGrapheme); + } else { + int lastGrapheme = string.codeUnitToGraphemeIndex(position + length - 1); + int endGrapheme = lastGrapheme + 1; + return new GraphemeSpan(firstGrapheme, endGrapheme); + } } /** diff --git a/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java b/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java new file mode 100644 index 000000000000..75a9aa101a44 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/CaseFoldedString.java @@ -0,0 +1,135 @@ +package org.enso.base.text; + +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.CaseMap; +import com.ibm.icu.text.CaseMap.Fold; +import java.util.Locale; + +/** + * Represents a string transformed using Unicode Case Folding which can be used for case insensitive + * comparisons. + * + *
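A brief usage sketch of this class (the printed values follow from ICU's full case folding, under which `ß` expands to "ss"):

```java
import java.util.Locale;
import org.enso.base.text.CaseFoldedString;

public class CaseFoldedStringExample {
  public static void main(String[] args) {
    CaseFoldedString folded = CaseFoldedString.fold("Straße", Locale.ROOT);
    System.out.println(folded.getFoldedString()); // "strasse"

    // Code units 4 and 5 of the folded text both come from the single
    // grapheme "ß" at index 4 of the original string.
    System.out.println(folded.codeUnitToGraphemeIndex(4)); // 4
    System.out.println(folded.codeUnitToGraphemeIndex(5)); // 4
    System.out.println(folded.codeUnitToGraphemeIndex(7)); // 6, the end position
  }
}
```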

It contains facilities for converting indices in the transformed string to corresponding + * indices back in the original string. + */ +public class CaseFoldedString { + private final String foldedString; + + /** + * A mapping from code units in the transformed string to their corresponding graphemes in the + * original string. + * + *

The mapping must be valid from indices from 0 to @{code foldedString.length()+1} + * (inclusive). + */ + private final int[] graphemeIndexMapping; + + /** + * Constructs a new instance of the folded string. + * + * @param foldeString the string after applying the case folding transformation + * @param graphemeIndexMapping a mapping created during the transformation which maps code units + * in the transformed string to their corresponding graphemes in the original string + */ + private CaseFoldedString(String foldeString, int[] graphemeIndexMapping) { + this.foldedString = foldeString; + this.graphemeIndexMapping = graphemeIndexMapping; + } + + /** + * Maps a code unit in the folded string to the corresponding grapheme in the original string. + * + * @param codeunitIndex the index of the code unit in the folded string, valid indices range from + * 0 to {@code getFoldedString().length()+1} (inclusive), allowing to also ask for the + * position of the end code unit which is located right after the end of the string - which + * should always map to the analogous end grapheme. + * @return the index of the grapheme from the original string that after applying the + * transformation contains the requested code unit + */ + public int codeUnitToGraphemeIndex(int codeunitIndex) { + if (codeunitIndex < 0 || codeunitIndex > this.foldedString.length()) { + throw new IndexOutOfBoundsException(codeunitIndex); + } + return graphemeIndexMapping[codeunitIndex]; + } + + /** Returns the transformed string. */ + public String getFoldedString() { + return foldedString; + } + + /** + * Folds a string remembering the mapping from code units to its original grapheme cluster + * indices. + * + * @param charSequence a sequence of UTF-16 characters to transform + * @param locale the locale to use as a reference for case folding; it is needed because Turkish + * and Azerbaijani locales handle casing of the letter `i` in a different way than other + * locales + * @return a {@code CaseFoldedString} instance which contains the transformed string and allows to + * map its code units to original grapheme clusters + */ + public static CaseFoldedString fold(CharSequence charSequence, Locale locale) { + BreakIterator breakIterator = BreakIterator.getCharacterInstance(); + breakIterator.setText(charSequence); + StringBuilder stringBuilder = new StringBuilder(charSequence.length()); + Fold foldAlgorithm = caseFoldAlgorithmForLocale(locale); + IntArrayBuilder index_mapping = new IntArrayBuilder(charSequence.length() + 1); + + // We rely on the fact that ICU Case Folding is _not_ context-sensitive, i.e. the mapping of + // each grapheme cluster is independent of surrounding ones. Regular casing is + // context-sensitive. + int current = breakIterator.current(); + int next; + int grapheme_index = 0; + while ((next = breakIterator.next()) != BreakIterator.DONE) { + CharSequence grapheme = new StringSlice(charSequence, current, next); + String foldedGrapheme = foldAlgorithm.apply(grapheme); + stringBuilder.append(foldedGrapheme); + for (int i = 0; i < foldedGrapheme.length(); ++i) { + index_mapping.add(grapheme_index); + } + + grapheme_index++; + current = next; + } + + // The mapping should also be able to handle a {@code str.length()} query, so we add one more + // element to the mapping pointing to a non-existent grapheme after the end of the text. 
+ index_mapping.add(grapheme_index); + + return new CaseFoldedString( + stringBuilder.toString(), index_mapping.unsafeGetStorageAndInvalidateTheBuilder()); + } + + /** + * A helper function which folds the string without remembering the index mapping. + * + *

It should be used when the index mapping is not needed, as its implementation is much more + * efficient. + * + * @param string a sequence of UTF-16 characters to transform + * @param locale the locale to use as a reference for case folding; it is needed because Turkish + * and Azerbaijani locales handle casing of the letter `i` in a different way than the others + * @return the folded string + */ + public static String simpleFold(CharSequence string, Locale locale) { + return caseFoldAlgorithmForLocale(locale).apply(string); + } + + private static final Locale AZ_LOCALE = new Locale("az"); + private static final Locale TR_LOCALE = new Locale("tr"); + + /** + * Returns a case folding algorithm appropriate for the given locale. + * + *
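The practical difference is visible directly in ICU (a sketch; the outputs follow from the Unicode case-folding data, and the behaviour matches the Turkish-locale cases in the `Text_Spec` changes later in this patch):

```java
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.CaseMap.Fold;

public class TurkicFolding {
  public static void main(String[] args) {
    Fold plain = CaseMap.fold();
    Fold turkic = CaseMap.fold().turkic();

    // Default folding maps capital I to "i"; Turkic folding maps it to the
    // dotless "ı" instead.
    System.out.println(plain.apply("I"));  // "i"
    System.out.println(turkic.apply("I")); // "ı"

    // The dotted capital İ folds to a plain "i" only under the Turkic rules.
    System.out.println(turkic.apply("İ")); // "i"
  }
}
```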

The algorithm is locale-dependent because Turkish and Azerbaijani locales handle casing of + * the letter `i` in a different way than other locales. + */ + public static Fold caseFoldAlgorithmForLocale(Locale locale) { + if (locale.equals(AZ_LOCALE) || locale.equals(TR_LOCALE)) { + return CaseMap.fold().turkic(); + } + return CaseMap.fold(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java b/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java new file mode 100644 index 000000000000..8ba21e802415 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/GraphemeSpan.java @@ -0,0 +1,28 @@ +package org.enso.base.text; + +/** + * Represents a span of characters (understood as extended grapheme clusters) within a Text. + * + *

The start index indicates the first grapheme of the span and the end index indicates the first + * grapheme after the end of the span. + * + *

Represents an empty span if start and end indices are equal. Such an empty span refers to the + * space just before the grapheme corresponding to index start. + */ +public class GraphemeSpan { + + public final long start, end; + + /** + * Constructs a span of characters (understood as extended grapheme clusters). + * + * @param start index of the first extended grapheme cluster contained within the span (or + * location of the span if it is empty) + * @param end index of the first extended grapheme cluster after start that is not contained + * within the span + */ + public GraphemeSpan(long start, long end) { + this.start = start; + this.end = end; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java b/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java new file mode 100644 index 000000000000..23b56fdaac0d --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/IntArrayBuilder.java @@ -0,0 +1,65 @@ +package org.enso.base.text; + +/** A helper to efficiently build an array of unboxed integers of arbitrary length. */ +public class IntArrayBuilder { + private int[] storage; + private int length; + + /** + * Constructs an empty builder with a given initial capacity. + * + * @param initialCapacity the initial capacity of the builder, can be used to avoid expanding the + * storage if the amount of elements can be estimated in advance. + */ + public IntArrayBuilder(int initialCapacity) { + length = 0; + storage = new int[initialCapacity]; + } + + /** Adds a new element to the array, expanding it if necessary. */ + public void add(int x) { + if (length >= storage.length) { + grow(); + } + + storage[length++] = x; + } + + /** + * Expands the storage to fit more elements. + * + *
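A short usage sketch of the builder (the capacities in the comment follow the growth rule described in the next paragraph; the class name is illustrative):

```java
import org.enso.base.text.IntArrayBuilder;

public class IntArrayBuilderExample {
  public static void main(String[] args) {
    IntArrayBuilder builder = new IntArrayBuilder(4);
    for (int i = 0; i < 10; i++) {
      builder.add(i * i); // internal capacity re-grows 4 -> 6 -> 9 -> 13 as needed
    }

    int count = builder.getLength(); // 10
    int[] storage = builder.unsafeGetStorageAndInvalidateTheBuilder();
    // `storage.length` may exceed `count`; only the first `count` entries are valid.
    for (int i = 0; i < count; i++) {
      System.out.print(storage[i] + " "); // 0 1 4 9 16 25 36 49 64 81
    }
  }
}
```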

The storage grows by 50% and is always increased by at least one. The 50% growth is chosen + * so that the amortized cost of adding a new element to the array stays constant. + */ + private void grow() { + int newCapacity = storage.length + (storage.length / 2); + if (newCapacity <= storage.length) { + newCapacity = storage.length + 1; + } + + int[] newStorage = new int[newCapacity]; + System.arraycopy(this.storage, 0, newStorage, 0, length); + this.storage = newStorage; + } + + /** Returns the amount of elements already added to the storage. */ + public int getLength() { + return length; + } + + /** + * Returns the underlying storage of the builder. + * + *

This method avoids copying for performance, so it should be used with care. The storage can + * actually have more elements than were added, so the caller should only query the + * first {@code getLength()} elements; querying any elements beyond that yields unspecified values. + * + *

After calling this method, the builder is invalidated and cannot be used anymore. Any usage + * of the builder afterwards will result in a {@code NullPointerException}. + */ + public int[] unsafeGetStorageAndInvalidateTheBuilder() { + int[] tmp = storage; + this.storage = null; + return tmp; + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java b/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java new file mode 100644 index 000000000000..5374e3ff1129 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/StringSlice.java @@ -0,0 +1,34 @@ +package org.enso.base.text; + +/** A char sequence which allows to access a slice of another char sequence without copying. */ +class StringSlice implements CharSequence { + private final CharSequence text; + private final int subStart, subEnd; + + /** Constructs a slice of the given text. */ + public StringSlice(CharSequence text, int start, int end) { + this.text = text; + this.subStart = start; + this.subEnd = end; + } + + @Override + public int length() { + return subEnd - subStart; + } + + @Override + public char charAt(int index) { + return text.charAt(subStart + index); + } + + @Override + public CharSequence subSequence(int start, int end) { + return new StringSlice(text, subStart + start, subStart + end); + } + + @Override + public String toString() { + return text.subSequence(subStart, subEnd).toString(); + } +} diff --git a/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java b/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java new file mode 100644 index 000000000000..a4a3b31419fa --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/Utf16Span.java @@ -0,0 +1,18 @@ +package org.enso.base.text; + +/** + * Represents a span of UTF-16 code units within a String. + * + *

The start index indicates the first code unit of the span and the end index indicates the + * first code unit after the end of the span. + */ +public class Utf16Span { + + public final long start, end; + + /** Constructs a span of UTF-16 code units. */ + public Utf16Span(long start, long end) { + this.start = start; + this.end = end; + } +} diff --git a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso b/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso index 9cd711138329..1116d350254c 100644 --- a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso +++ b/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso @@ -6,7 +6,7 @@ import Standard.Base.Data.Text.Regex import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Option as Global_Option -import Standard.Base.Data.Text.Span +from Standard.Base.Data.Text.Span as Span_Module import Utf_16_Span polyglot java import java.util.regex.Pattern as Java_Pattern @@ -182,6 +182,22 @@ spec = match.at 1 . group 0 . should_equal "ef" match.at 2 . group 0 . should_equal "gh" + Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <| + pattern = engine.compile "" [] + match_1 = pattern.match "" mode=Mode.All + match_1.length . should_equal 1 + match_1.at 0 . start 0 . should_equal 0 + match_1.at 0 . end 0 . should_equal 0 + + match_2 = pattern.match "ABC" mode=Mode.All + match_2.length . should_equal 4 + match_2.at 0 . start 0 . should_equal 0 + match_2.at 0 . end 0 . should_equal 0 + match_2.at 1 . start 0 . should_equal 1 + match_2.at 1 . end 0 . should_equal 1 + match_2.at 3 . start 0 . should_equal 3 + match_2.at 3 . end 0 . should_equal 3 + Test.group "The default regex engine's Pattern.find" <| engine = Default_Engine.new @@ -261,11 +277,23 @@ spec = match.at 1 . should_equal "ef" match.at 2 . should_equal "gh" + match_2 = pattern.find input mode=(Mode.Bounded 2 8 mode=10) + match_2.length . should_equal 3 + match_2.at 0 . should_equal "cd" + match_2.at 1 . should_equal "ef" + match_2.at 2 . should_equal "gh" + + match_3 = pattern.find input mode=(Mode.Bounded 2 8 mode=2) + match_3.length . should_equal 2 + match_3.at 0 . should_equal "cd" + match_3.at 1 . should_equal "ef" + Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"] + engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"] engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Mode.Bounded 1 3) . should_equal ["1", "b"] @@ -501,10 +529,10 @@ spec = match . should_be_a Default_Engine.Match Test.specify "should get the span of a group by index" <| - match.span 1 . should_equal (Span.new 0 6 input) + match.span 1 . should_equal (Utf_16_Span (Range 0 6) input) Test.specify "should get the span of a group by name" <| - match.span "letters" . should_equal (Span.new 6 18 input) + match.span "letters" . 
should_equal (Utf_16_Span (Range 6 18) input)
 
     Test.specify "should return Nothing if the group didn't match" <|
         match.span 3 . should_equal Nothing
 
diff --git a/test/Tests/src/Data/Text/Regex_Spec.enso b/test/Tests/src/Data/Text/Regex_Spec.enso
index 55aa8a6a93fb..4d7b77cf2c45 100644
--- a/test/Tests/src/Data/Text/Regex_Spec.enso
+++ b/test/Tests/src/Data/Text/Regex_Spec.enso
@@ -26,3 +26,4 @@ spec =
         pattern = "http://example.com"
         Regex.escape pattern . should_equal "\Qhttp://example.com\E"
 
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text/Span_Spec.enso b/test/Tests/src/Data/Text/Span_Spec.enso
index e8ae45a9c48e..2de0ac5096e8 100644
--- a/test/Tests/src/Data/Text/Span_Spec.enso
+++ b/test/Tests/src/Data/Text/Span_Spec.enso
@@ -2,20 +2,36 @@ from Standard.Base import all
 
 import Standard.Test
 
-import Standard.Base.Data.Text.Span
+from Standard.Base.Data.Text.Span as Span_Module import Span, Utf_16_Span
 
 spec = Test.group "Text.Span" <|
     Test.specify "should be able to be created over a text" <|
         text = "Hello!"
-        span = Span.new 0 3 text
+        span = Span (Range 0 3) text
         span.start . should_equal 0
         span.end . should_equal 3
         span.text . should_equal text
 
-    Test.specify "should be able to be created without a text" <|
-        span = Span.new 5 8
-        span.start . should_equal 5
-        span.end . should_equal 8
-        span.text . should_equal Nothing
+    Test.specify "should be able to be converted to code units" <|
+        text = 'ae\u{301}fz'
+        (Span (Range 1 3) text).to_utf_16_span . should_equal (Utf_16_Span (Range 1 4) text)
 
+    Test.specify "should expand to the associated grapheme clusters" <|
+        text = 'a\u{301}e\u{302}o\u{303}'
+        span = Utf_16_Span (Range 1 5) text
+        extended = span.to_grapheme_span
+        extended . should_equal (Span (Range 0 3) text)
+        extended.to_utf_16_span . should_equal (Utf_16_Span (Range 0 6) text)
+
+        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+        Utf_16_Span (Range 0 1) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+        Utf_16_Span (Range 0 0) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
+        Utf_16_Span (Range 1 1) text . to_grapheme_span . should_equal (Span (Range 0 0) text)
+        Utf_16_Span (Range 2 2) text . to_grapheme_span . should_equal (Span (Range 1 1) text)
+
+        Utf_16_Span (Range 0 4) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
+        Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
+        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
+
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text/Utils_Spec.enso b/test/Tests/src/Data/Text/Utils_Spec.enso
new file mode 100644
index 000000000000..b15d7ced5177
--- /dev/null
+++ b/test/Tests/src/Data/Text/Utils_Spec.enso
@@ -0,0 +1,61 @@
+from Standard.Base import all
+
+polyglot java import org.enso.base.Text_Utils
+polyglot java import org.enso.base.text.CaseFoldedString
+
+import Standard.Test
+
+polyglot java import com.ibm.icu.text.BreakIterator
+
+spec =
+    Test.group "Text_Utils" <|
+        kshi = '\u0915\u094D\u0937\u093F'
+        facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
+        text = "a"+kshi+facepalm+'e\u{301}Z'
+        codepoints_to_graphemes = _.flatten <| text.characters.map_with_index ix-> grapheme->
+            codepoints_count = grapheme.utf_16.length
+            Vector.new codepoints_count _->ix
+
+        Test.specify "should correctly translate a codepoint index to a grapheme index" <|
+            codepoints_to_graphemes . 
each_with_index codepoint_ix-> grapheme_ix->
+                found_grapheme_ix = Text_Utils.utf16_index_to_grapheme_index text codepoint_ix
+                found_grapheme_ix.should_equal grapheme_ix
+
+            Text_Utils.utf16_index_to_grapheme_index text text.utf_16.length . should_equal text.length
+            Text_Utils.utf16_index_to_grapheme_index "" 0 . should_equal 0
+
+            Text_Utils.utf16_index_to_grapheme_index 'ą' 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'ą' 1 . should_equal 1
+
+            Text_Utils.utf16_index_to_grapheme_index "aB" 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index "aB" 1 . should_equal 1
+            Text_Utils.utf16_index_to_grapheme_index "aB" 2 . should_equal 2
+
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 0 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 1 . should_equal 0
+            Text_Utils.utf16_index_to_grapheme_index 'a\u{301}' 2 . should_equal 1
+
+        Test.specify "should correctly translate a series of codepoint indices to grapheme indices in a batch" <|
+            translate_indices text ixes =
+                Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
+            codepoint_indices = Vector.new text.utf_16.length ix->ix
+            translate_indices text codepoint_indices . should_equal codepoints_to_graphemes
+
+            translate_indices "" [0] . should_equal [0]
+            translate_indices 'ą' [0, 1] . should_equal [0, 1]
+            translate_indices "aB" [0, 1, 2] . should_equal [0, 1, 2]
+            translate_indices 'a\u{301}' [0, 1, 2] . should_equal [0, 0, 1]
+
+        Test.specify "should correctly case-fold a string and translate code units to graphemes" <|
+            text = 'a\u{301}AZßﬃą'
+            folded = CaseFoldedString.fold text Locale.default.java_locale
+            folded.getFoldedString . should_equal 'a\u{301}azssffią'
+
+            codeunits = Vector.new folded.getFoldedString.utf_16.length+1 ix->ix
+            grapheme_ixes = codeunits.map ix->
+                folded.codeUnitToGraphemeIndex ix
+            grapheme_ixes . should_equal [0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6]
+
+            Test.expect_panic_with (folded.codeUnitToGraphemeIndex -1) Polyglot_Error
+            Test.expect_panic_with (folded.codeUnitToGraphemeIndex folded.getFoldedString.utf_16.length+1) Polyglot_Error
+
+main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso
index 86a1fb209ba5..63ddb2514a33 100644
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@@ -4,7 +4,10 @@ from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error
 import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
 import Standard.Base.Data.Locale
 import Standard.Base.Data.Text.Split_Kind
+from Standard.Base.Data.Text.Span as Span_Module import Span
 from Standard.Base.Data.Text.Text_Sub_Range import all
+import Standard.Base.Data.Text.Regex.Mode
+import Standard.Base.Data.Text.Matching_Mode
 import Standard.Test
 
 type Auto a
@@ -87,9 +90,8 @@ spec =
         'e\u0301' . equals_ignore_case 'e\u0303' . should_be_false
 
         "I" . equals_ignore_case "i" . should_be_true
-        "I" . equals_ignore_case "ı" . should_be_true
-        "İ" . equals_ignore_case "i" . should_be_false
         "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
+        "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
         "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
 
         "Kongressstraße"=="Kongressstrasse" . should_be_false
@@ -199,15 +201,20 @@ spec =
         'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 6) . should_equal 'Wo\u{301}rld!'
         'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Last 5) . should_equal 'o\u{301}rld!' 
'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e\u{302}') . should_equal 'H' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'ê') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (Before 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} W' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'ö') . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Before_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{302}') . should_equal 'llo\u{308} Wo\u{301}rld!' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'ê') . should_equal 'llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e\u{308}') . should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.take (After 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o\u{308}') . should_equal 'rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'ö') . should_equal 'rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (After_Last 'o') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e\u{302}') . should_equal 'H' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='ê') . should_equal 'H' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (While c->c!='e') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range 3 5) . should_equal 'lo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.take (Range -3 -1) . should_equal 'ld' @@ -232,6 +239,30 @@ spec = '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 Nothing) . should_equal '😙😉☺' '✨🚀🚧😍😃😍😎😙😉☺'.take (Range -3 -1) . should_equal '😙😉' + Test.specify "take should correctly handle edge cases" <| + "".take First.new . should_equal "" + "".take Last.new . should_equal "" + + "".take (After "a") . should_equal "" + "".take (After_Last "a") . should_equal "" + "".take (Before "a") . should_equal "" + "".take (Before_Last "a") . should_equal "" + + "".take (After "") . should_equal "" + "".take (After_Last "") . should_equal "" + "".take (Before "") . should_equal "" + "".take (Before_Last "") . should_equal "" + + "".take (While _->True) . should_equal "" + + "".take (Range 0 0) . should_equal "" + 'ABC\u{301}'.take (Range 0 0) . should_equal "" + + 'ABC\u{301}'.take (After "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.take (After_Last "") . should_equal "" + 'ABC\u{301}'.take (Before "") . should_equal "" + 'ABC\u{301}'.take (Before_Last "") . should_equal 'ABC\u{301}' + Test.specify "drop should work as in the examples" <| "Hello World!".drop First.new . should_equal "ello World!" "Hello World!".drop (First 5) . should_equal " World!" @@ -269,15 +300,20 @@ spec = 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 6) . should_equal 'He\u{302}llo\u{308} ' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Last 5) . should_equal 'He\u{302}llo\u{308} W' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'ê') . should_equal 'e\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (Before 'e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o\u{308}') . should_equal 'o\u{308}rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'ö') . should_equal 'o\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Before_Last 'o') . 
should_equal '' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{302}') . should_equal 'He\u{302}' + 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'ê') . should_equal 'He\u{302}' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{301}rld!'.drop (After 'e') . should_equal 'He\u{302}llo\u{308} Wo\u{301}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o\u{308}') . should_equal 'He\u{302}llo\u{308} Wo\u{308}' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'ö') . should_equal 'He\u{302}llo\u{308} Wo\u{308}' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (After_Last 'o') . should_equal 'He\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e\u{302}') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!' + 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='ê') . should_equal 'e\u{302}llo\u{308} Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (While c->c!='e') . should_equal '' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range 3 5) . should_equal 'He\u{302}l Wo\u{308}rld!' 'He\u{302}llo\u{308} Wo\u{308}rld!'.drop (Range -3 -1) . should_equal 'He\u{302}llo\u{308} Wo\u{308}r!' @@ -301,6 +337,30 @@ spec = '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 Nothing) . should_equal '✨🚀🚧😍😃😍😎' '✨🚀🚧😍😃😍😎😙😉☺'.drop (Range -3 -1) . should_equal '✨🚀🚧😍😃😍😎☺' + Test.specify "drop should correctly handle edge cases" <| + "".drop First.new . should_equal "" + "".drop Last.new . should_equal "" + + "".drop (After "a") . should_equal "" + "".drop (After_Last "a") . should_equal "" + "".drop (Before "a") . should_equal "" + "".drop (Before_Last "a") . should_equal "" + + "".drop (After "") . should_equal "" + "".drop (After_Last "") . should_equal "" + "".drop (Before "") . should_equal "" + "".drop (Before_Last "") . should_equal "" + + "".drop (While _->True) . should_equal "" + + "".drop (Range 0 0) . should_equal "" + 'ABC\u{301}'.drop (Range 0 0) . should_equal 'ABC\u{301}' + + 'ABC\u{301}'.drop (After "") . should_equal '' + 'ABC\u{301}'.drop (After_Last "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.drop (Before "") . should_equal 'ABC\u{301}' + 'ABC\u{301}'.drop (Before_Last "") . should_equal '' + Test.specify "should correctly convert character case" <| "FooBar Baz".to_case Case.Lower . should_equal "foobar baz" "FooBar Baz".to_case Case.Upper . should_equal "FOOBAR BAZ" @@ -465,10 +525,7 @@ spec = ## This shows what regex is doing by default and we cannot easily fix that. 's\u{301}' . contains 's' (Regex_Matcher.new) . should_be_true - ## This would normally be false, but we perform input normalization - to get results that are consistent regardless of if the input was - normalized or not. - 'ś' . contains 's' (Regex_Matcher.new) . should_be_true + 'ś' . contains 's' (Regex_Matcher.new) . should_be_false 's\u{301}' . contains 'ś' (Regex_Matcher.new) . should_be_true 'ś' . contains 's\u{301}' (Regex_Matcher.new) . should_be_true @@ -767,6 +824,157 @@ spec = '✨🚀🚧'*2 . should_equal '✨🚀🚧✨🚀🚧' + Test.specify "location_of should work as shown in examples" <| + example_1 = + "Hello World!".location_of "J" == Nothing + "Hello World!".location_of "o" == Span (Range 4 5) "Hello World!" + "Hello World!".location_of "o" mode=Matching_Mode.Last == Span (Range 4 5) "Hello World!" + + example_2 = + term = "straße" + text = "MONUMENTENSTRASSE 42" + match = text . location_of term matcher=(Text_Matcher Case_Insensitive.new) + term.length . should_equal 6 + match.length . 
should_equal 7 + + example_3 = + ligatures = "ffiffl" + ligatures.length . should_equal 2 + term_1 = "IFF" + match_1 = ligatures . location_of term_1 matcher=(Text_Matcher Case_Insensitive.new) + term_1.length . should_equal 3 + match_1.length . should_equal 2 + term_2 = "ffiffl" + match_2 = ligatures . location_of term_2 matcher=(Text_Matcher Case_Insensitive.new) + term_2.length . should_equal 6 + match_2.length . should_equal 2 + match_1 . should_equal match_2 + + example_4 = + "Hello World!".location_of_all "J" . should_equal [] + "Hello World!".location_of_all "o" . map .start . should_equal [4, 7] + + example_5 = + term = "strasse" + text = "MONUMENTENSTRASSE ist eine große Straße." + match = text . location_of_all term matcher=(Text_Matcher Case_Insensitive.new) + term.length . should_equal 7 + match . map .length . should_equal [7, 6] + + example_6 = + ligatures = "ffifflFFIFF" + ligatures.length . should_equal 7 + match_1 = ligatures . location_of_all "IFF" matcher=(Text_Matcher Case_Insensitive.new) + match_1 . map .length . should_equal [2, 3] + match_2 = ligatures . location_of_all "ffiff" matcher=(Text_Matcher Case_Insensitive.new) + match_2 . map .length . should_equal [2, 5] + + # Put them in blocks to avoid name clashes. + example_1 + example_2 + example_3 + example_4 + example_5 + example_6 + + Test.specify "should allow to find location_of occurrences within a text" <| + "Hello World!".location_of_all "J" . should_equal [] + "Hello World!".location_of_all "o" . map .start . should_equal [4, 7] + + accents = 'a\u{301}e\u{301}o\u{301}' + accents.location_of accent_1 . should_equal (Span (Range 1 2) accents) + + "".location_of "foo" . should_equal Nothing + "".location_of "foo" mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" . should_equal [] + "".location_of "" . should_equal (Span (Range 0 0) "") + "".location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + "".location_of_all "" . should_equal [Span (Range 0 0) ""] + abc = 'A\u{301}ßC' + abc.location_of "" . should_equal (Span (Range 0 0) abc) + abc.location_of "" mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + abc.location_of_all "" . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + + Test.specify "should allow case insensitive matching in location_of" <| + hello = "Hello WORLD!" + case_insensitive = Text_Matcher Case_Insensitive.new + hello.location_of "world" . should_equal Nothing + hello.location_of "world" matcher=case_insensitive . should_equal (Span (Range 6 11) hello) + + hello.location_of "o" mode=Mode.First matcher=case_insensitive . should_equal (Span (Range 4 5) hello) + hello.location_of "o" mode=Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 7 8) hello) + + accents = 'A\u{301}E\u{301}O\u{301}' + accents.location_of accent_1 matcher=case_insensitive . should_equal (Span (Range 1 2) accents) + + "Strasse".location_of "ß" matcher=case_insensitive . should_equal (Span (Range 4 6) "Strasse") + "Monumentenstraße 42".location_of "STRASSE" matcher=case_insensitive . should_equal (Span (Range 10 16) "Monumentenstraße 42") + + '\u0390'.location_of '\u03B9\u0308\u0301' matcher=case_insensitive . should_equal (Span (Range 0 1) '\u0390') + 'ԵՒ'.location_of 'և' . should_equal Nothing + 'ԵՒ'.location_of 'և' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ԵՒ') + 'և'.location_of 'ԵՒ' matcher=case_insensitive . 
should_equal (Span (Range 0 1) 'և') + + ligatures = 'ffafffiflffifflſtstZ' + ligatures.location_of 'FFI' matcher=case_insensitive . should_equal (Span (Range 3 5) ligatures) + ligatures.location_of 'FF' matcher=case_insensitive . should_equal (Span (Range 0 2) ligatures) + ligatures.location_of 'ff' matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 7 8) ligatures) + ligatures.location_of_all 'ff' . should_equal [Span (Range 0 2) ligatures] + ligatures.location_of_all 'FF' matcher=case_insensitive . should_equal [Span (Range 0 2) ligatures, Span (Range 3 4) ligatures, Span (Range 6 7) ligatures, Span (Range 7 8) ligatures] + ligatures.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 3 5) ligatures, Span (Range 6 7) ligatures] + 'fffi'.location_of_all 'ff' matcher=case_insensitive . should_equal [Span (Range 0 2) 'fffi'] + 'fffi'.location_of_all 'ffi' . should_equal [] + 'fffi'.location_of_all 'ffi' matcher=case_insensitive . should_equal [Span (Range 1 4) 'fffi'] + 'FFFI'.location_of 'ffi' matcher=case_insensitive . should_equal (Span (Range 1 4) 'FFFI') + + 'ffiffl'.location_of 'IF' matcher=case_insensitive . should_equal (Span (Range 0 2) 'ffiffl') + 'ffiffl'.location_of 'F' Matching_Mode.Last matcher=case_insensitive . should_equal (Span (Range 1 2) 'ffiffl') + 'ffiffl'.location_of_all 'F' matcher=case_insensitive . should_equal [Span (Range 0 1) 'ffiffl', Span (Range 0 1) 'ffiffl', Span (Range 1 2) 'ffiffl', Span (Range 1 2) 'ffiffl'] + 'aaffibb'.location_of_all 'af' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb'] + 'aaffibb'.location_of_all 'affi' matcher=case_insensitive . should_equal [Span (Range 1 3) 'aaffibb'] + 'aaffibb'.location_of_all 'ib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb'] + 'aaffibb'.location_of_all 'ffib' matcher=case_insensitive . should_equal [Span (Range 2 4) 'aaffibb'] + + "".location_of "foo" matcher=case_insensitive . should_equal Nothing + "".location_of "foo" matcher=case_insensitive mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" matcher=case_insensitive . should_equal [] + "".location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) "") + "".location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + "".location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) ""] + abc = 'A\u{301}ßC' + abc.location_of "" matcher=case_insensitive . should_equal (Span (Range 0 0) abc) + abc.location_of "" matcher=case_insensitive mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + abc.location_of_all "" matcher=case_insensitive . should_equal [Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + + Test.specify "should allow regexes in location_of" <| + hello = "Hello World!" + regex = Regex_Matcher.new + regex_insensitive = Regex_Matcher.new case_sensitive=Case_Insensitive.new + hello.location_of ".o" Matching_Mode.First matcher=regex . should_equal (Span (Range 3 5) hello) + hello.location_of ".o" Matching_Mode.Last matcher=regex . should_equal (Span (Range 6 8) hello) + hello.location_of_all ".o" matcher=regex . map .start . should_equal [3, 6] + + "foobar".location_of "BAR" Mode.First matcher=regex_insensitive . should_equal (Span (Range 3 6) "foobar") + + ## Regex matching does not do case folding + "Strasse".location_of "ß" Mode.First matcher=regex_insensitive . 
should_equal Nothing + + ## But it should handle the Unicode normalization + accents = 'a\u{301}e\u{301}o\u{301}' + accents.location_of accent_1 Mode.First matcher=regex . should_equal (Span (Range 1 2) accents) + Test.specify "should correctly handle regex edge cases in location_of" pending="Figure out how to make Regex correctly handle empty patterns." <| + regex = Regex_Matcher.new + "".location_of "foo" matcher=regex . should_equal Nothing + "".location_of "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing + "".location_of_all "foo" matcher=regex . should_equal [] + "".location_of "" matcher=regex . should_equal (Span (Range 0 0) "") + "".location_of_all "" matcher=regex . should_equal [Span (Range 0 0) ""] + "".location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 0 0) "") + abc = 'A\u{301}ßC' + abc.location_of "" matcher=regex . should_equal (Span (Range 0 0) abc) + abc.location_of_all "" matcher=regex . should_equal [Span (Range 0 0) abc, Span (Range 0 0) abc, Span (Range 1 1) abc, Span (Range 2 2) abc, Span (Range 3 3) abc] + abc.location_of "" matcher=regex mode=Matching_Mode.Last . should_equal (Span (Range 3 3) abc) + Test.group "Regex matching" <| Test.specify "should be possible on text" <| match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First diff --git a/test/Tests/src/Examples_Spec.enso b/test/Tests/src/Examples_Spec.enso index 43820cebf97b..34acbc2125bd 100644 --- a/test/Tests/src/Examples_Spec.enso +++ b/test/Tests/src/Examples_Spec.enso @@ -128,3 +128,4 @@ spec = Test.group "Examples" <| match.groups.length . should_equal 5 match.named_groups.size . should_equal 2 +main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index b8558f158eac..0060b1872ce4 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -34,6 +34,7 @@ import project.Data.Text_Spec import project.Data.Time.Spec as Time_Spec import project.Data.Vector_Spec import project.Data.Text.Regex_Spec +import project.Data.Text.Utils_Spec import project.Data.Text.Default_Regex_Engine_Spec import project.Data.Text.Matching_Spec import project.Data.Text.Span_Spec @@ -87,6 +88,7 @@ main = Test.Suite.run_main <| Runtime_Spec.spec Span_Spec.spec Stack_Traces_Spec.spec + Utils_Spec.spec Text_Spec.spec Time_Spec.spec Uri_Spec.spec
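
For readers going through the new tests: `Text.location_of` reports positions in terms of grapheme clusters, while the underlying Java string search works in UTF-16 code units, so every match has to be translated between the two indexings. That is what `Text_Utils.utf16_index_to_grapheme_index` and the batched `utf16_indices_to_grapheme_indices`, exercised by `Utils_Spec`, are for. The standalone Java sketch below shows the general technique using ICU's `BreakIterator` (the same segmenter `Utils_Spec` imports). It is an illustration only, not the code added in `std-bits`; the class name `GraphemeIndexSketch` is hypothetical.

import com.ibm.icu.text.BreakIterator;

// Illustrative sketch: map a UTF-16 code unit offset to the index of the grapheme
// cluster containing it. An offset equal to the string's code unit length maps to
// the total number of grapheme clusters, mirroring the behaviour checked in Utils_Spec.
public final class GraphemeIndexSketch {
  static int utf16IndexToGraphemeIndex(String text, int codeUnitIndex) {
    BreakIterator it = BreakIterator.getCharacterInstance();
    it.setText(text);
    it.first();                  // position the iterator at the first boundary (offset 0)
    int graphemeIndex = 0;
    int boundary = it.next();    // first boundary after the start
    // Count the grapheme cluster boundaries that lie at or before the requested offset.
    while (boundary != BreakIterator.DONE && boundary <= codeUnitIndex) {
      graphemeIndex++;
      boundary = it.next();
    }
    return graphemeIndex;
  }

  public static void main(String[] args) {
    String s = "a\u0301Z";       // 'a' + combining acute + 'Z': 3 code units, 2 clusters
    System.out.println(utf16IndexToGraphemeIndex(s, 0)); // 0
    System.out.println(utf16IndexToGraphemeIndex(s, 1)); // 0, still inside the first cluster
    System.out.println(utf16IndexToGraphemeIndex(s, 2)); // 1
    System.out.println(utf16IndexToGraphemeIndex(s, 3)); // 2, end of the string
  }
}

Note that this uses com.ibm.icu.text.BreakIterator (ICU4J), not java.text.BreakIterator; walking the boundaries one offset at a time is fine for a single lookup, which is why the tests also cover a batched variant for translating many indices in one pass.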
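
The case-insensitive examples above, where locating "straße" inside "MONUMENTENSTRASSE 42" yields a 7-character span for a 6-character term, rely on Unicode case folding rather than plain lowercasing. A minimal sketch of that idea follows, using ICU4J's `UCharacter.foldCase`; the class `CaseFoldSketch` is hypothetical and is not the `CaseFoldedString` added by this PR, which additionally records a code-unit-to-grapheme mapping (see `codeUnitToGraphemeIndex` in `Utils_Spec`) so that a match found in the folded text can be projected back onto the original.

import com.ibm.icu.lang.UCharacter;

// Hypothetical illustration: searching on case-folded text shows why a 6-character
// term can legitimately match a 7-character span of the original haystack.
public final class CaseFoldSketch {
  public static void main(String[] args) {
    String haystack = "MONUMENTENSTRASSE 42";
    String needle = "stra\u00DFe";                               // "straße", 6 characters

    // Full case folding maps the sharp s to "ss", making the two sides comparable.
    String foldedHaystack = UCharacter.foldCase(haystack, true); // "monumentenstrasse 42"
    String foldedNeedle = UCharacter.foldCase(needle, true);     // "strasse", 7 characters

    int start = foldedHaystack.indexOf(foldedNeedle);
    System.out.println(start);                                   // 10
    System.out.println(foldedNeedle.length());                   // 7
    // Mapping the folded range [10, 17) back onto the original haystack covers
    // "STRASSE": a 7-character span, even though the search term has only 6.
  }
}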