Support case-changes to Annotated{String,Char}s (#54013)

Previously, any case changes to Annotated{String,Char} types triggered "fall back to non-annotated type" non-specialised methods. It would be nice to keep the annotations though, and that can be done so long as we keep track of any potential changes to the number of bytes taken by each character on case changes. This is unusual, but can happen with some letters (e.g. the upper case of 'ſ' is 'S'). To handle this, a helper function annotated_chartransform is introduced. This allows for efficient uppercase/lowercase methods (about 50% overhead in managing the annotation ranges, compared to just transforming a String). The {upper,lower}casefirst and titlecase transformations are much more inefficient with this style of implementation, but not prohibitively so. If somebody has a bright idea, or they emerge as an area deserving of more attention, the performance characteristics can be improved. As a bonus, a specialised textwidth method is implemented to avoid the generic fallback, providing a ~12x performance improvement. To check that annotated_chartransform is accurate, as are the specialised case-transformations, a few million random collections of strings were pre- and post-annotated and checked to be the same in a fuzzing check performed with Supposition.jl. 
```julia
const short_str = Data.Text(Data.Characters(), max_len=20)
const short_strs = Data.Vectors(short_str, max_size=10)
const case_transform_fn = Data.SampledFrom((uppercase, lowercase))

function annot_caseinvariant(f::Function, strs::Vector{String})
    annot_strs = map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]),
                     enumerate(strs))
    f_annot_strs = map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]),
                       enumerate(map(f, strs)))
    pre_join = Base.annotated_chartransform(f, join(annot_strs))
    post_join = join(f_annot_strs)
    pre_join == post_join
end

@check max_examples=1_000_000 annot_caseinvariant(case_transform_fn, short_strs)
```

This helped me determine that in annotated_chartransform the "- 1" was needed in the offset position calculation, and that in the "findlast" calls less than *or equal* was the correct comparison.
JuliaLang · Apr 18, 2024 · 38a9725 · 38a9725
1 parent c741bd3
commit 38a9725
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 1 deletion.
diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl
@@ -399,6 +399,51 @@ Get all annotations of `chr`, in the form of a vector of annotation pairs.
 """
 annotations(c::AnnotatedChar) = c.annotations
 
+## Character transformation helper function, c.f. `unicode.jl`.
+
+"""
+    annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)
+
+Transform every character in `str` with `f`, adjusting annotation regions as
+appropriate. `f` must take one of two forms, either:
+- `f(c::Char) -> Char`, or
+- `f(c::Char, state) -> (Char, state)`.
+
+The two-argument form is used for as long as the current state is not
+`nothing`; the state returned by each call is handed to the next call.
+
+This works by comparing the number of code units of each character before and
+after transforming with `f`, recording and aggregating any differences, then
+applying them to the annotation regions.
+
+Returns an `AnnotatedString{String}` (regardless of the original underlying
+string type of `str`).
+"""
+function annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)
+    outstr = IOBuffer()
+    annots = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
+    # Index, in the original string, of the last code unit consumed so far.
+    bytepos = firstindex(str) - 1
+    # Each `pos => delta` entry records that, once the character ending at
+    # original index `pos` has been written, output indices differ from the
+    # original ones by `delta` (cumulative). The first entry is a sentinel.
+    offsets = [bytepos => 0]
+    for c in str.string
+        oldnb = ncodeunits(c)
+        bytepos += oldnb
+        if isnothing(state)
+            c = f(c)
+        else
+            c, state = f(c, state)
+        end
+        nb = write(outstr, c)
+        if nb != oldnb
+            push!(offsets, bytepos => last(last(offsets)) + nb - oldnb)
+        end
+    end
+    for (region, value) in str.annotations
+        start, stop = first(region), last(region)
+        # A size change recorded at `pos` moves everything strictly after `pos`.
+        # A region start therefore only takes changes that end *before* it
+        # (otherwise a one-code-unit character that grows would push the start
+        # into the middle of its own replacement), whereas a region stop also
+        # takes the change of the character ending exactly on it.
+        start_idx = something(findlast(<(start) ∘ first, offsets), firstindex(offsets))
+        start_offset = last(offsets[start_idx])
+        stop_offset  = last(offsets[findlast(<=(stop) ∘ first, offsets)::Int])
+        push!(annots, ((start + start_offset):(stop + stop_offset), value))
+    end
+    AnnotatedString(String(take!(outstr)), annots)
+end
+
 ## AnnotatedIOBuffer
 
 struct AnnotatedIOBuffer <: AbstractPipe

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -4,7 +4,8 @@
 module Unicode
 
 import Base: show, ==, hash, string, Symbol, isless, length, eltype,
-             convert, isvalid, ismalformed, isoverlong, iterate
+             convert, isvalid, ismalformed, isoverlong, iterate,
+             AnnotatedString, AnnotatedChar, annotated_chartransform
 
 # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
 
@@ -271,6 +272,8 @@ julia> textwidth("March")
 """
 textwidth(s::AbstractString) = mapreduce(textwidth, +, s; init=0)
 
+# Measure the wrapped string directly instead of going through the generic
+# per-character `AbstractString` method above.
+function textwidth(s::AnnotatedString)
+    return textwidth(s.string)
+end
+
 """
     lowercase(c::AbstractChar)
 
@@ -290,6 +293,8 @@ julia> lowercase('Ö')
 lowercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
     T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
 
+# Lowercase the wrapped character and carry its annotations over unchanged.
+# The `annotations` field is read directly (it is what the
+# `annotations(::AnnotatedChar)` accessor returns), so this method does not
+# depend on that accessor being imported into this module.
+lowercase(c::AnnotatedChar) = AnnotatedChar(lowercase(c.char), c.annotations)
+
 """
     uppercase(c::AbstractChar)
 
@@ -309,6 +314,8 @@ julia> uppercase('ê')
 uppercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
     T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
 
+# Uppercase the wrapped character and carry its annotations over unchanged.
+# The `annotations` field is read directly (it is what the
+# `annotations(::AnnotatedChar)` accessor returns), so this method does not
+# depend on that accessor being imported into this module.
+uppercase(c::AnnotatedChar) = AnnotatedChar(uppercase(c.char), c.annotations)
+
 """
     titlecase(c::AbstractChar)
 
@@ -332,6 +339,8 @@ julia> uppercase('ǆ')
 titlecase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
     T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
 
+# Titlecase the wrapped character and carry its annotations over unchanged.
+# The `annotations` field is read directly (it is what the
+# `annotations(::AnnotatedChar)` accessor returns), so this method does not
+# depend on that accessor being imported into this module.
+titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), c.annotations)
+
 ############################################################################
 
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
@@ -606,6 +615,7 @@ julia> uppercase("Julia")
 ```
 """
 uppercase(s::AbstractString) = map(uppercase, s)
+# Annotation-preserving method: `annotated_chartransform` uppercases every
+# character and re-maps the annotation regions to the new code-unit positions.
+function uppercase(s::AnnotatedString)
+    return annotated_chartransform(uppercase, s)
+end
 
 """
     lowercase(s::AbstractString)
@@ -621,6 +631,7 @@ julia> lowercase("STRINGS AND THINGS")
 ```
 """
 lowercase(s::AbstractString) = map(lowercase, s)
+# Annotation-preserving method: `annotated_chartransform` lowercases every
+# character and re-maps the annotation regions to the new code-unit positions.
+function lowercase(s::AnnotatedString)
+    return annotated_chartransform(lowercase, s)
+end
 
 """
     titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
@@ -669,6 +680,23 @@ function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Boo
     return String(take!(b))
 end
 
+# TODO: improve performance characteristics, room for a ~10x improvement.
+function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true)
+    # State threaded through `annotated_chartransform`, updated once per character:
+    #   startword -- whether the next non-separator character begins a word
+    #   state, c0 -- grapheme-break state and previous character for `isgraphemebreak!`
+    #   wordsep, strict -- the keyword arguments, carried along for the callback
+    seed = (; startword = true, state = Ref{Int32}(0),
+            c0 = eltype(s)(zero(UInt32)), wordsep, strict)
+    annotated_chartransform(s, seed) do c, st
+        # Only treat `c` as a separator at a grapheme boundary.
+        atsep = isgraphemebreak!(st.state, st.c0, c) && st.wordsep(c)
+        cnew = if atsep
+            c
+        elseif st.startword
+            titlecase(c)
+        elseif st.strict
+            lowercase(c)
+        else
+            c
+        end
+        # After a separator the next character starts a word; otherwise it does not.
+        cnew, merge(st, (; startword = atsep, c0 = c))
+    end
+end
+
 """
     uppercasefirst(s::AbstractString) -> String
 
@@ -693,6 +721,17 @@ function uppercasefirst(s::AbstractString)
     string(c′, SubString(s, nextind(s, 1)))
 end
 
+# TODO: improve performance characteristics, room for a ~5x improvement.
+function uppercasefirst(s::AnnotatedString)
+    # The boolean state is `true` only while the first character is still pending;
+    # that character is titlecased, every later one is passed through untouched.
+    annotated_chartransform(s, true) do c, isfirst
+        isfirst ? (titlecase(c), false) : (c, false)
+    end
+end
+
 """
     lowercasefirst(s::AbstractString)
 
@@ -715,6 +754,17 @@ function lowercasefirst(s::AbstractString)
     string(c′, SubString(s, nextind(s, 1)))
 end
 
+# TODO: improve performance characteristics, room for a ~5x improvement.
+function lowercasefirst(s::AnnotatedString)
+    # The boolean state is `true` only while the first character is still pending;
+    # that character is lowercased, every later one is passed through untouched.
+    annotated_chartransform(s, true) do c, isfirst
+        isfirst ? (lowercase(c), false) : (c, false)
+    end
+end
+
 ############################################################################
 # iterators for grapheme segmentation
 

diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl
@@ -108,6 +108,33 @@ end
     @test reverse(str2) == Base.AnnotatedString("esac", [(2:3, :label => "oomph")])
 end
 
+@testset "Unicode" begin
+    # Annotate each word over its full code-unit span with its position in the
+    # list, then join the annotated words with '-'.
+    tagged_join(ws) = join([Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)])
+                            for (i, w) in enumerate(ws)], '-')
+    wordlists = (["ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE", "Сodeunıts"],
+                 ["Сodeunıts", "ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE"])
+    for words in wordlists
+        ann_str = tagged_join(words)
+        # Whole-string transforms: transforming the joined annotated string must
+        # match annotating and joining the individually transformed words.
+        for transform in (lowercase, uppercase, titlecase)
+            expected = tagged_join(map(transform, words))
+            actual = transform(ann_str)
+            @test String(expected) == String(actual)
+            @test Base.annotations(expected) == Base.annotations(actual)
+        end
+        # First-character transforms: only the first word is expected to change.
+        for transform in (uppercasefirst, lowercasefirst)
+            expected = tagged_join(vcat(transform(first(words)), words[2:end]))
+            actual = transform(ann_str)
+            @test String(expected) == String(actual)
+            @test Base.annotations(expected) == Base.annotations(actual)
+        end
+    end
+end
+
 @testset "AnnotatedIOBuffer" begin
     aio = Base.AnnotatedIOBuffer()
     # Append-only writing