Skip to content

Commit

Permalink
fix isequal_normalized for combining-char reordering (#52447)
Browse files Browse the repository at this point in the history
Fixes #52408.

(Note that this function was added in Julia 1.8, in #42493.)

In the future it would be good to further optimize this function by
adding a fast path for the common case of strings that are mostly ASCII
characters. Perhaps simply skip ahead to the first byte that doesn't
match before we begin doing decomposition etcetera.

(cherry picked from commit 3b250c7)
  • Loading branch information
stevengj authored and KristofferC committed Dec 23, 2023
1 parent a7dbaad commit 5bf09bb
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 11 deletions.
81 changes: 70 additions & 11 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,19 @@ end

using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK

function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
# Decompose a single `codepoint` with utf8proc, storing the resulting codepoints in
# `dest` starting at index `1 + offset` (entries `1:offset` are left untouched).
# `options` is the utf8proc option bitmask (e.g. `UTF8PROC_DECOMPOSE`).
#
# Returns the non-negative count reported by `utf8proc_decompose_char`. Per the
# utf8proc documentation this can exceed the space available past `offset`, in
# which case the caller is expected to grow `dest` and retry. A negative result is
# handed to `utf8proc_error`.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    # number of slots available in `dest` after the first `offset` entries
    capacity = length(dest) - offset
    # keep `dest` rooted while C holds a raw pointer into it
    nwritten = GC.@preserve dest begin
        ccall(:utf8proc_decompose_char, Int,
              (UInt32, Ptr{UInt32}, Int, Cint, Ptr{Cint}),
              codepoint, pointer(dest, 1 + offset), capacity, options, C_NULL)
    end
    if nwritten < 0
        utf8proc_error(nwritten)
    end
    return nwritten
end

# It would be good to have higher-level accessor functions in utf8proc. Alternatively,
# we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying
# because of the bitfields.
#
# Return the canonical combining class of a codepoint as a `UInt16`.
#
# The value is read as the second 16-bit field of the property record returned by
# `utf8proc_get_property` (the first field is the category, per utf8proc.h).
# Codepoints outside `0x0300:0x10ffff` return `0x0000` without calling into
# utf8proc: nothing below U+0300 has a nonzero combining class, and values above
# U+10FFFF are not codepoints. The lower bound must include U+0300 itself
# (COMBINING GRAVE ACCENT, class 230); otherwise that mark would be treated as a
# non-combining starter and never be reordered.
combining_class(uc::Integer) =
    0x000300 <= uc <= 0x10ffff ?
        unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed characters have no codepoint to look up, so they count as class 0.
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
Expand All @@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
!!! compat "Julia 1.8"
The `isequal_normalized` function was added in Julia 1.8.
# Examples
For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
n = _decompose_char!(c, resize!(d, n), options)
function isequal_normalized(s1::AbstractString, s2::AbstractString;
                            casefold::Bool=false, stripmark::Bool=false,
                            chartransform=identity)
    # one freshly allocated 4-element codepoint scratch buffer per string
    scratch1 = Vector{UInt32}(undef, 4)
    scratch2 = Vector{UInt32}(undef, 4)
    return _isequal_normalized!(s1, s2, scratch1, scratch2, chartransform;
                                casefold=casefold, stripmark=stripmark)
end

# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
function _isequal_normalized!(s1::AbstractString, s2::AbstractString,
d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity;
casefold::Bool=false, stripmark::Bool=false) where {F}
function decompose_next_chars!(state, d, options, s)
local n
offset = 0
@inbounds while true
# read a char and decompose it to d
c = chartransform(UInt32(state[1]))
state = iterate(s, state[2])
if c < 0x80 # fast path for common ASCII case
n = 1 + offset
n > length(d) && resize!(d, 2n)
d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c
break # ASCII characters are all zero combining class
else
while true
n = _decompose_char!(c, d, offset, options) + offset
if n > length(d)
resize!(d, 2n)
continue
end
break
end
end

# decomposed chars must be sorted in ascending order of combining class,
# which means we need to keep fetching chars until we get to non-combining
(iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
offset = n
end
return 1, n, iterate(s, state)

# sort by combining class
if n < 32 # almost always true
for j1 = 2:n # insertion sort
cc = combining_class(d[j1])
iszero(cc) && continue # don't re-order non-combiners
for j2 = j1:-1:2
combining_class(d[j2-1]) ≤ cc && break
d[j2-1], d[j2] = d[j2], d[j2-1]
end
end
else # avoid n^2 complexity in crazy large-n case
j = 1
@views while j < n
j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j)
sort!(d[j:j₀-1], by=combining_class)
j = j₀
end
end

# split return statement to help type inference:
return state === nothing ? (1, n, nothing) : (1, n, state)
end
options = UTF8PROC_DECOMPOSE
casefold && (options |= UTF8PROC_CASEFOLD)
stripmark && (options |= UTF8PROC_STRIPMARK)
i1,i2 = iterate(s1),iterate(s2)
d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
n1 = n2 = 0 # lengths of codepoint buffers
j1 = j2 = 1 # indices in d1, d2
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
67 changes: 67 additions & 0 deletions stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
using Test
using Unicode
using Unicode: normalize, isassigned, julia_chartransform
import Random

Random.seed!(12345)

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand Down Expand Up @@ -455,6 +458,9 @@ end
@test !Base.Unicode.isvalid(Char, overlong_char)
end

# Reference implementation used to cross-check `isequal_normalized`: normalize both
# strings in full (forwarding any keyword options to `normalize`) and compare the
# results. Obvious, but suboptimal.
function isequal_normalized_naive(s1, s2; kws...)
    lhs = normalize(s1; kws...)
    rhs = normalize(s2; kws...)
    return lhs == rhs
end

@testset "Unicode equivalence" begin
@test isequal_normalized("no\u00EBl", "noe\u0308l")
@test !isequal_normalized("no\u00EBl", "noe\u0308l ")
Expand All @@ -466,4 +472,65 @@ end
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)

# issue #52408
# Regression tests for comparing strings whose combining characters appear in a
# different (but canonically equivalent) order. The random cases draw from the
# global RNG, so the generated strings depend on the order of the rand calls below.
@testset "Sorting combining characters" begin
# each example string must compare equal to its own normalized form
for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples
@test isequal_normalized(str, normalize(str))
end

# first codepoint in every possible Unicode combining class
# NOTE(review): U+0300 also appears to be in class 230 and precedes the listed
# U+0301; the list may have been generated with a lower bound of U+0301. Confirm.
# `Vowels` is `vowels` plus their uppercase forms; both are used as base characters.
let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345],
vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)]
# Build `n` groups, each consisting of 1-5 random entries of `Vowels` followed by
# between 0 and `n_cc` random entries of `cc_chars` (repeats allowed).
function randcc(n, n_cc) # random string with lots of combining chars
buf = IOBuffer()
for _ = 1:n
print.(buf, rand(Vowels, rand(1:5)))
print.(buf, Char.(rand(cc_chars, rand(0:n_cc))))
end
return String(take!(buf))
end
# 100 random strings: each must equal itself and its normalized form, and the
# result for (casefolded, normalized) pairs must agree with the naive
# normalize-then-compare implementation, both with and without casefold.
for _ = 1:100
s = randcc(10,10)
ns = normalize(s)
cs = normalize(s, casefold=true)
@test isequal_normalized(s, s)
# print the offending string before the @test below reports the failure
if !isequal_normalized(s, ns)
@show s
end
@test isequal_normalized(s, ns)
@test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns)
@test isequal_normalized(cs, ns, casefold=true) ==
isequal_normalized_naive(cs, ns, casefold=true)
end
# allows up to 1000 combining chars per group
for _ = 1:3
s = randcc(5,1000) # exercise sort!-based fallback
@test isequal_normalized(s, normalize(s))
end
# Build two strings from the same `n` groups. Each group has identical base
# characters in both strings, followed by the same random subset of `cc_chars`
# shuffled independently for each string. `p` is the inclusion probability
# passed to `Random.randsubseq`.
function randcc2(n, n_cc) # 2 strings with equivalent reordered combiners
buf1 = IOBuffer()
buf2 = IOBuffer()
p = n_cc / length(cc_chars)
for _ = 1:n
a = join(rand(Vowels, rand(1:5)))
print(buf1, a)
print(buf2, a)

# chars from distinct combining classes
# are canonically equivalent when re-ordered
c = Random.randsubseq(cc_chars, p)
# `c` is shuffled in place twice: once per output string
print.(buf1, Char.(Random.shuffle!(c)))
print.(buf2, Char.(Random.shuffle!(c)))
end
return String(take!(buf1)), String(take!(buf2))
end
# the two reordered strings must always compare equal
for _ = 1:100
s1, s2 = randcc2(10,10)
@test isequal_normalized(s1, s2)
end
end

# combining characters in the same class are inequivalent if re-ordered:
@test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334")
end
end

0 comments on commit 5bf09bb

Please sign in to comment.