Skip to content

Commit

Permalink
Handle case folding in String#compare correctly (crystal-lang#13532)
Browse files Browse the repository at this point in the history
  • Loading branch information
HertzDevil authored and Blacksmoke16 committed Dec 11, 2023
1 parent 8febc9b commit 9ef62d9
Show file tree
Hide file tree
Showing 6 changed files with 217 additions and 128 deletions.
2 changes: 1 addition & 1 deletion scripts/generate_unicode_data.cr
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ body.each_line do |line|
casefold = nil
end
if casefold
while casefold.size < 4
while casefold.size < 3
casefold << 0
end
special_cases_casefold << SpecialCase.new(codepoint, casefold)
Expand Down
8 changes: 4 additions & 4 deletions scripts/unicode_data.ecr
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ module Unicode
end

# Fold case transformation that involve mapping a codepoint
# to multiple codepoints. The maximum transformation is always 4
# codepoints, so we store them all as 4 codepoints and 0 means end.
private class_getter fold_cases : Hash(Int32, {Int32, Int32, Int32, Int32}) do
data = Hash(Int32, {Int32, Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_casefold.size %>)
# to multiple codepoints. The maximum transformation is always 3
# codepoints, so we store them all as 3 codepoints and 0 means end.
private class_getter fold_cases : Hash(Int32, {Int32, Int32, Int32}) do
data = Hash(Int32, {Int32, Int32, Int32}).new(initial_capacity: <%= special_cases_casefold.size %>)
<%- special_cases_casefold.each do |a_case| -%>
put(data, <%= a_case.codepoint %>, <%= a_case.value.join(", ") %>)
<%- end -%>
Expand Down
8 changes: 7 additions & 1 deletion spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2652,7 +2652,7 @@ describe "String" do
end
end

describe "compare" do
describe "#compare" do
it "compares case-sensitive" do
"fo".compare("foo").should eq(-1)
"foo".compare("fo").should eq(1)
Expand Down Expand Up @@ -2685,6 +2685,12 @@ describe "String" do
"abcA".compare("abca", case_insensitive: true).should eq(0)
end

it "compares case-insensitive, multiple chars after case conversion (#4513)" do
"".compare("ffl", case_insensitive: true, options: Unicode::CaseOptions::Fold).should eq(0)
"FFL".compare("", case_insensitive: true, options: Unicode::CaseOptions::Fold).should eq(0)
"".compare("ßs", case_insensitive: true, options: Unicode::CaseOptions::Fold).should eq(0)
end

it "treats invalid code units as replacement char in an otherwise ascii string" do
"\xC0".compare("\xE0", case_insensitive: true).should eq(0)
"\xE0".compare("\xC0", case_insensitive: true).should eq(0)
Expand Down
53 changes: 53 additions & 0 deletions src/crystal/small_deque.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
module Crystal
# :nodoc:
# An internal deque type whose storage is backed by a `StaticArray`. The deque
# capacity is fixed to N and storing more than N elements is an error. Only a
# subset of `::Deque`'s functionality is defined as needed.
struct SmallDeque(T, N)
include Indexable::Mutable(T)

@start = 0
@buffer = uninitialized T[N]
getter size : Int32 = 0

def unsafe_fetch(index : Int) : T
index_to_ptr(index).value
end

def unsafe_put(index : Int, value : T)
index_to_ptr(index).value = value
end

def <<(value : T)
check_capacity_for_insert
unsafe_put(@size, value)
@size += 1
self
end

def shift(&)
if @size == 0
yield
else
ptr = index_to_ptr(0)
value = ptr.value
ptr.clear
@size &-= 1
@start &+= 1
@start &-= N if @start >= N
value
end
end

# precondition: 0 <= index <= N
private def index_to_ptr(index)
index &+= @start
index &-= N if index >= N
@buffer.to_unsafe + index
end

private def check_capacity_for_insert
raise "Out of capacity" if @size >= N
end
end
end
58 changes: 44 additions & 14 deletions src/string.cr
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require "c/stdlib"
require "c/string"
require "crystal/small_deque"
{% unless flag?(:without_iconv) %}
require "crystal/iconv"
{% end %}
Expand Down Expand Up @@ -3116,6 +3117,7 @@ class String
# "abcdef".compare("ABCDEG", case_insensitive: true) # => -1
#
# "heIIo".compare("heııo", case_insensitive: true, options: Unicode::CaseOptions::Turkic) # => 0
# "Baffle".compare("baffle", case_insensitive: true, options: Unicode::CaseOptions::Fold) # => 0
# ```
#
# Case-sensitive only comparison is provided by the comparison operator `#<=>`.
Expand Down Expand Up @@ -3153,23 +3155,51 @@ class String
else
reader1 = Char::Reader.new(self)
reader2 = Char::Reader.new(other)
char1 = reader1.current_char
char2 = reader2.current_char

while reader1.has_next? && reader2.has_next?
comparison = char1.downcase(options) <=> char2.downcase(options)
return comparison.sign unless comparison == 0
# 3 chars maximum for case folding; 2 held in temporary buffers
chars1 = Crystal::SmallDeque(Char, 2).new
chars2 = Crystal::SmallDeque(Char, 2).new

while true
lhs = chars1.shift do
next unless reader1.has_next?
lhs_ = nil
reader1.current_char.downcase(options) do |char|
if lhs_
chars1 << char
else
lhs_ = char
end
end
reader1.next_char
lhs_
end

char1 = reader1.next_char
char2 = reader2.next_char
end
rhs = chars2.shift do
next unless reader2.has_next?
rhs_ = nil
reader2.current_char.downcase(options) do |char|
if rhs_
chars2 << char
else
rhs_ = char
end
end
reader2.next_char
rhs_
end

if reader1.has_next?
1
elsif reader2.has_next?
-1
else
0
case {lhs, rhs}
in {Nil, Nil}
return 0
in {Nil, Char}
return -1
in {Char, Nil}
return 1
in {Char, Char}
comparison = lhs <=> rhs
return comparison.sign unless comparison == 0
end
end
end
end
Expand Down
Loading

0 comments on commit 9ef62d9

Please sign in to comment.