Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize String#valid_encoding? #12145

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2848,7 +2848,67 @@ describe "String" do

it "valid_encoding?" do
  # Builds a String from the given raw bytes and reports whether it is
  # considered valid UTF-8.
  valid = ->(raw : Bytes) { String.new(raw).valid_encoding? }

  "hello".valid_encoding?.should be_true
  valid.call(Bytes[255, 0]).should be_false
  "hello\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}".valid_encoding?.should be_true

  # a continuation byte (0x80..0xBF) cannot begin a sequence
  valid.call(Bytes[0x80]).should be_false
  valid.call(Bytes[0x8F]).should be_false
  valid.call(Bytes[0x90]).should be_false
  valid.call(Bytes[0x9F]).should be_false
  valid.call(Bytes[0xA0]).should be_false
  valid.call(Bytes[0xAF]).should be_false

  # 2-byte sequences: truncated, or followed by a non-continuation byte
  valid.call(Bytes[0xC2]).should be_false
  valid.call(Bytes[0xC2, 0x00]).should be_false
  valid.call(Bytes[0xC2, 0xC2]).should be_false

  # 2-byte sequences: overlong forms, then the first valid one
  valid.call(Bytes[0xC0, 0x80]).should be_false
  valid.call(Bytes[0xC1, 0xBF]).should be_false
  valid.call(Bytes[0xC2, 0x80]).should be_true

  # 3-byte sequences: truncated, or followed by a non-continuation byte
  valid.call(Bytes[0xE1]).should be_false
  valid.call(Bytes[0xE1, 0x00]).should be_false
  valid.call(Bytes[0xE1, 0xC2]).should be_false
  valid.call(Bytes[0xE1, 0x80]).should be_false
  valid.call(Bytes[0xE1, 0x80, 0x00]).should be_false
  valid.call(Bytes[0xE1, 0x80, 0xC2]).should be_false

  # 3-byte sequences: overlong forms, then the first valid one
  valid.call(Bytes[0xE0, 0x80, 0x80]).should be_false
  valid.call(Bytes[0xE0, 0x9F, 0xBF]).should be_false
  valid.call(Bytes[0xE0, 0xA0, 0x80]).should be_true

  # surrogate range: the neighbours on both sides are valid, the range itself is not
  valid.call(Bytes[0xED, 0x9F, 0xBF]).should be_true
  valid.call(Bytes[0xED, 0xA0, 0x80]).should be_false
  valid.call(Bytes[0xED, 0xBF, 0xBF]).should be_false
  valid.call(Bytes[0xEE, 0x80, 0x80]).should be_true

  # 4-byte sequences: truncated, or followed by a non-continuation byte
  valid.call(Bytes[0xF1]).should be_false
  valid.call(Bytes[0xF1, 0x00]).should be_false
  valid.call(Bytes[0xF1, 0xC2]).should be_false
  valid.call(Bytes[0xF1, 0x80]).should be_false
  valid.call(Bytes[0xF1, 0x80, 0x00]).should be_false
  valid.call(Bytes[0xF1, 0x80, 0xC2]).should be_false
  valid.call(Bytes[0xF1, 0x80, 0x80]).should be_false
  valid.call(Bytes[0xF1, 0x80, 0x80, 0x00]).should be_false
  valid.call(Bytes[0xF1, 0x80, 0x80, 0xC2]).should be_false

  # 4-byte sequences: overlong forms, then the first valid one
  valid.call(Bytes[0xF0, 0x80, 0x80, 0x80]).should be_false
  valid.call(Bytes[0xF0, 0x8F, 0xBF, 0xBF]).should be_false
  valid.call(Bytes[0xF0, 0x90, 0x80, 0x80]).should be_true

  # 4-byte sequences: U+10FFFF is the last valid code point; larger lead bytes never are
  valid.call(Bytes[0xF4, 0x8F, 0xBF, 0xBF]).should be_true
  valid.call(Bytes[0xF4, 0x90, 0x80, 0x80]).should be_false
  valid.call(Bytes[0xF5]).should be_false
  valid.call(Bytes[0xF8]).should be_false
  valid.call(Bytes[0xFF]).should be_false
end

it "scrubs" do
Expand Down
7 changes: 1 addition & 6 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -4946,12 +4946,7 @@ class String
# Returns `true` if this String is encoded correctly
# according to the UTF-8 encoding.
#
# The check is delegated to `Unicode.valid?`, which is handed the bytes of
# this string via `to_slice`. No separate `Char::Reader` pass is made, so
# the string is only scanned once.
def valid_encoding? : Bool
  Unicode.valid?(to_slice)
end

# Returns a String where bytes that are invalid in the
Expand Down
103 changes: 103 additions & 0 deletions src/unicode/unicode.cr
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,109 @@ module Unicode
Fold
end

# Number of bytes `Unicode.valid?` consumes per iteration of its unrolled
# main loop; the DFA error state is tested once per chunk of this size.
private UNROLL = 64

# :nodoc:
# Checks whether *bytes* form a correctly encoded UTF-8 string.
#
# Validation is driven by a shift-based DFA, following
# https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725.
# A state transition is a single table load plus a shift, which keeps the
# loop very tight; `Char::Reader` is not involved at all. The price is that
# nothing besides validity is computed (neither the code points nor their
# count), because handling invalid byte sequences inside the loop would
# slow it down significantly.
def self.valid?(bytes : Bytes) : Bool
  dfa = UTF8_ENCODING_DFA.to_unsafe
  cursor = bytes.to_unsafe
  limit = cursor + bytes.size
  state = 0_u64

  # Main loop: step through UNROLL bytes back to back. The error state
  # (value 6) is never left once entered, so testing for it once per chunk
  # is sufficient.
  until cursor + UNROLL > limit
    {% for offset in 0...UNROLL %}
      state = dfa[cursor[{{ offset }}]].unsafe_shr(state & 0x3F)
    {% end %}
    return false if state & 0x3F == 6
    cursor += UNROLL
  end

  # Tail: fewer than UNROLL bytes remain, handle them one at a time.
  until cursor >= limit
    state = dfa[cursor.value].unsafe_shr(state & 0x3F)
    return false if state & 0x3F == 6
    cursor += 1
  end

  # The input is valid only when the DFA is back in its initial (accepting)
  # state, i.e. the bytes do not stop in the middle of a sequence.
  state & 0x3F == 0
end

private UTF8_ENCODING_DFA = begin
  rows = Array(UInt64).new(256)

  # One packed entry is appended per byte value, in order from 0x00 to 0xFF.
  # In each `dfa_state(...)` call the argument at position N is the state
  # reached when the byte is read in state N; state 1 is the error state.
  #
  # The transitions below, shown with the error state and the never-valid
  # bytes (0xC0..0xC1, 0xF5..0xFF) left out, and `_` standing for "error":
  #
  #   accepted (initial state)
  #   | 1 continuation byte left
  #   | | 2 continuation bytes left
  #   | | | E0-?? ??; disallow overlong encodings up to U+07FF
  #   | | | | ED-?? ??; disallow surrogate code points
  #   | | | | | F0-?? ?? ??; disallow overlong encodings up to U+FFFF
  #   | | | | | | 3 continuation bytes left
  #   | | | | | | | F4-?? ?? ??; disallow codepoints above U+10FFFF
  #   v v v v v v v v
  #
  #            | 0 2 3 4 5 6 7 8
  # -----------+----------------
  # 0x00..0x7F | 0 _ _ _ _ _ _ _
  # 0x80..0x8F | _ 0 2 _ 2 _ 3 3
  # 0x90..0x9F | _ 0 2 _ 2 3 3 _
  # 0xA0..0xBF | _ 0 2 2 _ 3 3 _
  # 0xC2..0xDF | 2 _ _ _ _ _ _ _
  # 0xE0..0xE0 | 4 _ _ _ _ _ _ _
  # 0xE1..0xEC | 3 _ _ _ _ _ _ _
  # 0xED..0xED | 5 _ _ _ _ _ _ _
  # 0xEE..0xEF | 3 _ _ _ _ _ _ _
  # 0xF0..0xF0 | 6 _ _ _ _ _ _ _
  # 0xF1..0xF3 | 7 _ _ _ _ _ _ _
  # 0xF4..0xF4 | 8 _ _ _ _ _ _ _

  {% for byte in 0x00..0x7F %} put1(rows, dfa_state(0, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0x80..0x8F %} put1(rows, dfa_state(1, 1, 0, 2, 1, 2, 1, 3, 3)); {% end %}
  {% for byte in 0x90..0x9F %} put1(rows, dfa_state(1, 1, 0, 2, 1, 2, 3, 3, 1)); {% end %}
  {% for byte in 0xA0..0xBF %} put1(rows, dfa_state(1, 1, 0, 2, 2, 1, 3, 3, 1)); {% end %}
  {% for byte in 0xC0..0xC1 %} put1(rows, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xC2..0xDF %} put1(rows, dfa_state(2, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xE0..0xE0 %} put1(rows, dfa_state(4, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xE1..0xEC %} put1(rows, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xED..0xED %} put1(rows, dfa_state(5, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xEE..0xEF %} put1(rows, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xF0..0xF0 %} put1(rows, dfa_state(6, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xF1..0xF3 %} put1(rows, dfa_state(7, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xF4..0xF4 %} put1(rows, dfa_state(8, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
  {% for byte in 0xF5..0xFF %} put1(rows, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}

  rows
end

# Appends *value* to the end of *array*; the result of the append itself is
# discarded.
private def self.put1(array : Array, value) : Nil
  array.push(value)
end

# Packs one row of DFA transitions into a single `UInt64` value.
#
# The argument at position `i` is the state reached from state `i`. It is
# stored multiplied by 6, starting at bit `i * 6`, so shifting the packed
# value right by the current state's stored value leaves the next state's
# stored value in the low 6 bits.
#
# When the compiler version is at least 1.1.0 the value is computed inside
# the macro and emitted as a number; otherwise the macro emits the
# equivalent `|`-joined shift expression as source text.
#
# TODO: remove the workaround for 1.0.0 eventually (needed until #10713)
private macro dfa_state(*transitions)
{% if compare_versions(Crystal::VERSION, "1.1.0") >= 0 %}
{% x = 0_u64 %}
{% for tr, i in transitions %}
{% x |= (1_u64 << (i * 6)) * tr * 6 %}
{% end %}
{{ x }}
{% else %}
{% x = [] of Nil %}
{% for tr, i in transitions %}
{% x << "(#{tr * 6}_u64 << #{i * 6})" %}
{% end %}
{{ x.join(" | ").id }}
{% end %}
end

# :nodoc:
def self.upcase(char : Char, options : CaseOptions) : Char
result = check_upcase_ascii(char, options)
Expand Down