diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
index 1aac3c2031f8..3aac5331192b 100644
--- a/spec/std/string_spec.cr
+++ b/spec/std/string_spec.cr
@@ -2848,7 +2848,67 @@ describe "String" do
 
   it "valid_encoding?" do
     "hello".valid_encoding?.should be_true
-    String.new(Bytes[255, 0]).valid_encoding?.should be_false
+    "hello\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}".valid_encoding?.should be_true # first/last code points of the 2-, 3- and 4-byte forms
+
+    # non-starters: continuation bytes (0x80..0xBF) cannot begin a sequence
+    String.new(Bytes[0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0x8F]).valid_encoding?.should be_false
+    String.new(Bytes[0x90]).valid_encoding?.should be_false
+    String.new(Bytes[0x9F]).valid_encoding?.should be_false
+    String.new(Bytes[0xA0]).valid_encoding?.should be_false
+    String.new(Bytes[0xAF]).valid_encoding?.should be_false
+
+    # incomplete, 2-byte: lead byte with a missing or invalid continuation byte
+    String.new(Bytes[0xC2]).valid_encoding?.should be_false
+    String.new(Bytes[0xC2, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xC2, 0xC2]).valid_encoding?.should be_false
+
+    # overlong, 2-byte: 0xC0 and 0xC1 are never valid; 0xC2 0x80 (U+0080) is the first valid sequence
+    String.new(Bytes[0xC0, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xC1, 0xBF]).valid_encoding?.should be_false
+    String.new(Bytes[0xC2, 0x80]).valid_encoding?.should be_true
+
+    # incomplete, 3-byte: sequence cut short, or interrupted by an ASCII byte or another lead byte
+    String.new(Bytes[0xE1]).valid_encoding?.should be_false
+    String.new(Bytes[0xE1, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xE1, 0xC2]).valid_encoding?.should be_false
+    String.new(Bytes[0xE1, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xE1, 0x80, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xE1, 0x80, 0xC2]).valid_encoding?.should be_false
+
+    # overlong, 3-byte: after 0xE0 the second byte must be 0xA0..0xBF; 0xE0 0xA0 0x80 is U+0800
+    String.new(Bytes[0xE0, 0x80, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xE0, 0x9F, 0xBF]).valid_encoding?.should be_false
+    String.new(Bytes[0xE0, 0xA0, 0x80]).valid_encoding?.should be_true
+
+    # surrogates: U+D800..U+DFFF (0xED followed by 0xA0..0xBF) are rejected; U+D7FF and U+E000 are fine
+    String.new(Bytes[0xED, 0x9F, 0xBF]).valid_encoding?.should be_true
+    String.new(Bytes[0xED, 0xA0, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xED, 0xBF, 0xBF]).valid_encoding?.should be_false
+    String.new(Bytes[0xEE, 0x80, 0x80]).valid_encoding?.should be_true
+
+    # incomplete, 4-byte: sequence cut short, or interrupted by an ASCII byte or another lead byte
+    String.new(Bytes[0xF1]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0xC2]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80, 0xC2]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80, 0x80, 0x00]).valid_encoding?.should be_false
+    String.new(Bytes[0xF1, 0x80, 0x80, 0xC2]).valid_encoding?.should be_false
+
+    # overlong, 4-byte: after 0xF0 the second byte must be 0x90..0xBF; 0xF0 0x90 0x80 0x80 is U+10000
+    String.new(Bytes[0xF0, 0x80, 0x80, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xF0, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_false
+    String.new(Bytes[0xF0, 0x90, 0x80, 0x80]).valid_encoding?.should be_true
+
+    # upper boundary, 4-byte: U+10FFFF (0xF4 0x8F 0xBF 0xBF) is the last valid code point; 0xF5..0xFF never appear
+    String.new(Bytes[0xF4, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_true
+    String.new(Bytes[0xF4, 0x90, 0x80, 0x80]).valid_encoding?.should be_false
+    String.new(Bytes[0xF5]).valid_encoding?.should be_false
+    String.new(Bytes[0xF8]).valid_encoding?.should be_false
+    String.new(Bytes[0xFF]).valid_encoding?.should be_false
   end
 
   it "scrubs" do
diff --git a/src/string.cr b/src/string.cr
index e9dd8ab19347..4a1bedcc8c45 100644
--- a/src/string.cr
+++ b/src/string.cr
@@ -4946,12 +4946,7 @@ class String
   # Returns `true` if this String is encoded correctly
   # according to the UTF-8 encoding.
   def valid_encoding? : Bool
-    reader = Char::Reader.new(self)
-    while reader.has_next?
-      return false if reader.error
-      reader.next_char
-    end
-    true
+    Unicode.valid?(to_slice) # validate the raw bytes with the table-driven DFA instead of decoding each char
   end
 
   # Returns a String where bytes that are invalid in the
diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr
index fde457404f49..cc93abd239ba 100644
--- a/src/unicode/unicode.cr
+++ b/src/unicode/unicode.cr
@@ -23,6 +23,109 @@ module Unicode
     Fold
   end
 
+  private UNROLL = 64 # number of bytes consumed per iteration of the unrolled main loop in `valid?`
+
+  # :nodoc:
+  # Returns whether the given *bytes* refer to a correctly encoded UTF-8 string.
+  #
+  # The implementation here uses a shift-based DFA based on
+  # https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725.
+  # This loop is very tight and bypasses `Char::Reader` completely. The downside
+  # is that it does not compute anything else, such as the code points
+  # themselves or their count, because the required handling for invalid byte
+  # sequences would significantly slow down the loop.
+  def self.valid?(bytes : Bytes) : Bool
+    state = 0_u64 # low 6 bits hold the current state as a bit offset (state index * 6); 0 = accepted
+    table = UTF8_ENCODING_DFA.to_unsafe
+    s = bytes.to_unsafe # read cursor
+    e = s + bytes.size  # one past the last byte
+
+    while s + UNROLL <= e # main loop: UNROLL bytes per iteration, a single error check per chunk
+      {% for i in 0...UNROLL %}
+        state = table[s[{{ i }}]].unsafe_shr(state & 0x3F)
+      {% end %}
+      return false if state & 0x3F == 6 # 6 = error state (index 1); all its transitions lead back to it, so one check per chunk suffices
+      s += UNROLL
+    end
+
+    while s < e # tail: fewer than UNROLL bytes remain
+      state = table[s.value].unsafe_shr(state & 0x3F) # shifting by the current offset moves the next state's field into the low 6 bits
+      return false if state & 0x3F == 6
+      s += 1
+    end
+
+    state & 0x3F == 0 # valid only when the input ends in the accepted state, i.e. not in the middle of a sequence
+  end
+
+  private UTF8_ENCODING_DFA = begin
+    x = Array(UInt64).new(256)
+
+    # The DFA transition table encoded below, with the error state (1) and unused bytes hidden.
+    # Columns are current states, cells are next states, and `_` is a transition to the error state:
+    #              accepted (initial state)
+    #              | 1 continuation byte left
+    #              | | 2 continuation bytes left
+    #              | | | E0-?? ??; disallow overlong encodings up to U+07FF
+    #              | | | | ED-?? ??; disallow surrogate pairs
+    #              | | | | | F0-?? ?? ??; disallow overlong encodings up to U+FFFF
+    #              | | | | | | 3 continuation bytes left
+    #              | | | | | | | F4-?? ?? ??; disallow codepoints above U+10FFFF
+    #              v v v v v v v v
+    #
+    #            | 0 2 3 4 5 6 7 8
+    # -----------+----------------
+    # 0x00..0x7F | 0 _ _ _ _ _ _ _
+    # 0x80..0x8F | _ 0 2 _ 2 _ 3 3
+    # 0x90..0x9F | _ 0 2 _ 2 3 3 _
+    # 0xA0..0xBF | _ 0 2 2 _ 3 3 _
+    # 0xC2..0xDF | 2 _ _ _ _ _ _ _
+    # 0xE0..0xE0 | 4 _ _ _ _ _ _ _
+    # 0xE1..0xEC | 3 _ _ _ _ _ _ _
+    # 0xED..0xED | 5 _ _ _ _ _ _ _
+    # 0xEE..0xEF | 3 _ _ _ _ _ _ _
+    # 0xF0..0xF0 | 6 _ _ _ _ _ _ _
+    # 0xF1..0xF3 | 7 _ _ _ _ _ _ _
+    # 0xF4..0xF4 | 8 _ _ _ _ _ _ _
+
+    {% for ch in 0x00..0x7F %} put1(x, dfa_state(0, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0x80..0x8F %} put1(x, dfa_state(1, 1, 0, 2, 1, 2, 1, 3, 3)); {% end %}
+    {% for ch in 0x90..0x9F %} put1(x, dfa_state(1, 1, 0, 2, 1, 2, 3, 3, 1)); {% end %}
+    {% for ch in 0xA0..0xBF %} put1(x, dfa_state(1, 1, 0, 2, 2, 1, 3, 3, 1)); {% end %}
+    {% for ch in 0xC0..0xC1 %} put1(x, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xC2..0xDF %} put1(x, dfa_state(2, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xE0..0xE0 %} put1(x, dfa_state(4, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xE1..0xEC %} put1(x, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xED..0xED %} put1(x, dfa_state(5, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xEE..0xEF %} put1(x, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF0..0xF0 %} put1(x, dfa_state(6, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF1..0xF3 %} put1(x, dfa_state(7, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF4..0xF4 %} put1(x, dfa_state(8, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF5..0xFF %} put1(x, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+
+    x
+  end
+
+  private def self.put1(array : Array, value) : Nil # appends *value*; called once per byte value by the macro loops in UTF8_ENCODING_DFA
+    array << value
+  end
+
+  # Packs one next-state index per current state into a UInt64: argument i is stored as `tr * 6` at bit `i * 6`. TODO: remove the workaround for 1.0.0 eventually (needed until #10713)
+  private macro dfa_state(*transitions)
+    {% if compare_versions(Crystal::VERSION, "1.1.0") >= 0 %}
+      {% x = 0_u64 %}
+      {% for tr, i in transitions %}
+        {% x |= (1_u64 << (i * 6)) * tr * 6 %}
+      {% end %}
+      {{ x }}
+    {% else %}
+      {% x = [] of Nil %}
+      {% for tr, i in transitions %}
+        {% x << "(#{tr * 6}_u64 << #{i * 6})" %}
+      {% end %}
+      {{ x.join(" | ").id }}
+    {% end %}
+  end
+
   # :nodoc:
   def self.upcase(char : Char, options : CaseOptions) : Char
     result = check_upcase_ascii(char, options)