Char::Reader: decode the previous character by validating UTF-8 backwards.

Adds the private helper `decode_char_before`, which reads the byte just before
`pos` and walks towards the lead byte (at most four bytes), and rewrites
`decode_previous_char` on top of it; `@pos` is now moved back by the decoded
width. On an invalid sequence the helper yields Char::REPLACEMENT with width 1
and the byte just before `pos` as the error. `invalid_byte_sequence` switches
from `to_u8` to `to_u8!`. The specs gain `file:`/`line:` forwarding plus
end-of-string cases for valid and invalid byte sequences.

diff --git a/spec/std/char/reader_spec.cr b/spec/std/char/reader_spec.cr
index 9b214c4ea5a6..55409ddb0963 100644
--- a/spec/std/char/reader_spec.cr
+++ b/spec/std/char/reader_spec.cr
@@ -1,11 +1,31 @@
 require "spec"
 require "char/reader"
 
-private def assert_invalid_byte_sequence(bytes)
+private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
   reader = Char::Reader.new(String.new bytes)
-  reader.current_char.should eq(Char::REPLACEMENT)
-  reader.current_char_width.should eq(1)
-  reader.error.should eq(bytes[0])
+  reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
+  reader.current_char_width.should eq(1), file: file, line: line
+  reader.error.should eq(bytes[0]), file: file, line: line
+end
+
+private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
+  str = String.new bytes
+  reader = Char::Reader.new(str, pos: bytes.size)
+  reader.previous_char
+  reader.current_char.should eq(str[0]), file: file, line: line
+  reader.current_char_width.should eq(bytes.size), file: file, line: line
+  reader.pos.should eq(0), file: file, line: line
+  reader.error.should be_nil, file: file, line: line
+end
+
+private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__)
+  str = String.new bytes
+  reader = Char::Reader.new(str, pos: bytes.size)
+  reader.previous_char
+  reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
+  reader.current_char_width.should eq(1), file: file, line: line
+  reader.pos.should eq(bytes.size - 1), file: file, line: line
+  reader.error.should eq(bytes[-1]), file: file, line: line
 end
 
 describe "Char::Reader" do
@@ -242,4 +262,112 @@ describe "Char::Reader" do
   it "errors if fourth_byte is out of bounds" do
     assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
   end
+
+  describe "#previous_char" do
+    it "reads on valid UTF-8" do
+      assert_reads_at_end Bytes[0x00]
+      assert_reads_at_end Bytes[0x7f]
+
+      assert_reads_at_end Bytes[0xc2, 0x80]
+      assert_reads_at_end Bytes[0xc2, 0xbf]
+      assert_reads_at_end Bytes[0xdf, 0x80]
+      assert_reads_at_end Bytes[0xdf, 0xbf]
+
+      assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
+      assert_reads_at_end Bytes[0xed, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
+      assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]
+
+      assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
+      assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
+      assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
+      assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]
+
+      assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]
+
+      assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
+      assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
+    end
+
+    it "errors on invalid UTF-8" do
+      assert_invalid_byte_sequence_at_end Bytes[0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0]
+      assert_invalid_byte_sequence_at_end Bytes[0xff]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80]
+
+      assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80]
+      assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80]
+    end
+  end
 end
diff --git a/src/char/reader.cr b/src/char/reader.cr
index 1624a35f4488..bde6de53b4c3 100644
--- a/src/char/reader.cr
+++ b/src/char/reader.cr
@@ -331,7 +331,7 @@ struct Char
     end
 
     private macro invalid_byte_sequence
-      return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8
+      return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8!
     end
 
     @[AlwaysInline]
@@ -343,15 +343,92 @@ struct Char
       end
     end
 
-    private def decode_previous_char
-      return if @pos == 0
+    # The reverse UTF-8 DFA transition table for reference: (contrast with
+    # `Unicode::UTF8_ENCODING_DFA`)
+    #
+    #              accepted (initial state)
+    #              | 1 continuation byte
+    #              | | 2 continuation bytes; disallow overlong encodings up to U+07FF
+    #              | | | 2 continuation bytes; disallow surrogate pairs
+    #              | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF
+    #              | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF
+    #              v v v v v v
+    #
+    #            | 0 2 3 4 5 6
+    # -----------+------------
+    # 0x00..0x7F | 0 _ _ _ _ _
+    # 0x80..0x8F | 2 3 5 5 _ _
+    # 0x90..0x9F | 2 3 6 6 _ _
+    # 0xA0..0xBF | 2 4 6 6 _ _
+    # 0xC2..0xDF | _ 0 _ _ _ _
+    # 0xE0..0xE0 | _ _ _ 0 _ _
+    # 0xE1..0xEC | _ _ 0 0 _ _
+    # 0xED..0xED | _ _ 0 _ _ _
+    # 0xEE..0xEF | _ _ 0 0 _ _
+    # 0xF0..0xF0 | _ _ _ _ _ 0
+    # 0xF1..0xF3 | _ _ _ _ 0 0
+    # 0xF4..0xF4 | _ _ _ _ 0 _
+    private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->)
+      fourth = byte_at(pos - 1)
+      if fourth <= 0x7f
+        return yield fourth, 1, nil
+      end
 
-      while @pos > 0
-        @pos -= 1
-        break if (byte_at(@pos) & 0xC0) != 0x80
+      if fourth > 0xbf || pos < 2
+        invalid_byte_sequence_before
       end
-      decode_char_at(@pos) do |code_point, width, error|
+
+      third = byte_at(pos - 2)
+      if 0xc2 <= third <= 0xdf
+        return yield (third << 6) &+ (fourth &- 0x3080), 2, nil
+      end
+
+      if (third & 0xc0) != 0x80 || pos < 3
+        invalid_byte_sequence_before
+      end
+
+      second = byte_at(pos - 3)
+      if second & 0xf0 == 0xe0
+        if second == 0xe0 && third <= 0x9f
+          invalid_byte_sequence_before
+        end
+
+        if second == 0xed && third >= 0xa0
+          invalid_byte_sequence_before
+        end
+
+        return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil
+      end
+
+      if (second & 0xc0) != 0x80 || pos < 4
+        invalid_byte_sequence_before
+      end
+
+      first = byte_at(pos - 4)
+      if second <= 0x8f
+        unless 0xf1 <= first <= 0xf4
+          invalid_byte_sequence_before
+        end
+      else
+        unless 0xf0 <= first <= 0xf3
+          invalid_byte_sequence_before
+        end
+      end
+
+      return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
+    end
+
+    private macro invalid_byte_sequence_before
+      return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8!
+    end
+
+    @[AlwaysInline]
+    private def decode_previous_char
+      return nil if @pos == 0
+
+      decode_char_before(@pos) do |code_point, width, error|
         @current_char_width = width
+        @pos -= width
         @error = error
         @current_char = code_point.unsafe_chr
       end