From 8bc4de0b61439d12616f47bd81a1ef5cd805f0d6 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Thu, 23 Nov 2023 20:37:12 +0800 Subject: [PATCH 1/2] Fix invalid UTF-8 handling in `Char::Reader#previous_char` --- spec/std/char/reader_spec.cr | 133 +++++++++++++++++++++++++++++++++-- src/char/reader.cr | 91 ++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 11 deletions(-) diff --git a/spec/std/char/reader_spec.cr b/spec/std/char/reader_spec.cr index 6197a3d4b6c8..cc812648b9b2 100644 --- a/spec/std/char/reader_spec.cr +++ b/spec/std/char/reader_spec.cr @@ -1,11 +1,28 @@ require "spec" require "char/reader" -private def assert_invalid_byte_sequence(bytes) +private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__) reader = Char::Reader.new(String.new bytes) - reader.current_char.should eq(Char::REPLACEMENT) - reader.current_char_width.should eq(1) - reader.error.should eq(bytes[0]) + reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line + reader.current_char_width.should eq(1), file: file, line: line + reader.error.should eq(bytes[0]), file: file, line: line +end + +private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__) + str = String.new bytes + reader = Char::Reader.new(at_end: str) + reader.current_char.should eq(str[0]), file: file, line: line + reader.current_char_width.should eq(bytes.size), file: file, line: line + reader.pos.should eq(0), file: file, line: line + reader.error.should be_nil, file: file, line: line +end + +private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__) + reader = Char::Reader.new(at_end: String.new bytes) + reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line + reader.current_char_width.should eq(1), file: file, line: line + reader.pos.should eq(bytes.size - 1), file: file, line: line + reader.error.should eq(bytes[-1]), file: file, line: line end describe "Char::Reader" do @@ -193,4 +210,112 @@ describe "Char::Reader" do it "errors if fourth_byte is out of bounds" do assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0] end + + describe "#previous_char / at_end" do + it "reads on valid UTF-8" do + assert_reads_at_end Bytes[0x00] + assert_reads_at_end Bytes[0x7f] + + assert_reads_at_end Bytes[0xc2, 0x80] + assert_reads_at_end Bytes[0xc2, 0xbf] + assert_reads_at_end Bytes[0xdf, 0x80] + assert_reads_at_end Bytes[0xdf, 0xbf] + + assert_reads_at_end Bytes[0xe1, 0x80, 0x80] + assert_reads_at_end Bytes[0xe1, 0x80, 0xbf] + assert_reads_at_end Bytes[0xe1, 0x9f, 0x80] + assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf] + assert_reads_at_end Bytes[0xed, 0x80, 0x80] + assert_reads_at_end Bytes[0xed, 0x80, 0xbf] + assert_reads_at_end Bytes[0xed, 0x9f, 0x80] + assert_reads_at_end Bytes[0xed, 0x9f, 0xbf] + assert_reads_at_end Bytes[0xef, 0x80, 0x80] + assert_reads_at_end Bytes[0xef, 0x80, 0xbf] + assert_reads_at_end Bytes[0xef, 0x9f, 0x80] + assert_reads_at_end Bytes[0xef, 0x9f, 0xbf] + + assert_reads_at_end Bytes[0xe0, 0xa0, 0x80] + assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf] + assert_reads_at_end Bytes[0xe0, 0xbf, 0x80] + assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf] + assert_reads_at_end Bytes[0xe1, 0xa0, 0x80] + assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf] + assert_reads_at_end Bytes[0xe1, 0xbf, 0x80] + assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf] + assert_reads_at_end Bytes[0xef, 0xa0, 0x80] + assert_reads_at_end Bytes[0xef, 0xa0, 0xbf] + assert_reads_at_end Bytes[0xef, 0xbf, 0x80] + assert_reads_at_end Bytes[0xef, 0xbf, 0xbf] + + assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80] + assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80] + assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80] + assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80] + + assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80] + assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80] + assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80] + assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80] + end + + it "errors on invalid UTF-8" do + assert_invalid_byte_sequence_at_end Bytes[0x80] + assert_invalid_byte_sequence_at_end Bytes[0xbf] + assert_invalid_byte_sequence_at_end Bytes[0xc0] + assert_invalid_byte_sequence_at_end Bytes[0xff] + + assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80] + + assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf] + assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80] + + assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf] + assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80] + + assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80] + + assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80] + assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80] + end + end end diff --git a/src/char/reader.cr b/src/char/reader.cr index cb307117cdbb..a108e16b4e19 100644 --- a/src/char/reader.cr +++ b/src/char/reader.cr @@ -242,7 +242,7 @@ struct Char end private macro invalid_byte_sequence - return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8 + return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8! end @[AlwaysInline] @@ -254,15 +254,92 @@ struct Char end end - private def decode_previous_char - return if @pos == 0 + # The reverse UTF-8 DFA transition table for reference: (contrast with + # `Unicode::UTF8_ENCODING_DFA`) + # + # accepted (initial state) + # | 1 continuation byte + # | | 2 continuation bytes; disallow overlong encodings up to U+07FF + # | | | 2 continuation bytes; disallow surrogate pairs + # | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF + # | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF + # v v v v v v + # + # | 0 2 3 4 5 6 + # -----------+------------ + # 0x00..0x7F | 0 _ _ _ _ _ + # 0x80..0x8F | 2 3 5 5 _ _ + # 0x90..0x9F | 2 3 6 6 _ _ + # 0xA0..0xBF | 2 4 6 6 _ _ + # 0xC2..0xDF | _ 0 _ _ _ _ + # 0xE0..0xE0 | _ _ _ 0 _ _ + # 0xE1..0xEC | _ _ 0 0 _ _ + # 0xED..0xED | _ _ 0 _ _ _ + # 0xEE..0xEF | _ _ 0 0 _ _ + # 0xF0..0xF0 | _ _ _ _ _ 0 + # 0xF1..0xF3 | _ _ _ _ 0 0 + # 0xF4..0xF4 | _ _ _ _ 0 _ + private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->) + fourth = byte_at(pos - 1) + if fourth <= 0x7f + return yield fourth, 1, nil + end - while @pos > 0 - @pos -= 1 - break if (byte_at(@pos) & 0xC0) != 0x80 + if fourth > 0xbf || pos < 2 + invalid_byte_sequence_before end - decode_char_at(@pos) do |code_point, width, error| + + third = byte_at(pos - 2) + if 0xc2 <= third <= 0xdf + return yield (third << 6) &+ (fourth &- 0x3080), 2, nil + end + + if (third & 0xc0) != 0x80 || pos < 3 + invalid_byte_sequence_before + end + + second = byte_at(pos - 3) + if second & 0xf0 == 0xe0 + if second == 0xe0 && third <= 0x9f + invalid_byte_sequence_before + end + + if second == 0xed && third >= 0xa0 + invalid_byte_sequence_before + end + + return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil + end + + if (second & 0xc0) != 0x80 || pos < 4 + invalid_byte_sequence_before + end + + first = byte_at(pos - 4) + if second <= 0x8f + unless 0xf1 <= first <= 0xf4 + invalid_byte_sequence_before + end + else + unless 0xf0 <= first <= 0xf3 + invalid_byte_sequence_before + end + end + + return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil + end + + private macro invalid_byte_sequence_before + return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8! + end + + @[AlwaysInline] + private def decode_previous_char + return nil if @pos == 0 + + decode_char_before(@pos) do |code_point, width, error| @current_char_width = width + @pos -= width @error = error @current_char = code_point.unsafe_chr end From f9f2abe9b34c4f980edf46bbe4b1f0efae21565d Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Sat, 25 Nov 2023 00:57:28 +0800 Subject: [PATCH 2/2] fixup --- spec/std/char/reader_spec.cr | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spec/std/char/reader_spec.cr b/spec/std/char/reader_spec.cr index cc812648b9b2..3df3844d2a0e 100644 --- a/spec/std/char/reader_spec.cr +++ b/spec/std/char/reader_spec.cr @@ -10,7 +10,8 @@ end private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__) str = String.new bytes - reader = Char::Reader.new(at_end: str) + reader = Char::Reader.new(str, pos: bytes.size) + reader.previous_char reader.current_char.should eq(str[0]), file: file, line: line reader.current_char_width.should eq(bytes.size), file: file, line: line reader.pos.should eq(0), file: file, line: line @@ -18,7 +19,9 @@ private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__) end private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__) - reader = Char::Reader.new(at_end: String.new bytes) + str = String.new bytes + reader = Char::Reader.new(str, pos: bytes.size) + reader.previous_char reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line reader.current_char_width.should eq(1), file: file, line: line reader.pos.should eq(bytes.size - 1), file: file, line: line @@ -211,7 +214,7 @@ describe "Char::Reader" do assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0] end - describe "#previous_char / at_end" do + describe "#previous_char" do it "reads on valid UTF-8" do assert_reads_at_end Bytes[0x00] assert_reads_at_end Bytes[0x7f]