Skip to content

Commit

Permalink
Fix invalid UTF-8 handling in Char::Reader#previous_char (crystal-l…
Browse files Browse the repository at this point in the history
  • Loading branch information
HertzDevil authored Nov 25, 2023
1 parent f9b7226 commit c7202d4
Show file tree
Hide file tree
Showing 2 changed files with 216 additions and 11 deletions.
136 changes: 132 additions & 4 deletions spec/std/char/reader_spec.cr
Original file line number Diff line number Diff line change
@@ -1,11 +1,31 @@
require "spec"
require "char/reader"

# Builds a `Char::Reader` over the string made from *bytes* (no explicit
# position is passed) and expects it to report the replacement character, a
# character width of 1, and the first byte of *bytes* as the error.
#
# NOTE(review): this span appears to show both sides of a rendered diff. The
# signature without `file`/`line` and the three expectations without
# `file: file, line: line` are presumably the removed lines — confirm against
# the repository before treating this span as compilable code.
private def assert_invalid_byte_sequence(bytes)
private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
reader = Char::Reader.new(String.new bytes)
reader.current_char.should eq(Char::REPLACEMENT)
reader.current_char_width.should eq(1)
reader.error.should eq(bytes[0])
# Same three expectations, with *file* and *line* forwarded to `should`.
reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
reader.current_char_width.should eq(1), file: file, line: line
reader.error.should eq(bytes[0]), file: file, line: line
end

# Expects that stepping back once from the end of the string built from
# *bytes* decodes the whole byte sequence as one valid character.
#
# After `previous_char` the reader must hold the string's first character,
# report a width equal to `bytes.size`, sit at position 0 and carry no error.
# *file* and *line* are forwarded to every expectation.
private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
  string = String.new(bytes)
  char_reader = Char::Reader.new(string, pos: bytes.size)
  char_reader.previous_char

  char_reader.current_char.should(eq(string[0]), file: file, line: line)
  char_reader.current_char_width.should(eq(bytes.size), file: file, line: line)
  char_reader.pos.should(eq(0), file: file, line: line)
  char_reader.error.should(be_nil, file: file, line: line)
end

# Expects that stepping back once from the end of the string built from
# *bytes* fails to decode a character.
#
# After `previous_char` the reader must hold the replacement character with a
# width of 1, sit on the last byte (`bytes.size - 1`) and report that last
# byte as the error. *file* and *line* are forwarded to every expectation.
private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__)
  string = String.new(bytes)
  char_reader = Char::Reader.new(string, pos: bytes.size)
  char_reader.previous_char

  char_reader.current_char.should(eq(Char::REPLACEMENT), file: file, line: line)
  char_reader.current_char_width.should(eq(1), file: file, line: line)
  char_reader.pos.should(eq(bytes.size - 1), file: file, line: line)
  char_reader.error.should(eq(bytes[-1]), file: file, line: line)
end

describe "Char::Reader" do
Expand Down Expand Up @@ -242,4 +262,112 @@ describe "Char::Reader" do
it "errors if fourth_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
end

describe "#previous_char" do
# Each vector is a single well-formed sequence; `assert_reads_at_end` steps
# back from the end and expects the whole sequence to decode as one character.
it "reads on valid UTF-8" do
# One-byte sequences: lowest and highest ASCII bytes.
assert_reads_at_end Bytes[0x00]
assert_reads_at_end Bytes[0x7f]

# Two-byte sequences: lead bytes 0xc2 and 0xdf with the continuation byte at
# 0x80 and 0xbf.
assert_reads_at_end Bytes[0xc2, 0x80]
assert_reads_at_end Bytes[0xc2, 0xbf]
assert_reads_at_end Bytes[0xdf, 0x80]
assert_reads_at_end Bytes[0xdf, 0xbf]

# Three-byte sequences whose second byte is 0x80 or 0x9f (lead bytes 0xe1,
# 0xed and 0xef).
assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xed, 0x80, 0x80]
assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xef, 0x80, 0x80]
assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]

# Three-byte sequences whose second byte is 0xa0 or 0xbf (lead bytes 0xe0,
# 0xe1 and 0xef).
assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]

# Four-byte sequences whose second byte is 0x80 or 0x8f (lead bytes 0xf1 and
# 0xf4).
assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]

# Four-byte sequences whose second byte is 0x90 or 0xbf (lead bytes 0xf0 and
# 0xf3).
assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
end

# Each vector must NOT decode as one character when read backwards from its
# end; `assert_invalid_byte_sequence_at_end` expects the reader to step back a
# single byte and report the last byte as the error.
it "errors on invalid UTF-8" do
# Single bytes that are neither ASCII nor able to end a sequence on their own.
assert_invalid_byte_sequence_at_end Bytes[0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xc0]
assert_invalid_byte_sequence_at_end Bytes[0xff]

# Two bytes ending in 0x80 where the first byte is outside 0xc2..0xdf, so it
# is not a usable two-byte lead.
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80]

# Three bytes whose middle byte is in 0x80..0x9f: the first byte is not a
# usable three-byte lead for that range (this includes 0xe0, which the
# decoder rejects here as overlong).
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80]

# Three bytes whose middle byte is in 0xa0..0xbf: the first byte is not a
# usable three-byte lead for that range (this includes 0xed, which the
# decoder rejects here as a surrogate).
assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80]

# Four bytes whose second byte is 0x80: the first byte is outside 0xf1..0xf4
# (0xf0 here would be an overlong encoding).
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80]

# Four bytes whose second byte is 0x90: the first byte is outside 0xf0..0xf3
# (0xf4 here would encode a code point above U+10FFFF).
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80]
end
end
end
91 changes: 84 additions & 7 deletions src/char/reader.cr
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ struct Char
end

# Yields the replacement character's code point, a width of 1 and the local
# `first` converted to a byte as the error, then returns from the method the
# macro is expanded in. It relies on a local named `first` existing there.
#
# NOTE(review): two `return yield` lines appear in this span; presumably the
# `to_u8` one is the removed side of the rendered diff and the `to_u8!` one
# is the added side — confirm against the repository.
private macro invalid_byte_sequence
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8!
end

@[AlwaysInline]
Expand All @@ -343,15 +343,92 @@ struct Char
end
end

private def decode_previous_char
return if @pos == 0
# The reverse UTF-8 DFA transition table for reference: (contrast with
# `Unicode::UTF8_ENCODING_DFA`)
#
#              accepted (initial state)
#              | 1 continuation byte
#              | | 2 continuation bytes; disallow overlong encodings up to U+07FF
#              | | | 2 continuation bytes; disallow surrogate pairs
#              | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF
#              | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF
#              v v v v v v
#
#            | 0 2 3 4 5 6
# -----------+------------
# 0x00..0x7F | 0 _ _ _ _ _
# 0x80..0x8F | 2 3 5 5 _ _
# 0x90..0x9F | 2 3 6 6 _ _
# 0xA0..0xBF | 2 4 6 6 _ _
# 0xC2..0xDF | _ 0 _ _ _ _
# 0xE0..0xE0 | _ _ _ 0 _ _
# 0xE1..0xEC | _ _ 0 0 _ _
# 0xED..0xED | _ _ 0 _ _ _
# 0xEE..0xEF | _ _ 0 0 _ _
# 0xF0..0xF0 | _ _ _ _ _ 0
# 0xF1..0xF3 | _ _ _ _ 0 0
# 0xF4..0xF4 | _ _ _ _ 0 _
# Decodes the character that ends immediately before byte offset *pos* by
# reading backwards over at most four bytes, then yields the code point, the
# number of bytes it occupies, and the error byte (`nil` on success).
#
# Locals are named after their place in a four-byte sequence: `fourth` is the
# byte at `pos - 1`, `third` at `pos - 2`, `second` at `pos - 3` and `first`
# at `pos - 4`.
#
# Every rejection goes through `invalid_byte_sequence_before`, which yields
# the replacement character with width 1 and `fourth` as the error, and
# returns from this method.
#
# Assumes `pos >= 1`; the visible caller returns early when `@pos == 0`.
private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->)
fourth = byte_at(pos - 1)
# ASCII byte: a complete one-byte character.
if fourth <= 0x7f
return yield fourth, 1, nil
end

# NOTE(review): the next three lines look like removed lines of the rendered
# diff (the old backwards scan), not part of this method — confirm against the
# repository.
while @pos > 0
@pos -= 1
break if (byte_at(@pos) & 0xC0) != 0x80
# Past this point `fourth` is >= 0x80. Anything above 0xbf is not a
# continuation byte and so cannot end a sequence; with fewer than two bytes
# available there is no room for a lead byte either.
if fourth > 0xbf || pos < 2
invalid_byte_sequence_before
end
# NOTE(review): the next line also looks like a removed diff line (the old
# `decode_char_at` call) — confirm against the repository.
decode_char_at(@pos) do |code_point, width, error|

third = byte_at(pos - 2)
# Valid two-byte lead. 0x3080 == (0xC0 << 6) + 0x80, i.e. the marker bits of
# the lead and continuation bytes, removed from the combined value.
if 0xc2 <= third <= 0xdf
return yield (third << 6) &+ (fourth &- 0x3080), 2, nil
end

# Otherwise `third` must itself be a continuation byte, and a third byte
# further back must exist.
if (third & 0xc0) != 0x80 || pos < 3
invalid_byte_sequence_before
end

second = byte_at(pos - 3)
# Three-byte lead (0xe0..0xef).
if second & 0xf0 == 0xe0
# 0xe0 followed by 0x80..0x9f is an overlong encoding.
if second == 0xe0 && third <= 0x9f
invalid_byte_sequence_before
end

# 0xed followed by 0xa0..0xbf falls in the surrogate range.
if second == 0xed && third >= 0xa0
invalid_byte_sequence_before
end

# 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80: marker bits of all three bytes.
return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil
end

# Otherwise `second` must be a continuation byte too, and a fourth byte
# further back must exist.
if (second & 0xc0) != 0x80 || pos < 4
invalid_byte_sequence_before
end

first = byte_at(pos - 4)
# The allowed four-byte leads depend on the byte that follows them:
# 0x80..0x8f excludes 0xf0 (overlong), 0x90..0xbf excludes 0xf4 (above
# U+10FFFF).
if second <= 0x8f
unless 0xf1 <= first <= 0xf4
invalid_byte_sequence_before
end
else
unless 0xf0 <= first <= 0xf3
invalid_byte_sequence_before
end
end

# 0x3C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80: marker bits
# of all four bytes.
return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
end

# Failure exit used while decoding backwards: yields the replacement
# character's code point, a width of 1 and the local `fourth` converted to a
# byte as the error, then returns from the method the macro is expanded in.
# It relies on a local named `fourth` existing at the expansion site.
private macro invalid_byte_sequence_before
return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8!
end

@[AlwaysInline]
private def decode_previous_char
return nil if @pos == 0

decode_char_before(@pos) do |code_point, width, error|
@current_char_width = width
@pos -= width
@error = error
@current_char = code_point.unsafe_chr
end
Expand Down

0 comments on commit c7202d4

Please sign in to comment.