Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix invalid UTF-8 handling in Char::Reader#previous_char #14013

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 129 additions & 4 deletions spec/std/char/reader_spec.cr
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@
require "spec"
require "char/reader"

private def assert_invalid_byte_sequence(bytes)
private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
reader = Char::Reader.new(String.new bytes)
reader.current_char.should eq(Char::REPLACEMENT)
reader.current_char_width.should eq(1)
reader.error.should eq(bytes[0])
reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
reader.current_char_width.should eq(1), file: file, line: line
reader.error.should eq(bytes[0]), file: file, line: line
end

# Checks that a reader created with `at_end:` on the string built from *bytes*
# sees the whole input as a single character: `current_char` equals the
# string's first character, `current_char_width` equals `bytes.size`, `pos`
# is 0 and `error` is nil.
#
# *file* and *line* are forwarded to every expectation.
private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
  string = String.new(bytes)
  char_reader = Char::Reader.new(at_end: string)

  char_reader.current_char.should eq(string[0]), file: file, line: line
  char_reader.current_char_width.should eq(bytes.size), file: file, line: line
  char_reader.pos.should eq(0), file: file, line: line
  char_reader.error.should be_nil, file: file, line: line
end

# Checks that a reader created with `at_end:` on the string built from *bytes*
# rejects the final byte: `current_char` is `Char::REPLACEMENT`,
# `current_char_width` is 1, `pos` is the index of the last byte and `error`
# equals that last byte.
#
# *file* and *line* are forwarded to every expectation.
private def assert_invalid_byte_sequence_at_end(bytes, *, file = __FILE__, line = __LINE__)
  string = String.new(bytes)
  char_reader = Char::Reader.new(at_end: string)

  char_reader.current_char.should eq(Char::REPLACEMENT), file: file, line: line
  char_reader.current_char_width.should eq(1), file: file, line: line
  char_reader.pos.should eq(bytes.size - 1), file: file, line: line
  char_reader.error.should eq(bytes[-1]), file: file, line: line
end

describe "Char::Reader" do
Expand Down Expand Up @@ -193,4 +210,112 @@ describe "Char::Reader" do
it "errors if fourth_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
end

describe "#previous_char / at_end" do
straight-shoota marked this conversation as resolved.
Show resolved Hide resolved
# Each byte string below is passed to `assert_reads_at_end`, i.e. it must be
# read as exactly one character by a reader created with `at_end:`.
it "reads on valid UTF-8" do
# 1-byte sequences: lowest and highest bytes in 0x00..0x7F
assert_reads_at_end Bytes[0x00]
assert_reads_at_end Bytes[0x7f]

# 2-byte sequences: lead bytes 0xC2 and 0xDF, each with continuation bytes 0x80 and 0xBF
assert_reads_at_end Bytes[0xc2, 0x80]
assert_reads_at_end Bytes[0xc2, 0xbf]
assert_reads_at_end Bytes[0xdf, 0x80]
assert_reads_at_end Bytes[0xdf, 0xbf]

# 3-byte sequences whose second byte is in 0x80..0x9F (lead bytes 0xE1, 0xED, 0xEF)
assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xed, 0x80, 0x80]
assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xef, 0x80, 0x80]
assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]

# 3-byte sequences whose second byte is in 0xA0..0xBF (lead bytes 0xE0, 0xE1, 0xEF)
assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]

# 4-byte sequences whose second byte is in 0x80..0x8F (lead bytes 0xF1, 0xF4)
assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]

# 4-byte sequences whose second byte is in 0x90..0xBF (lead bytes 0xF0, 0xF3)
assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
end

# Each byte string below is passed to `assert_invalid_byte_sequence_at_end`,
# i.e. a reader created with `at_end:` must report the replacement character
# with width 1 at the last byte, and that last byte as the error.
it "errors on invalid UTF-8" do
# Single bytes outside 0x00..0x7F: continuation bytes (0x80, 0xBF), then 0xC0 and 0xFF
assert_invalid_byte_sequence_at_end Bytes[0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xc0]
assert_invalid_byte_sequence_at_end Bytes[0xff]

# Two bytes: trailing 0x80 preceded by a byte outside 0xC2..0xDF
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x9f, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80]

# Three bytes with the middle byte in 0x80..0x9F; includes the overlong
# 0xE0 forms (0xE0 0x80 0x80 and 0xE0 0x9F 0xBF)
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xe0, 0x9f, 0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80]

# Three bytes with the middle byte in 0xA0..0xBF; includes the 0xED forms
# for U+D800 (0xED 0xA0 0x80) and U+DFFF (0xED 0xBF 0xBF)
assert_invalid_byte_sequence_at_end Bytes[0x00, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x7f, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x80, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x8f, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0x90, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xbf, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc0, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc1, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xc2, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xdf, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xed, 0xbf, 0xbf]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0xa0, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0xa0, 0x80]

# Four bytes with second byte 0x80 and a first byte outside 0xF1..0xF4
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf0, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x80, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x80, 0x80, 0x80]

# Four bytes with second byte 0x90 and a first byte outside 0xF0..0xF3
assert_invalid_byte_sequence_at_end Bytes[0x00, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xef, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf4, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xf5, 0x90, 0x80, 0x80]
assert_invalid_byte_sequence_at_end Bytes[0xff, 0x90, 0x80, 0x80]
end
end
end
91 changes: 84 additions & 7 deletions src/char/reader.cr
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ struct Char
end

private macro invalid_byte_sequence
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8
return yield Char::REPLACEMENT.ord.to_u32!, 1, first.to_u8!
end

@[AlwaysInline]
Expand All @@ -254,15 +254,92 @@ struct Char
end
end

private def decode_previous_char
return if @pos == 0
# The reverse UTF-8 DFA transition table for reference: (contrast with
# `Unicode::UTF8_ENCODING_DFA`)
#
#              accepted (initial state)
#              | 1 continuation byte
#              | | 2 continuation bytes; disallow overlong encodings up to U+07FF
#              | | | 2 continuation bytes; disallow surrogates (U+D800..U+DFFF)
#              | | | | 3 continuation bytes; disallow overlong encodings up to U+FFFF
#              | | | | | 3 continuation bytes; disallow codepoints above U+10FFFF
#              v v v v v v
#
#            | 0 2 3 4 5 6
# -----------+------------
# 0x00..0x7F | 0 _ _ _ _ _
# 0x80..0x8F | 2 3 5 5 _ _
# 0x90..0x9F | 2 3 6 6 _ _
# 0xA0..0xBF | 2 4 6 6 _ _
# 0xC2..0xDF | _ 0 _ _ _ _
# 0xE0..0xE0 | _ _ _ 0 _ _
# 0xE1..0xEC | _ _ 0 0 _ _
# 0xED..0xED | _ _ 0 _ _ _
# 0xEE..0xEF | _ _ 0 0 _ _
# 0xF0..0xF0 | _ _ _ _ _ 0
# 0xF1..0xF3 | _ _ _ _ 0 0
# 0xF4..0xF4 | _ _ _ _ 0 _
private def decode_char_before(pos, & : UInt32, Int32, UInt8? ->)
fourth = byte_at(pos - 1)
if fourth <= 0x7f
return yield fourth, 1, nil
end

while @pos > 0
@pos -= 1
break if (byte_at(@pos) & 0xC0) != 0x80
if fourth > 0xbf || pos < 2
invalid_byte_sequence_before
end
decode_char_at(@pos) do |code_point, width, error|

third = byte_at(pos - 2)
if 0xc2 <= third <= 0xdf
return yield (third << 6) &+ (fourth &- 0x3080), 2, nil
end

if (third & 0xc0) != 0x80 || pos < 3
invalid_byte_sequence_before
end

second = byte_at(pos - 3)
if second & 0xf0 == 0xe0
if second == 0xe0 && third <= 0x9f
invalid_byte_sequence_before
end

if second == 0xed && third >= 0xa0
invalid_byte_sequence_before
end

return yield (second << 12) &+ (third << 6) &+ (fourth &- 0xE2080), 3, nil
end

if (second & 0xc0) != 0x80 || pos < 4
invalid_byte_sequence_before
end

first = byte_at(pos - 4)
if second <= 0x8f
unless 0xf1 <= first <= 0xf4
invalid_byte_sequence_before
end
else
unless 0xf0 <= first <= 0xf3
invalid_byte_sequence_before
end
end

return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil
yield (first << 18) &+ (second << 12) &+ (third << 6) &+ (fourth &- 0x3C82080), 4, nil

end

# Expands to a `return yield` that reports an invalid byte: it yields the
# codepoint of `Char::REPLACEMENT`, a width of 1, and `fourth` converted with
# `to_u8!` as the error value.
# NOTE(review): relies on a local named `fourth` being in scope wherever the
# macro is expanded.
private macro invalid_byte_sequence_before
return yield Char::REPLACEMENT.ord.to_u32!, 1, fourth.to_u8!
end

@[AlwaysInline]
private def decode_previous_char
return nil if @pos == 0

decode_char_before(@pos) do |code_point, width, error|
@current_char_width = width
@pos -= width
@error = error
@current_char = code_point.unsafe_chr
end
Expand Down
Loading