Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add single source of UTF-8 test sequences for specs #14433

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 11 additions & 93 deletions spec/std/char/reader_spec.cr
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require "spec"
require "char/reader"
require "../../support/string"

private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LINE__)
reader = Char::Reader.new(String.new bytes)
Expand All @@ -8,11 +9,11 @@ private def assert_invalid_byte_sequence(bytes, *, file = __FILE__, line = __LIN
reader.error.should eq(bytes[0]), file: file, line: line
end

private def assert_reads_at_end(bytes, *, file = __FILE__, line = __LINE__)
private def assert_reads_at_end(bytes, char, *, file = __FILE__, line = __LINE__)
str = String.new bytes
reader = Char::Reader.new(str, pos: bytes.size)
reader.previous_char
reader.current_char.should eq(str[0]), file: file, line: line
reader.previous_char.should eq(char), file: file, line: line
reader.current_char.should eq(char), file: file, line: line
reader.current_char_width.should eq(bytes.size), file: file, line: line
reader.pos.should eq(0), file: file, line: line
reader.error.should be_nil, file: file, line: line
Expand Down Expand Up @@ -214,100 +215,17 @@ describe "Char::Reader" do
reader.pos.should eq(0)
end

it "errors if 0x80 <= first_byte < 0xC2" do
assert_invalid_byte_sequence Bytes[0x80]
assert_invalid_byte_sequence Bytes[0xC1]
end

it "errors if (second_byte & 0xC0) != 0x80" do
assert_invalid_byte_sequence Bytes[0xd0]
end

it "errors if first_byte == 0xE0 && second_byte < 0xA0" do
assert_invalid_byte_sequence Bytes[0xe0, 0x9F, 0xA0]
end

it "errors if first_byte == 0xED && second_byte >= 0xA0" do
assert_invalid_byte_sequence Bytes[0xed, 0xB0, 0xA0]
end

it "errors if first_byte < 0xF0 && (third_byte & 0xC0) != 0x80" do
assert_invalid_byte_sequence Bytes[0xe0, 0xA0, 0]
end

it "errors if first_byte == 0xF0 && second_byte < 0x90" do
assert_invalid_byte_sequence Bytes[0xf0, 0x8F, 0xA0]
end

it "errors if first_byte == 0xF4 && second_byte >= 0x90" do
assert_invalid_byte_sequence Bytes[0xf4, 0x90, 0xA0]
end

it "errors if first_byte < 0xF5 && (fourth_byte & 0xC0) != 0x80" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8F, 0xA0, 0]
end

it "errors if first_byte >= 0xF5" do
assert_invalid_byte_sequence Bytes[0xf5, 0x8F, 0xA0, 0xA0]
end

it "errors if second_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4]
end

it "errors if third_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f]
end

it "errors if fourth_byte is out of bounds" do
assert_invalid_byte_sequence Bytes[0xf4, 0x8f, 0xa0]
it "errors on invalid UTF-8" do
{% for bytes in INVALID_UTF8_BYTE_SEQUENCES %}
assert_invalid_byte_sequence Bytes{{ bytes }}
{% end %}
end

describe "#previous_char" do
it "reads on valid UTF-8" do
assert_reads_at_end Bytes[0x00]
assert_reads_at_end Bytes[0x7f]

assert_reads_at_end Bytes[0xc2, 0x80]
assert_reads_at_end Bytes[0xc2, 0xbf]
assert_reads_at_end Bytes[0xdf, 0x80]
assert_reads_at_end Bytes[0xdf, 0xbf]

assert_reads_at_end Bytes[0xe1, 0x80, 0x80]
assert_reads_at_end Bytes[0xe1, 0x80, 0xbf]
assert_reads_at_end Bytes[0xe1, 0x9f, 0x80]
assert_reads_at_end Bytes[0xe1, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xed, 0x80, 0x80]
assert_reads_at_end Bytes[0xed, 0x80, 0xbf]
assert_reads_at_end Bytes[0xed, 0x9f, 0x80]
assert_reads_at_end Bytes[0xed, 0x9f, 0xbf]
assert_reads_at_end Bytes[0xef, 0x80, 0x80]
assert_reads_at_end Bytes[0xef, 0x80, 0xbf]
assert_reads_at_end Bytes[0xef, 0x9f, 0x80]
assert_reads_at_end Bytes[0xef, 0x9f, 0xbf]

assert_reads_at_end Bytes[0xe0, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe0, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe0, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe0, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xa0, 0x80]
assert_reads_at_end Bytes[0xe1, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xe1, 0xbf, 0x80]
assert_reads_at_end Bytes[0xe1, 0xbf, 0xbf]
assert_reads_at_end Bytes[0xef, 0xa0, 0x80]
assert_reads_at_end Bytes[0xef, 0xa0, 0xbf]
assert_reads_at_end Bytes[0xef, 0xbf, 0x80]
assert_reads_at_end Bytes[0xef, 0xbf, 0xbf]

assert_reads_at_end Bytes[0xf1, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf1, 0x8f, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x80, 0x80, 0x80]
assert_reads_at_end Bytes[0xf4, 0x8f, 0x80, 0x80]

assert_reads_at_end Bytes[0xf0, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf0, 0xbf, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0x90, 0x80, 0x80]
assert_reads_at_end Bytes[0xf3, 0xbf, 0x80, 0x80]
{% for bytes, char in VALID_UTF8_BYTE_SEQUENCES %}
assert_reads_at_end Bytes{{ bytes }}, {{ char }}
{% end %}
end

it "errors on invalid UTF-8" do
Expand Down
24 changes: 9 additions & 15 deletions spec/std/io/buffered_spec.cr
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require "../spec_helper"
require "../../support/string"

private class BufferedWrapper < IO
include IO::Buffered
Expand Down Expand Up @@ -176,22 +177,15 @@ describe "IO::Buffered" do
io.read_char.should eq('界')
io.read_char.should be_nil

io = IO::Memory.new
io.write Bytes[0xf8, 0xff, 0xff, 0xff]
io.rewind
io = BufferedWrapper.new(io)

expect_raises(InvalidByteSequenceError) do
io.read_char
end
{% for bytes, char in VALID_UTF8_BYTE_SEQUENCES %}
BufferedWrapper.new(IO::Memory.new(Bytes{{ bytes }})).read_char.should eq({{ char }})
{% end %}

io = IO::Memory.new
io.write_byte 0x81_u8
io.rewind
io = BufferedWrapper.new(io)
expect_raises(InvalidByteSequenceError) do
p io.read_char
end
{% for bytes in INVALID_UTF8_BYTE_SEQUENCES %}
expect_raises(InvalidByteSequenceError) do
BufferedWrapper.new(IO::Memory.new(Bytes{{ bytes }})).read_char
end
{% end %}
end

it "reads byte" do
Expand Down
31 changes: 8 additions & 23 deletions spec/std/io/io_spec.cr
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require "../spec_helper"
require "../../support/channel"
require "../../support/string"
require "spec/helpers/iterate"

require "socket"
Expand Down Expand Up @@ -338,29 +339,13 @@ describe IO do
io.read_char.should eq('界')
io.read_char.should be_nil

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc4, 0x70]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc4, 0x70, 0x00, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf8]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf8, 0x00, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0x81]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0x81, 0x00, 0x00, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xa0, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xa0, 0x80, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xbf, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xed, 0xbf, 0xbf, 0x00]).read_char }

expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc0, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc0, 0x80, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc1, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xc1, 0xbf, 0x00, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x80, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x80, 0x80, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x9f, 0xbf]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xe0, 0x9f, 0xbf, 0x00]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf0, 0x80, 0x80, 0x80]).read_char }
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes[0xf0, 0x8f, 0xbf, 0xbf]).read_char }
{% for bytes, char in VALID_UTF8_BYTE_SEQUENCES %}
SimpleIOMemory.new(Bytes{{ bytes }}).read_char.should eq({{ char }})
{% end %}

{% for bytes in INVALID_UTF8_BYTE_SEQUENCES %}
expect_raises(InvalidByteSequenceError) { SimpleIOMemory.new(Bytes{{ bytes }}).read_char }
{% end %}
end

it "reads byte" do
Expand Down
112 changes: 15 additions & 97 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require "./spec_helper"
require "../support/string"
require "spec/helpers/iterate"
require "spec/helpers/string"

Expand Down Expand Up @@ -2946,65 +2947,13 @@ describe "String" do
"hello".valid_encoding?.should be_true
"hello\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}".valid_encoding?.should be_true

# non-starters
String.new(Bytes[0x80]).valid_encoding?.should be_false
String.new(Bytes[0x8F]).valid_encoding?.should be_false
String.new(Bytes[0x90]).valid_encoding?.should be_false
String.new(Bytes[0x9F]).valid_encoding?.should be_false
String.new(Bytes[0xA0]).valid_encoding?.should be_false
String.new(Bytes[0xAF]).valid_encoding?.should be_false

# incomplete, 2-byte
String.new(Bytes[0xC2]).valid_encoding?.should be_false
String.new(Bytes[0xC2, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xC2, 0xC2]).valid_encoding?.should be_false

# overlong, 2-byte
String.new(Bytes[0xC0, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xC1, 0xBF]).valid_encoding?.should be_false
String.new(Bytes[0xC2, 0x80]).valid_encoding?.should be_true

# incomplete, 3-byte
String.new(Bytes[0xE1]).valid_encoding?.should be_false
String.new(Bytes[0xE1, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xE1, 0xC2]).valid_encoding?.should be_false
String.new(Bytes[0xE1, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xE1, 0x80, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xE1, 0x80, 0xC2]).valid_encoding?.should be_false

# overlong, 3-byte
String.new(Bytes[0xE0, 0x80, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xE0, 0x9F, 0xBF]).valid_encoding?.should be_false
String.new(Bytes[0xE0, 0xA0, 0x80]).valid_encoding?.should be_true

# surrogate pairs
String.new(Bytes[0xED, 0x9F, 0xBF]).valid_encoding?.should be_true
String.new(Bytes[0xED, 0xA0, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xED, 0xBF, 0xBF]).valid_encoding?.should be_false
String.new(Bytes[0xEE, 0x80, 0x80]).valid_encoding?.should be_true

# incomplete, 4-byte
String.new(Bytes[0xF1]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0xC2]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80, 0xC2]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80, 0x80, 0x00]).valid_encoding?.should be_false
String.new(Bytes[0xF1, 0x80, 0x80, 0xC2]).valid_encoding?.should be_false

# overlong, 4-byte
String.new(Bytes[0xF0, 0x80, 0x80, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xF0, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_false
String.new(Bytes[0xF0, 0x90, 0x80, 0x80]).valid_encoding?.should be_true

# upper boundary, 4-byte
String.new(Bytes[0xF4, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_true
String.new(Bytes[0xF4, 0x90, 0x80, 0x80]).valid_encoding?.should be_false
String.new(Bytes[0xF5]).valid_encoding?.should be_false
String.new(Bytes[0xF8]).valid_encoding?.should be_false
String.new(Bytes[0xFF]).valid_encoding?.should be_false
{% for bytes in VALID_UTF8_BYTE_SEQUENCES %}
String.new(Bytes{{ bytes }}).valid_encoding?.should be_true
{% end %}

{% for bytes in INVALID_UTF8_BYTE_SEQUENCES %}
String.new(Bytes{{ bytes }}).valid_encoding?.should be_false
{% end %}
end

it "scrubs" do
Expand Down Expand Up @@ -3114,44 +3063,13 @@ end
class String
describe String do
it ".char_bytesize_at" do
String.char_bytesize_at(Bytes[0x00, 0].to_unsafe).should eq 1
String.char_bytesize_at(Bytes[0x7F, 0].to_unsafe).should eq 1
String.char_bytesize_at(Bytes[0x80, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xBF, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xC2, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xC3, 0].to_unsafe).should eq 1 # malformed

String.char_bytesize_at(Bytes[0xC2, 0x7F, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xC2, 0x80, 0].to_unsafe).should eq 2
String.char_bytesize_at(Bytes[0xDF, 0xBF, 0].to_unsafe).should eq 2
String.char_bytesize_at(Bytes[0xDF, 0xC0, 0].to_unsafe).should eq 1 # malformed

String.char_bytesize_at(Bytes[0xE0, 0xA0, 0x7F, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xE0, 0x9F, 0x8F, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xE0, 0xA0, 0x80, 0].to_unsafe).should eq 3
String.char_bytesize_at(Bytes[0xED, 0x9F, 0xBF, 0].to_unsafe).should eq 3
String.char_bytesize_at(Bytes[0xED, 0x9F, 0xC0, 0].to_unsafe).should eq 1 # surrogate
String.char_bytesize_at(Bytes[0xED, 0xBF, 0xBF, 0].to_unsafe).should eq 1 # surrogate
String.char_bytesize_at(Bytes[0xEE, 0x80, 0x80, 0].to_unsafe).should eq 3
String.char_bytesize_at(Bytes[0xEF, 0xBF, 0xBD, 0].to_unsafe).should eq 3
String.char_bytesize_at(Bytes[0xEF, 0xBF, 0xBF, 0].to_unsafe).should eq 3
String.char_bytesize_at(Bytes[0xEF, 0xBF, 0xC0, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xEF, 0xC0, 0xBF, 0].to_unsafe).should eq 1 # malformed

String.char_bytesize_at(Bytes[0xF0, 0x90, 0x80, 0x7F, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xF0, 0x90, 0x7F, 0x80, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xF0, 0x8F, 0x80, 0x80, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xF0, 0x90, 0x80, 0x80, 0].to_unsafe).should eq 4
String.char_bytesize_at(Bytes[0xF0, 0x9F, 0xBF, 0xBF, 0].to_unsafe).should eq 4
String.char_bytesize_at(Bytes[0xF3, 0x90, 0x80, 0x80, 0].to_unsafe).should eq 4
String.char_bytesize_at(Bytes[0xF4, 0x8F, 0xBD, 0xBF, 0].to_unsafe).should eq 4
String.char_bytesize_at(Bytes[0xF4, 0x8F, 0xBF, 0xBF, 0].to_unsafe).should eq 4
String.char_bytesize_at(Bytes[0xF4, 0x8F, 0xBF, 0xC0, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xF4, 0x8F, 0xC0, 0xBF, 0].to_unsafe).should eq 1 # malformed
String.char_bytesize_at(Bytes[0xF4, 0x90, 0xBF, 0xBF, 0].to_unsafe).should eq 1 # malformed

String.char_bytesize_at(Bytes[0xF5, 0].to_unsafe).should eq 1 # out of codepoint range
String.char_bytesize_at(Bytes[0xFF, 0].to_unsafe).should eq 1 # out of codepoint range
{% for bytes, char in VALID_UTF8_BYTE_SEQUENCES %}
String.char_bytesize_at(Bytes[{{ bytes.splat }}, 0].to_unsafe).should eq({{ bytes.size }})
{% end %}

{% for bytes in INVALID_UTF8_BYTE_SEQUENCES %}
String.char_bytesize_at(Bytes[{{ bytes.splat }}, 0].to_unsafe).should eq 1
{% end %}
end
end
end
Loading
Loading