From 9e456b0b66841a78a59f225622d62411334df5f6 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Sat, 27 Feb 2021 20:18:20 +0800 Subject: [PATCH 1/3] Make Int#chr reject surrogate halves --- spec/std/base64_spec.cr | 2 +- spec/std/char_spec.cr | 12 ------------ spec/std/int_spec.cr | 8 ++++++++ src/char.cr | 8 ++------ src/http/common.cr | 2 +- src/int.cr | 5 +++-- src/primitives.cr | 2 +- src/string.cr | 12 ++++++------ 8 files changed, 22 insertions(+), 29 deletions(-) diff --git a/spec/std/base64_spec.cr b/spec/std/base64_spec.cr index ba601955677b..1a5b021b2956 100644 --- a/spec/std/base64_spec.cr +++ b/spec/std/base64_spec.cr @@ -87,7 +87,7 @@ describe "Base64" do it "works for most characters" do a = String.build(65536 * 4) do |buf| - 65536.times { |i| buf << (i + 1).chr } + 65536.times { |i| buf << (i + 1).unsafe_chr } end b = Base64.encode(a) Crystal::Digest::MD5.hexdigest(Base64.decode_string(b)).should eq(Crystal::Digest::MD5.hexdigest(a)) diff --git a/spec/std/char_spec.cr b/spec/std/char_spec.cr index dce9d853ba42..51ed153518ff 100644 --- a/spec/std/char_spec.cr +++ b/spec/std/char_spec.cr @@ -280,12 +280,6 @@ describe "Char" do it "does for unicode" do '青'.bytesize.should eq(3) end - - it "raises on codepoint bigger than 0x10ffff" do - expect_raises InvalidByteSequenceError do - (0x10ffff + 1).unsafe_chr.bytesize - end - end end describe "in_set?" do @@ -332,12 +326,6 @@ describe "Char" do end end - it "raises on codepoint bigger than 0x10ffff when doing each_byte" do - expect_raises InvalidByteSequenceError do - (0x10ffff + 1).unsafe_chr.each_byte { |b| } - end - end - it "does each_byte" do 'a'.each_byte(&.should eq('a'.ord)).should be_nil end diff --git a/spec/std/int_spec.cr b/spec/std/int_spec.cr index 93b6db182998..8e97b973623d 100644 --- a/spec/std/int_spec.cr +++ b/spec/std/int_spec.cr @@ -776,6 +776,14 @@ describe "Int" do expect_raises(ArgumentError, "#{0x10ffff + 1} out of char range") do (0x10ffff + 1).chr end + + expect_raises(ArgumentError, "#{0xd800} out of char range") do + 0xd800.chr + end + + expect_raises(ArgumentError, "#{0xdfff} out of char range") do + 0xdfff.chr + end end it "#unsafe_chr" do diff --git a/src/char.cr b/src/char.cr index 0e7414a31b64..72744aaab69e 100644 --- a/src/char.cr +++ b/src/char.cr @@ -693,14 +693,12 @@ struct Char yield (0xe0 | (c >> 12)).to_u8 yield (0x80 | ((c >> 6) & 0x3f)).to_u8 yield (0x80 | (c & 0x3f)).to_u8 - elsif c <= MAX_CODEPOINT + else # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx yield (0xf0 | (c >> 18)).to_u8 yield (0x80 | ((c >> 12) & 0x3f)).to_u8 yield (0x80 | ((c >> 6) & 0x3f)).to_u8 yield (0x80 | (c & 0x3f)).to_u8 - else - raise InvalidByteSequenceError.new("Invalid char value #{dump}") end end @@ -723,11 +721,9 @@ struct Char elsif c <= 0xffff # 1110xxxx 10xxxxxx 10xxxxxx 3 - elsif c <= MAX_CODEPOINT + else # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4 - else - raise InvalidByteSequenceError.new("Invalid char value #{dump}") end end diff --git a/src/http/common.cr b/src/http/common.cr index ad2288fae67e..5d821e65e5f2 100644 --- a/src/http/common.cr +++ b/src/http/common.cr @@ -387,7 +387,7 @@ module HTTP String.build do |io| while quoted_pair_index io.write(data[0, quoted_pair_index]) - io << data[quoted_pair_index + 1].chr + io << data[quoted_pair_index + 1].unsafe_chr data += quoted_pair_index + 2 quoted_pair_index = data.index('\\'.ord) diff --git a/src/int.cr b/src/int.cr index 90f20589e4e6..e007237b57b4 100644 --- a/src/int.cr +++ b/src/int.cr @@ -62,13 +62,14 @@ struct Int # Returns a `Char` that has the unicode codepoint of `self`. # - # Raises `ArgumentError` if this integer's value doesn't fit a char's range (`0..0x10ffff`). + # Raises `ArgumentError` if this integer's value doesn't fit a char's range + # (`0..0xd7ff` and `0xe000..0x10ffff`). # # ``` # 97.chr # => 'a' # ``` def chr - unless 0 <= self <= Char::MAX_CODEPOINT + unless 0 <= self <= 0xd7ff || 0xe000 <= self <= Char::MAX_CODEPOINT raise ArgumentError.new("#{self} out of char range") end unsafe_chr diff --git a/src/primitives.cr b/src/primitives.cr index 246ff09714c4..ffe43126eaf9 100644 --- a/src/primitives.cr +++ b/src/primitives.cr @@ -316,7 +316,7 @@ end struct {{int.id}} # Returns a `Char` that has the unicode codepoint of `self`, # without checking if this integer is in the range valid for - # chars (`0..0x10ffff`). + # chars (`0..0xd7ff` and `0xe000..0x10ffff`). # # You should never use this method unless `chr` turns out to # be a bottleneck. diff --git a/src/string.cr b/src/string.cr index d0e5ba852b32..4a4c1bc0dca0 100644 --- a/src/string.cr +++ b/src/string.cr @@ -710,18 +710,18 @@ class String unless v.finite? startptr = to_unsafe if whitespace - while startptr.value.chr.ascii_whitespace? + while startptr.value.unsafe_chr.ascii_whitespace? startptr += 1 end end - if startptr.value.chr.in?('+', '-') + if startptr.value.unsafe_chr.in?('+', '-') startptr += 1 end if v.nan? - return unless startptr.value.chr.in?('n', 'N') + return unless startptr.value.unsafe_chr.in?('n', 'N') else - return unless startptr.value.chr.in?('i', 'I') + return unless startptr.value.unsafe_chr.in?('i', 'I') end end @@ -732,7 +732,7 @@ class String if strict if whitespace - while endptr < string_end && endptr.value.chr.ascii_whitespace? + while endptr < string_end && endptr.value.unsafe_chr.ascii_whitespace? endptr += 1 end end @@ -741,7 +741,7 @@ class String else ptr = to_unsafe if whitespace - while ptr < string_end && ptr.value.chr.ascii_whitespace? + while ptr < string_end && ptr.value.unsafe_chr.ascii_whitespace? ptr += 1 end end From a8546541d381187e9793d63c24cb4ead78e440e5 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Sat, 27 Feb 2021 22:12:29 +0800 Subject: [PATCH 2/3] Int#chr uses base 16 in error message --- spec/std/int_spec.cr | 6 +++--- src/int.cr | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/std/int_spec.cr b/spec/std/int_spec.cr index 8e97b973623d..fea087630c7c 100644 --- a/spec/std/int_spec.cr +++ b/spec/std/int_spec.cr @@ -773,15 +773,15 @@ describe "Int" do it "#chr" do 65.chr.should eq('A') - expect_raises(ArgumentError, "#{0x10ffff + 1} out of char range") do + expect_raises(ArgumentError, "0x110000 out of char range") do (0x10ffff + 1).chr end - expect_raises(ArgumentError, "#{0xd800} out of char range") do + expect_raises(ArgumentError, "0xd800 out of char range") do 0xd800.chr end - expect_raises(ArgumentError, "#{0xdfff} out of char range") do + expect_raises(ArgumentError, "0xdfff out of char range") do 0xdfff.chr end end diff --git a/src/int.cr b/src/int.cr index e007237b57b4..0e5d538ab41c 100644 --- a/src/int.cr +++ b/src/int.cr @@ -70,7 +70,7 @@ struct Int # ``` def chr unless 0 <= self <= 0xd7ff || 0xe000 <= self <= Char::MAX_CODEPOINT - raise ArgumentError.new("#{self} out of char range") + raise ArgumentError.new("0x#{self.to_s(16)} out of char range") end unsafe_chr end From a54ebdd66d32cc06e2b9882f6ab8fc5aaad3cbd6 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Sat, 27 Feb 2021 22:19:22 +0800 Subject: [PATCH 3/3] Don't use surrogates in string literal --- spec/std/string/utf16_spec.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr index 912772ea050b..49141c79ddd2 100644 --- a/spec/std/string/utf16_spec.cr +++ b/spec/std/string/utf16_spec.cr @@ -27,7 +27,7 @@ describe "String UTF16" do end it "in the range U+D800..U+DFFF" do - encoded = "\u{D800}\u{DFFF}".to_utf16 + encoded = String.new(Bytes[0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF]).to_utf16 encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16]) encoded.unsafe_fetch(encoded.size).should eq 0_u16 end