From 9e456b0b66841a78a59f225622d62411334df5f6 Mon Sep 17 00:00:00 2001
From: Quinton Miller <nicetas.c@gmail.com>
Date: Sat, 27 Feb 2021 20:18:20 +0800
Subject: [PATCH 1/3] Make Int#chr reject surrogate halves

---
 spec/std/base64_spec.cr |  2 +-
 spec/std/char_spec.cr   | 12 ------------
 spec/std/int_spec.cr    |  8 ++++++++
 src/char.cr             |  8 ++------
 src/http/common.cr      |  2 +-
 src/int.cr              |  5 +++--
 src/primitives.cr       |  2 +-
 src/string.cr           | 12 ++++++------
 8 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/spec/std/base64_spec.cr b/spec/std/base64_spec.cr
index ba601955677b..1a5b021b2956 100644
--- a/spec/std/base64_spec.cr
+++ b/spec/std/base64_spec.cr
@@ -87,7 +87,7 @@ describe "Base64" do
 
     it "works for most characters" do
       a = String.build(65536 * 4) do |buf|
-        65536.times { |i| buf << (i + 1).chr }
+        65536.times { |i| buf << (i + 1).unsafe_chr }
       end
       b = Base64.encode(a)
       Crystal::Digest::MD5.hexdigest(Base64.decode_string(b)).should eq(Crystal::Digest::MD5.hexdigest(a))
diff --git a/spec/std/char_spec.cr b/spec/std/char_spec.cr
index dce9d853ba42..51ed153518ff 100644
--- a/spec/std/char_spec.cr
+++ b/spec/std/char_spec.cr
@@ -280,12 +280,6 @@ describe "Char" do
     it "does for unicode" do
       '青'.bytesize.should eq(3)
     end
-
-    it "raises on codepoint bigger than 0x10ffff" do
-      expect_raises InvalidByteSequenceError do
-        (0x10ffff + 1).unsafe_chr.bytesize
-      end
-    end
   end
 
   describe "in_set?" do
@@ -332,12 +326,6 @@ describe "Char" do
     end
   end
 
-  it "raises on codepoint bigger than 0x10ffff when doing each_byte" do
-    expect_raises InvalidByteSequenceError do
-      (0x10ffff + 1).unsafe_chr.each_byte { |b| }
-    end
-  end
-
   it "does each_byte" do
     'a'.each_byte(&.should eq('a'.ord)).should be_nil
   end
diff --git a/spec/std/int_spec.cr b/spec/std/int_spec.cr
index 93b6db182998..8e97b973623d 100644
--- a/spec/std/int_spec.cr
+++ b/spec/std/int_spec.cr
@@ -776,6 +776,14 @@ describe "Int" do
     expect_raises(ArgumentError, "#{0x10ffff + 1} out of char range") do
       (0x10ffff + 1).chr
     end
+
+    expect_raises(ArgumentError, "#{0xd800} out of char range") do
+      0xd800.chr
+    end
+
+    expect_raises(ArgumentError, "#{0xdfff} out of char range") do
+      0xdfff.chr
+    end
   end
 
   it "#unsafe_chr" do
diff --git a/src/char.cr b/src/char.cr
index 0e7414a31b64..72744aaab69e 100644
--- a/src/char.cr
+++ b/src/char.cr
@@ -693,14 +693,12 @@ struct Char
       yield (0xe0 | (c >> 12)).to_u8
       yield (0x80 | ((c >> 6) & 0x3f)).to_u8
       yield (0x80 | (c & 0x3f)).to_u8
-    elsif c <= MAX_CODEPOINT
+    else
       # 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
       yield (0xf0 | (c >> 18)).to_u8
       yield (0x80 | ((c >> 12) & 0x3f)).to_u8
       yield (0x80 | ((c >> 6) & 0x3f)).to_u8
       yield (0x80 | (c & 0x3f)).to_u8
-    else
-      raise InvalidByteSequenceError.new("Invalid char value #{dump}")
     end
   end
 
@@ -723,11 +721,9 @@ struct Char
     elsif c <= 0xffff
       # 1110xxxx  10xxxxxx  10xxxxxx
       3
-    elsif c <= MAX_CODEPOINT
+    else
       # 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
       4
-    else
-      raise InvalidByteSequenceError.new("Invalid char value #{dump}")
     end
   end
 
diff --git a/src/http/common.cr b/src/http/common.cr
index ad2288fae67e..5d821e65e5f2 100644
--- a/src/http/common.cr
+++ b/src/http/common.cr
@@ -387,7 +387,7 @@ module HTTP
     String.build do |io|
       while quoted_pair_index
         io.write(data[0, quoted_pair_index])
-        io << data[quoted_pair_index + 1].chr
+        io << data[quoted_pair_index + 1].unsafe_chr
 
         data += quoted_pair_index + 2
         quoted_pair_index = data.index('\\'.ord)
diff --git a/src/int.cr b/src/int.cr
index 90f20589e4e6..e007237b57b4 100644
--- a/src/int.cr
+++ b/src/int.cr
@@ -62,13 +62,14 @@ struct Int
 
   # Returns a `Char` that has the unicode codepoint of `self`.
   #
-  # Raises `ArgumentError` if this integer's value doesn't fit a char's range (`0..0x10ffff`).
+  # Raises `ArgumentError` if this integer's value doesn't fit a char's range
+  # (`0..0xd7ff` and `0xe000..0x10ffff`).
   #
   # ```
   # 97.chr # => 'a'
   # ```
   def chr
-    unless 0 <= self <= Char::MAX_CODEPOINT
+    unless 0 <= self <= 0xd7ff || 0xe000 <= self <= Char::MAX_CODEPOINT
       raise ArgumentError.new("#{self} out of char range")
     end
     unsafe_chr
diff --git a/src/primitives.cr b/src/primitives.cr
index 246ff09714c4..ffe43126eaf9 100644
--- a/src/primitives.cr
+++ b/src/primitives.cr
@@ -316,7 +316,7 @@ end
     struct {{int.id}}
       # Returns a `Char` that has the unicode codepoint of `self`,
       # without checking if this integer is in the range valid for
-      # chars (`0..0x10ffff`).
+      # chars (`0..0xd7ff` and `0xe000..0x10ffff`).
       #
       # You should never use this method unless `chr` turns out to
       # be a bottleneck.
diff --git a/src/string.cr b/src/string.cr
index d0e5ba852b32..4a4c1bc0dca0 100644
--- a/src/string.cr
+++ b/src/string.cr
@@ -710,18 +710,18 @@ class String
     unless v.finite?
       startptr = to_unsafe
       if whitespace
-        while startptr.value.chr.ascii_whitespace?
+        while startptr.value.unsafe_chr.ascii_whitespace?
           startptr += 1
         end
       end
-      if startptr.value.chr.in?('+', '-')
+      if startptr.value.unsafe_chr.in?('+', '-')
         startptr += 1
       end
 
       if v.nan?
-        return unless startptr.value.chr.in?('n', 'N')
+        return unless startptr.value.unsafe_chr.in?('n', 'N')
       else
-        return unless startptr.value.chr.in?('i', 'I')
+        return unless startptr.value.unsafe_chr.in?('i', 'I')
       end
     end
 
@@ -732,7 +732,7 @@ class String
 
     if strict
       if whitespace
-        while endptr < string_end && endptr.value.chr.ascii_whitespace?
+        while endptr < string_end && endptr.value.unsafe_chr.ascii_whitespace?
           endptr += 1
         end
       end
@@ -741,7 +741,7 @@ class String
     else
       ptr = to_unsafe
       if whitespace
-        while ptr < string_end && ptr.value.chr.ascii_whitespace?
+        while ptr < string_end && ptr.value.unsafe_chr.ascii_whitespace?
           ptr += 1
         end
       end

From a8546541d381187e9793d63c24cb4ead78e440e5 Mon Sep 17 00:00:00 2001
From: Quinton Miller <nicetas.c@gmail.com>
Date: Sat, 27 Feb 2021 22:12:29 +0800
Subject: [PATCH 2/3] Int#chr uses base 16 in error message

---
 spec/std/int_spec.cr | 6 +++---
 src/int.cr           | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spec/std/int_spec.cr b/spec/std/int_spec.cr
index 8e97b973623d..fea087630c7c 100644
--- a/spec/std/int_spec.cr
+++ b/spec/std/int_spec.cr
@@ -773,15 +773,15 @@ describe "Int" do
   it "#chr" do
     65.chr.should eq('A')
 
-    expect_raises(ArgumentError, "#{0x10ffff + 1} out of char range") do
+    expect_raises(ArgumentError, "0x110000 out of char range") do
       (0x10ffff + 1).chr
     end
 
-    expect_raises(ArgumentError, "#{0xd800} out of char range") do
+    expect_raises(ArgumentError, "0xd800 out of char range") do
       0xd800.chr
     end
 
-    expect_raises(ArgumentError, "#{0xdfff} out of char range") do
+    expect_raises(ArgumentError, "0xdfff out of char range") do
       0xdfff.chr
     end
   end
diff --git a/src/int.cr b/src/int.cr
index e007237b57b4..0e5d538ab41c 100644
--- a/src/int.cr
+++ b/src/int.cr
@@ -70,7 +70,7 @@ struct Int
   # ```
   def chr
     unless 0 <= self <= 0xd7ff || 0xe000 <= self <= Char::MAX_CODEPOINT
-      raise ArgumentError.new("#{self} out of char range")
+      raise ArgumentError.new("0x#{self.to_s(16)} out of char range")
     end
     unsafe_chr
   end

From a54ebdd66d32cc06e2b9882f6ab8fc5aaad3cbd6 Mon Sep 17 00:00:00 2001
From: Quinton Miller <nicetas.c@gmail.com>
Date: Sat, 27 Feb 2021 22:19:22 +0800
Subject: [PATCH 3/3] Don't use surrogates in string literal

---
 spec/std/string/utf16_spec.cr | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr
index 912772ea050b..49141c79ddd2 100644
--- a/spec/std/string/utf16_spec.cr
+++ b/spec/std/string/utf16_spec.cr
@@ -27,7 +27,7 @@ describe "String UTF16" do
     end
 
     it "in the range U+D800..U+DFFF" do
-      encoded = "\u{D800}\u{DFFF}".to_utf16
+      encoded = String.new(Bytes[0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF]).to_utf16
       encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16])
       encoded.unsafe_fetch(encoded.size).should eq 0_u16
     end