crystal-lang · straight-shoota · Aug 18, 2022 · Jun 21, 2022 · Aug 2, 2022 · Aug 2, 2022
diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
@@ -2848,7 +2848,67 @@ describe "String" do
 
     it "valid_encoding?" do
       "hello".valid_encoding?.should be_true
-      String.new(Bytes[255, 0]).valid_encoding?.should be_false
+      "hello\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}".valid_encoding?.should be_true
+
+      # non-starters: continuation bytes cannot begin a sequence
+      String.new(Bytes[0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0x8F]).valid_encoding?.should be_false
+      String.new(Bytes[0x90]).valid_encoding?.should be_false
+      String.new(Bytes[0x9F]).valid_encoding?.should be_false
+      String.new(Bytes[0xA0]).valid_encoding?.should be_false
+      String.new(Bytes[0xAF]).valid_encoding?.should be_false
+
+      # incomplete, 2-byte: lead byte with a missing or non-continuation second byte
+      String.new(Bytes[0xC2]).valid_encoding?.should be_false
+      String.new(Bytes[0xC2, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xC2, 0xC2]).valid_encoding?.should be_false
+
+      # overlong, 2-byte: lead bytes 0xC0 and 0xC1 are rejected; 0xC2 0x80 (U+0080) is the first valid sequence
+      String.new(Bytes[0xC0, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xC1, 0xBF]).valid_encoding?.should be_false
+      String.new(Bytes[0xC2, 0x80]).valid_encoding?.should be_true
+
+      # incomplete, 3-byte: truncated, or a non-continuation byte where a continuation byte is required
+      String.new(Bytes[0xE1]).valid_encoding?.should be_false
+      String.new(Bytes[0xE1, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xE1, 0xC2]).valid_encoding?.should be_false
+      String.new(Bytes[0xE1, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xE1, 0x80, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xE1, 0x80, 0xC2]).valid_encoding?.should be_false
+
+      # overlong, 3-byte: after 0xE0 the second byte must be 0xA0..0xBF; 0xE0 0xA0 0x80 (U+0800) is the first valid sequence
+      String.new(Bytes[0xE0, 0x80, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xE0, 0x9F, 0xBF]).valid_encoding?.should be_false
+      String.new(Bytes[0xE0, 0xA0, 0x80]).valid_encoding?.should be_true
+
+      # surrogates (U+D800..U+DFFF): after 0xED the second byte must be 0x80..0x9F
+      String.new(Bytes[0xED, 0x9F, 0xBF]).valid_encoding?.should be_true
+      String.new(Bytes[0xED, 0xA0, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xED, 0xBF, 0xBF]).valid_encoding?.should be_false
+      String.new(Bytes[0xEE, 0x80, 0x80]).valid_encoding?.should be_true
+
+      # incomplete, 4-byte: truncated, or a non-continuation byte where a continuation byte is required
+      String.new(Bytes[0xF1]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0xC2]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80, 0xC2]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80, 0x80, 0x00]).valid_encoding?.should be_false
+      String.new(Bytes[0xF1, 0x80, 0x80, 0xC2]).valid_encoding?.should be_false
+
+      # overlong, 4-byte: after 0xF0 the second byte must be 0x90..0xBF; 0xF0 0x90 0x80 0x80 (U+10000) is the first valid sequence
+      String.new(Bytes[0xF0, 0x80, 0x80, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xF0, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_false
+      String.new(Bytes[0xF0, 0x90, 0x80, 0x80]).valid_encoding?.should be_true
+
+      # upper boundary, 4-byte: 0xF4 0x8F 0xBF 0xBF (U+10FFFF) is the last valid sequence; bytes 0xF5..0xFF are never valid
+      String.new(Bytes[0xF4, 0x8F, 0xBF, 0xBF]).valid_encoding?.should be_true
+      String.new(Bytes[0xF4, 0x90, 0x80, 0x80]).valid_encoding?.should be_false
+      String.new(Bytes[0xF5]).valid_encoding?.should be_false
+      String.new(Bytes[0xF8]).valid_encoding?.should be_false
+      String.new(Bytes[0xFF]).valid_encoding?.should be_false
     end
 
     it "scrubs" do

diff --git a/src/string.cr b/src/string.cr
@@ -4946,12 +4946,7 @@ class String
   # Returns `true` if this String is encoded correctly
   # according to the UTF-8 encoding.
   def valid_encoding? : Bool
-    reader = Char::Reader.new(self)
-    while reader.has_next?
-      return false if reader.error
-      reader.next_char
-    end
-    true
+    Unicode.valid?(to_slice) # validates the raw bytes without decoding individual characters
   end
 
   # Returns a String where bytes that are invalid in the

diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr
@@ -23,6 +23,109 @@ module Unicode
     Fold
   end
 
+  private UNROLL = 64 # bytes consumed per iteration of the unrolled loop in `.valid?`
+
+  # :nodoc:
+  # Returns whether the given *bytes* refer to a correctly encoded UTF-8 string.
+  #
+  # The implementation here uses a shift-based DFA based on
+  # https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725.
+  # This loop is very tight and bypasses `Char::Reader` completely. The downside
+  # is that it does not compute anything else, such as the code points
+  # themselves or their count, because the required handling for invalid byte
+  # sequences would significantly slow down the loop.
+  def self.valid?(bytes : Bytes) : Bool
+    state = 0_u64 # low 6 bits hold the current DFA state * 6; 0 is the accepting state
+    table = UTF8_ENCODING_DFA.to_unsafe
+    s = bytes.to_unsafe # read cursor
+    e = s + bytes.size  # one past the last byte
+
+    while s + UNROLL <= e # full UNROLL-byte chunks, with no per-byte error check
+      {% for i in 0...UNROLL %}
+        state = table[s[{{ i }}]].unsafe_shr(state & 0x3F)
+      {% end %}
+      return false if state & 0x3F == 6 # error state (1 * 6) is absorbing, so one check per chunk is enough
+      s += UNROLL
+    end
+
+    while s < e # remaining tail, fewer than UNROLL bytes
+      state = table[s.value].unsafe_shr(state & 0x3F) # shift the byte's entry so the next state lands in the low 6 bits
+      return false if state & 0x3F == 6
+      s += 1
+    end
+
+    state & 0x3F == 0 # valid only if no multi-byte sequence is left unfinished
+  end
+
+  private UTF8_ENCODING_DFA = begin
+    x = Array(UInt64).new(256) # one packed entry per byte value, appended in byte order below
+
+    # The DFA transition table built below, with the error state (1) and the never-valid bytes (0xC0..0xC1, 0xF5..0xFF) hidden.
+    # Rows are input bytes, columns are the current state, cells are the next state (`_` = error state):
+    #              accepted (initial state)
+    #              | 1 continuation byte left
+    #              | | 2 continuation bytes left
+    #              | | | E0-?? ??; disallow overlong encodings up to U+07FF
+    #              | | | | ED-?? ??; disallow surrogates (U+D800..U+DFFF)
+    #              | | | | | F0-?? ?? ??; disallow overlong encodings up to U+FFFF
+    #              | | | | | | 3 continuation bytes left
+    #              | | | | | | | F4-?? ?? ??; disallow code points above U+10FFFF
+    #              v v v v v v v v
+    #
+    #            | 0 2 3 4 5 6 7 8
+    # -----------+----------------
+    # 0x00..0x7F | 0 _ _ _ _ _ _ _
+    # 0x80..0x8F | _ 0 2 _ 2 _ 3 3
+    # 0x90..0x9F | _ 0 2 _ 2 3 3 _
+    # 0xA0..0xBF | _ 0 2 2 _ 3 3 _
+    # 0xC2..0xDF | 2 _ _ _ _ _ _ _
+    # 0xE0..0xE0 | 4 _ _ _ _ _ _ _
+    # 0xE1..0xEC | 3 _ _ _ _ _ _ _
+    # 0xED..0xED | 5 _ _ _ _ _ _ _
+    # 0xEE..0xEF | 3 _ _ _ _ _ _ _
+    # 0xF0..0xF0 | 6 _ _ _ _ _ _ _
+    # 0xF1..0xF3 | 7 _ _ _ _ _ _ _
+    # 0xF4..0xF4 | 8 _ _ _ _ _ _ _
+
+    {% for ch in 0x00..0x7F %} put1(x, dfa_state(0, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0x80..0x8F %} put1(x, dfa_state(1, 1, 0, 2, 1, 2, 1, 3, 3)); {% end %}
+    {% for ch in 0x90..0x9F %} put1(x, dfa_state(1, 1, 0, 2, 1, 2, 3, 3, 1)); {% end %}
+    {% for ch in 0xA0..0xBF %} put1(x, dfa_state(1, 1, 0, 2, 2, 1, 3, 3, 1)); {% end %}
+    {% for ch in 0xC0..0xC1 %} put1(x, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xC2..0xDF %} put1(x, dfa_state(2, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xE0..0xE0 %} put1(x, dfa_state(4, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xE1..0xEC %} put1(x, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xED..0xED %} put1(x, dfa_state(5, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xEE..0xEF %} put1(x, dfa_state(3, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF0..0xF0 %} put1(x, dfa_state(6, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF1..0xF3 %} put1(x, dfa_state(7, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF4..0xF4 %} put1(x, dfa_state(8, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+    {% for ch in 0xF5..0xFF %} put1(x, dfa_state(1, 1, 1, 1, 1, 1, 1, 1, 1)); {% end %}
+
+    x
+  end
+
+  private def self.put1(array : Array, value) : Nil
+    array << value # called once per byte value, in ascending order, while building UTF8_ENCODING_DFA
+  end
+
+  # Packs *transitions* into one UInt64: bits 6*i..6*i+5 hold 6 * (next state from state i). TODO: remove the workaround for 1.0.0 eventually (needed until #10713)
+  private macro dfa_state(*transitions)
+    {% if compare_versions(Crystal::VERSION, "1.1.0") >= 0 %}
+      {% x = 0_u64 %}
+      {% for tr, i in transitions %}
+        {% x |= (1_u64 << (i * 6)) * tr * 6 %}
+      {% end %}
+      {{ x }}
+    {% else %}
+      {% x = [] of Nil %}
+      {% for tr, i in transitions %}
+        {% x << "(#{tr * 6}_u64 << #{i * 6})" %}
+      {% end %}
+      {{ x.join(" | ").id }}
+    {% end %}
+  end
+
   # :nodoc:
   def self.upcase(char : Char, options : CaseOptions) : Char
     result = check_upcase_ascii(char, options)