From 99ea035944966388021652ed64ba791565960df2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20M=C3=BCller?= <straightshoota@gmail.com>
Date: Mon, 3 Jan 2022 21:40:35 +0100
Subject: [PATCH] Revert "Restrict identifier grammar" (#11687)

---
 scripts/generate_unicode_data.cr           |  2 +-
 spec/compiler/crystal/tools/format_spec.cr |  6 +++---
 spec/compiler/lexer/lexer_spec.cr          | 17 ++-------------
 spec/compiler/parser/parser_spec.cr        | 13 -----------
 src/compiler/crystal/syntax/lexer.cr       | 14 +++---------
 src/unicode/data.cr                        | 14 ------------
 src/unicode/unicode.cr                     | 25 ----------------------
 7 files changed, 9 insertions(+), 82 deletions(-)

diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr
index b46578e820f6..cc04c816dcfc 100644
--- a/scripts/generate_unicode_data.cr
+++ b/scripts/generate_unicode_data.cr
@@ -220,7 +220,7 @@ alternate_ranges = alternate_ranges(downcase_one_ranges)
 casefold_ranges = case_ranges entries, &.casefold
 
 all_strides = {} of String => Array(Stride)
-categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Zs Zl Zp Cc Cf Cs Co Cn)
+categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)
 
 categories.each do |category|
   all_strides[category] = strides entries, category, &.general_category
diff --git a/spec/compiler/crystal/tools/format_spec.cr b/spec/compiler/crystal/tools/format_spec.cr
index a19d5bc760ce..d0b37e866e29 100644
--- a/spec/compiler/crystal/tools/format_spec.cr
+++ b/spec/compiler/crystal/tools/format_spec.cr
@@ -57,7 +57,7 @@ describe Crystal::Command::FormatCommand do
     format_command.run
     format_command.status_code.should eq(1)
     stdout.to_s.empty?.should be_true
-    stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")
+    stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
   end
 
   it "formats stdin (bug)" do
@@ -162,7 +162,7 @@ describe Crystal::Command::FormatCommand do
         format_command.status_code.should eq(1)
         stdout.to_s.should contain("Format #{Path[".", "format.cr"]}")
         stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF")
-        stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")
+        stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
 
         File.read(File.join(path, "format.cr")).should eq("if true\n  1\nend\n")
       end
@@ -226,7 +226,7 @@ describe Crystal::Command::FormatCommand do
         stderr.to_s.should_not contain("not_format.cr")
         stderr.to_s.should contain("formatting '#{Path[".", "format.cr"]}' produced changes")
         stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF")
-        stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")
+        stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
       end
     end
   end
diff --git a/spec/compiler/lexer/lexer_spec.cr b/spec/compiler/lexer/lexer_spec.cr
index 7c1725048c66..5ab23ba2be59 100644
--- a/spec/compiler/lexer/lexer_spec.cr
+++ b/spec/compiler/lexer/lexer_spec.cr
@@ -150,21 +150,8 @@ describe "Lexer" do
                      :pointerof, :sizeof, :instance_sizeof, :offsetof, :as, :as?, :typeof, :for, :in,
                      :with, :self, :super, :private, :protected, :asm, :uninitialized, :nil?,
                      :annotation, :verbatim]
-  it_lexes_idents ["ident", "something", "with_underscores", "_start_underscore", "with_1", "foo?", "bar!", "fooBar"]
-  it_lexes_idents [
-    "ä",       # L
-    "a\u0300", # Mn
-    "aः",      # Mc
-    "a٠",      # Nd
-    "a_",      # Pc
-    "aⅧ",      # Nl
-  ]
-
-  assert_syntax_error "\u200B", "unknown token: '\\u200B'"
-  assert_syntax_error "ident\u200B", "unknown token: '\\u200B'"
-  assert_syntax_error ":\u200B", %(unexpected token: ":")
-  assert_syntax_error ":ident\u200B", "unknown token: '\\u200B'"
-
+  it_lexes_idents ["ident", "something", "with_underscores", "with_1", "foo?", "bar!", "fooBar",
+                   "❨╯°□°❩╯︵┻━┻"]
   it_lexes_idents ["def?", "if?", "else?", "elsif?", "end?", "true?", "false?", "class?", "while?",
                    "do?", "yield?", "return?", "unless?", "next?", "break?", "begin?"]
   it_lexes_idents ["def!", "if!", "else!", "elsif!", "end!", "true!", "false!", "class!", "while!",
diff --git a/spec/compiler/parser/parser_spec.cr b/spec/compiler/parser/parser_spec.cr
index 0dcc3594879f..e419e31cd3db 100644
--- a/spec/compiler/parser/parser_spec.cr
+++ b/spec/compiler/parser/parser_spec.cr
@@ -156,19 +156,6 @@ module Crystal
     it_parses "a = 1", Assign.new("a".var, 1.int32)
     it_parses "a = b = 2", Assign.new("a".var, Assign.new("b".var, 2.int32))
 
-    # check control characters: They're allowed inside literals, but not in identifiers.
-    ['\u200B', '\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char|
-      it_parses %('#{char}'), CharLiteral.new(char)
-      assert_syntax_error %(ident#{char}), "unknown token: #{char.inspect}"
-      it_parses %("#{char}"), StringLiteral.new("#{char}")
-      it_parses %(%w(#{char})), ArrayLiteral.new([StringLiteral.new "#{char}"] of ASTNode, of: Path.new("String", global: true))
-      assert_syntax_error %(:#{char}), %(unexpected token: ":")
-      it_parses %(:"#{char}"), SymbolLiteral.new "#{char}"
-      it_parses %(%i(#{char})), ArrayLiteral.new([SymbolLiteral.new "#{char}"] of ASTNode, of: Path.new("Symbol", global: true))
-      it_parses %(##{char}), Nop.new
-      it_parses %(macro foo\n##{char}\nend), Macro.new("foo", body: MacroLiteral.new("##{char}\n"))
-    end
-
     it_parses "a, b = 1, 2", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32, 2.int32] of ASTNode)
     it_parses "a, b = 1", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32] of ASTNode)
     it_parses "_, _ = 1, 2", MultiAssign.new([Underscore.new, Underscore.new] of ASTNode, [1.int32, 2.int32] of ASTNode)
diff --git a/src/compiler/crystal/syntax/lexer.cr b/src/compiler/crystal/syntax/lexer.cr
index bb10d3a5637f..8f0ae59f55ab 100644
--- a/src/compiler/crystal/syntax/lexer.cr
+++ b/src/compiler/crystal/syntax/lexer.cr
@@ -54,11 +54,6 @@ module Crystal
 
     def initialize(string, string_pool : StringPool? = nil)
       @reader = Char::Reader.new(string)
-
-      if error = @reader.error
-        ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8")
-      end
-
       @token = Token.new
       @temp_token = Token.new
       @line_number = 1
@@ -2758,7 +2753,7 @@ module Crystal
     def next_char_no_column_increment
       char = @reader.next_char
       if error = @reader.error
-        ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8")
+        ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16)} at position #{@reader.pos}, malformed UTF-8")
       end
       char
     end
@@ -2860,14 +2855,11 @@ module Crystal
     end
 
     def self.ident_start?(char)
-      char.letter? || char == '_'
+      char.ascii_letter? || char == '_' || char.ord > 0x9F
     end
 
     def self.ident_part?(char)
-      ident_start?(char) ||
-        Unicode.mark_nonspacing?(char) || Unicode.mark_spacing_combining?(char) ||
-        Unicode.number_digit?(char) || Unicode.number_letter?(char) ||
-        Unicode.punctuation_connector?(char)
+      ident_start?(char) || char.ascii_number?
     end
 
     def self.ident?(name)
diff --git a/src/unicode/data.cr b/src/unicode/data.cr
index ce4cf6128cf1..2dae0885d8e7 100644
--- a/src/unicode/data.cr
+++ b/src/unicode/data.cr
@@ -1922,20 +1922,6 @@ module Unicode
     end
   end
 
-  @@category_Pc : Array({Int32, Int32, Int32})?
-
-  private def self.category_Pc
-    @@category_Pc ||= begin
-      data = Array({Int32, Int32, Int32}).new(5)
-      put(data, 95, 8255, 8160)
-      put(data, 8256, 8276, 20)
-      put(data, 65075, 65076, 1)
-      put(data, 65101, 65103, 1)
-      put(data, 65343, 65343, 1)
-      data
-    end
-  end
-
   @@category_Zs : Array({Int32, Int32, Int32})?
 
   private def self.category_Zs
diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr
index 7dcac3691f8c..fde457404f49 100644
--- a/src/unicode/unicode.cr
+++ b/src/unicode/unicode.cr
@@ -197,16 +197,6 @@ module Unicode
     in_any_category?(char.ord, category_Nd, category_Nl, category_No)
   end
 
-  # :nodoc:
-  def self.number_digit?(char : Char) : Bool
-    in_any_category?(char.ord, category_Nd)
-  end
-
-  # :nodoc:
-  def self.number_letter?(char : Char) : Bool
-    in_any_category?(char.ord, category_Nl)
-  end
-
   # :nodoc:
   def self.control?(char : Char) : Bool
     in_any_category?(char.ord, category_Cs, category_Co, category_Cn, category_Cf, category_Cc)
@@ -217,26 +207,11 @@ module Unicode
     in_any_category?(char.ord, category_Zs, category_Zl, category_Zp)
   end
 
-  # :nodoc:
-  def self.punctuation_connector?(char : Char) : Bool
-    in_any_category?(char.ord, category_Pc)
-  end
-
   # :nodoc:
   def self.mark?(char : Char) : Bool
     in_any_category?(char.ord, category_Mn, category_Me, category_Mc)
   end
 
-  # :nodoc:
-  def self.mark_nonspacing?(char : Char) : Bool
-    in_any_category?(char.ord, category_Mn)
-  end
-
-  # :nodoc:
-  def self.mark_spacing_combining?(char : Char) : Bool
-    in_any_category?(char.ord, category_Mc)
-  end
-
   private def self.search_ranges(haystack, needle)
     value = haystack.bsearch { |low, high, delta| needle <= high }
     if value && value[0] <= needle <= value[1]