From 99ea035944966388021652ed64ba791565960df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= <straightshoota@gmail.com> Date: Mon, 3 Jan 2022 21:40:35 +0100 Subject: [PATCH] Revert "Restrict identifier grammar" (#11687) --- scripts/generate_unicode_data.cr | 2 +- spec/compiler/crystal/tools/format_spec.cr | 6 +++--- spec/compiler/lexer/lexer_spec.cr | 17 ++------------- spec/compiler/parser/parser_spec.cr | 13 ----------- src/compiler/crystal/syntax/lexer.cr | 14 +++--------- src/unicode/data.cr | 14 ------------ src/unicode/unicode.cr | 25 ---------------------- 7 files changed, 9 insertions(+), 82 deletions(-) diff --git a/scripts/generate_unicode_data.cr b/scripts/generate_unicode_data.cr index b46578e820f6..cc04c816dcfc 100644 --- a/scripts/generate_unicode_data.cr +++ b/scripts/generate_unicode_data.cr @@ -220,7 +220,7 @@ alternate_ranges = alternate_ranges(downcase_one_ranges) casefold_ranges = case_ranges entries, &.casefold all_strides = {} of String => Array(Stride) -categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Zs Zl Zp Cc Cf Cs Co Cn) +categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn) categories.each do |category| all_strides[category] = strides entries, category, &.general_category diff --git a/spec/compiler/crystal/tools/format_spec.cr b/spec/compiler/crystal/tools/format_spec.cr index a19d5bc760ce..d0b37e866e29 100644 --- a/spec/compiler/crystal/tools/format_spec.cr +++ b/spec/compiler/crystal/tools/format_spec.cr @@ -57,7 +57,7 @@ describe Crystal::Command::FormatCommand do format_command.run format_command.status_code.should eq(1) stdout.to_s.empty?.should be_true - stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8") + stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8") end it "formats stdin (bug)" do @@ -162,7 +162,7 @@ describe Crystal::Command::FormatCommand do format_command.status_code.should eq(1) stdout.to_s.should contain("Format #{Path[".", "format.cr"]}") stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF") - stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8") + stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8") File.read(File.join(path, "format.cr")).should eq("if true\n 1\nend\n") end @@ -226,7 +226,7 @@ describe Crystal::Command::FormatCommand do stderr.to_s.should_not contain("not_format.cr") stderr.to_s.should contain("formatting '#{Path[".", "format.cr"]}' produced changes") stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF") - stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8") + stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8") end end end diff --git a/spec/compiler/lexer/lexer_spec.cr b/spec/compiler/lexer/lexer_spec.cr index 7c1725048c66..5ab23ba2be59 100644 --- a/spec/compiler/lexer/lexer_spec.cr +++ b/spec/compiler/lexer/lexer_spec.cr @@ -150,21 +150,8 @@ describe "Lexer" do :pointerof, :sizeof, :instance_sizeof, :offsetof, :as, :as?, :typeof, :for, :in, :with, :self, :super, :private, :protected, :asm, :uninitialized, :nil?, :annotation, :verbatim] - it_lexes_idents ["ident", "something", "with_underscores", "_start_underscore", "with_1", "foo?", "bar!", "fooBar"] - it_lexes_idents [ - "ä", # L - "a\u0300", # Mn - "aः", # Mc - "a٠", # Nd - "a_", # Pc - "aⅧ", # Nl - ] - - assert_syntax_error "\u200B", "unknown token: '\\u200B'" - assert_syntax_error "ident\u200B", "unknown token: '\\u200B'" - assert_syntax_error ":\u200B", %(unexpected token: ":") - assert_syntax_error ":ident\u200B", "unknown token: '\\u200B'" - + it_lexes_idents ["ident", "something", "with_underscores", "with_1", "foo?", "bar!", "fooBar", + "❨╯°□°❩╯︵┻━┻"] it_lexes_idents ["def?", "if?", "else?", "elsif?", "end?", "true?", "false?", "class?", "while?", "do?", "yield?", "return?", "unless?", "next?", "break?", "begin?"] it_lexes_idents ["def!", "if!", "else!", "elsif!", "end!", "true!", "false!", "class!", "while!", diff --git a/spec/compiler/parser/parser_spec.cr b/spec/compiler/parser/parser_spec.cr index 0dcc3594879f..e419e31cd3db 100644 --- a/spec/compiler/parser/parser_spec.cr +++ b/spec/compiler/parser/parser_spec.cr @@ -156,19 +156,6 @@ module Crystal it_parses "a = 1", Assign.new("a".var, 1.int32) it_parses "a = b = 2", Assign.new("a".var, Assign.new("b".var, 2.int32)) - # check control characters: They're allowed inside literals, but not in identifiers. - ['\u200B', '\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char| - it_parses %('#{char}'), CharLiteral.new(char) - assert_syntax_error %(ident#{char}), "unknown token: #{char.inspect}" - it_parses %("#{char}"), StringLiteral.new("#{char}") - it_parses %(%w(#{char})), ArrayLiteral.new([StringLiteral.new "#{char}"] of ASTNode, of: Path.new("String", global: true)) - assert_syntax_error %(:#{char}), %(unexpected token: ":") - it_parses %(:"#{char}"), SymbolLiteral.new "#{char}" - it_parses %(%i(#{char})), ArrayLiteral.new([SymbolLiteral.new "#{char}"] of ASTNode, of: Path.new("Symbol", global: true)) - it_parses %(##{char}), Nop.new - it_parses %(macro foo\n##{char}\nend), Macro.new("foo", body: MacroLiteral.new("##{char}\n")) - end - it_parses "a, b = 1, 2", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32, 2.int32] of ASTNode) it_parses "a, b = 1", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32] of ASTNode) it_parses "_, _ = 1, 2", MultiAssign.new([Underscore.new, Underscore.new] of ASTNode, [1.int32, 2.int32] of ASTNode) diff --git a/src/compiler/crystal/syntax/lexer.cr b/src/compiler/crystal/syntax/lexer.cr index bb10d3a5637f..8f0ae59f55ab 100644 --- a/src/compiler/crystal/syntax/lexer.cr +++ b/src/compiler/crystal/syntax/lexer.cr @@ -54,11 +54,6 @@ module Crystal def initialize(string, string_pool : StringPool? = nil) @reader = Char::Reader.new(string) - - if error = @reader.error - ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8") - end - @token = Token.new @temp_token = Token.new @line_number = 1 @@ -2758,7 +2753,7 @@ module Crystal def next_char_no_column_increment char = @reader.next_char if error = @reader.error - ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8") + ::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16)} at position #{@reader.pos}, malformed UTF-8") end char end @@ -2860,14 +2855,11 @@ module Crystal end def self.ident_start?(char) - char.letter? || char == '_' + char.ascii_letter? || char == '_' || char.ord > 0x9F end def self.ident_part?(char) - ident_start?(char) || - Unicode.mark_nonspacing?(char) || Unicode.mark_spacing_combining?(char) || - Unicode.number_digit?(char) || Unicode.number_letter?(char) || - Unicode.punctuation_connector?(char) + ident_start?(char) || char.ascii_number? end def self.ident?(name) diff --git a/src/unicode/data.cr b/src/unicode/data.cr index ce4cf6128cf1..2dae0885d8e7 100644 --- a/src/unicode/data.cr +++ b/src/unicode/data.cr @@ -1922,20 +1922,6 @@ module Unicode end end - @@category_Pc : Array({Int32, Int32, Int32})? - - private def self.category_Pc - @@category_Pc ||= begin - data = Array({Int32, Int32, Int32}).new(5) - put(data, 95, 8255, 8160) - put(data, 8256, 8276, 20) - put(data, 65075, 65076, 1) - put(data, 65101, 65103, 1) - put(data, 65343, 65343, 1) - data - end - end - @@category_Zs : Array({Int32, Int32, Int32})? private def self.category_Zs diff --git a/src/unicode/unicode.cr b/src/unicode/unicode.cr index 7dcac3691f8c..fde457404f49 100644 --- a/src/unicode/unicode.cr +++ b/src/unicode/unicode.cr @@ -197,16 +197,6 @@ module Unicode in_any_category?(char.ord, category_Nd, category_Nl, category_No) end - # :nodoc: - def self.number_digit?(char : Char) : Bool - in_any_category?(char.ord, category_Nd) - end - - # :nodoc: - def self.number_letter?(char : Char) : Bool - in_any_category?(char.ord, category_Nl) - end - # :nodoc: def self.control?(char : Char) : Bool in_any_category?(char.ord, category_Cs, category_Co, category_Cn, category_Cf, category_Cc) @@ -217,26 +207,11 @@ module Unicode in_any_category?(char.ord, category_Zs, category_Zl, category_Zp) end - # :nodoc: - def self.punctuation_connector?(char : Char) : Bool - in_any_category?(char.ord, category_Pc) - end - # :nodoc: def self.mark?(char : Char) : Bool in_any_category?(char.ord, category_Mn, category_Me, category_Mc) end - # :nodoc: - def self.mark_nonspacing?(char : Char) : Bool - in_any_category?(char.ord, category_Mn) - end - - # :nodoc: - def self.mark_spacing_combining?(char : Char) : Bool - in_any_category?(char.ord, category_Mc) - end - private def self.search_ranges(haystack, needle) value = haystack.bsearch { |low, high, delta| needle <= high } if value && value[0] <= needle <= value[1]