Skip to content

Commit

Permalink
Restrict identifier grammar (#11508)
Browse files Browse the repository at this point in the history
straight-shoota authored Dec 18, 2021
1 parent 10ea053 commit 1653cf3
Showing 7 changed files with 82 additions and 9 deletions.
2 changes: 1 addition & 1 deletion scripts/generate_unicode_data.cr
Original file line number Diff line number Diff line change
@@ -220,7 +220,7 @@ alternate_ranges = alternate_ranges(downcase_one_ranges)
casefold_ranges = case_ranges entries, &.casefold

all_strides = {} of String => Array(Stride)
categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Cn)
categories = %w(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Zs Zl Zp Cc Cf Cs Co Cn)

categories.each do |category|
all_strides[category] = strides entries, category, &.general_category
6 changes: 3 additions & 3 deletions spec/compiler/crystal/tools/format_spec.cr
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ describe Crystal::Command::FormatCommand do
format_command.run
format_command.status_code.should eq(1)
stdout.to_s.empty?.should be_true
stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
stderr.to_s.should contain("file 'STDIN' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")
end

it "formats stdin (bug)" do
@@ -162,7 +162,7 @@ describe Crystal::Command::FormatCommand do
format_command.status_code.should eq(1)
stdout.to_s.should contain("Format #{Path[".", "format.cr"]}")
stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF")
stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")

File.read(File.join(path, "format.cr")).should eq("if true\n 1\nend\n")
end
@@ -226,7 +226,7 @@ describe Crystal::Command::FormatCommand do
stderr.to_s.should_not contain("not_format.cr")
stderr.to_s.should contain("formatting '#{Path[".", "format.cr"]}' produced changes")
stderr.to_s.should contain("syntax error in '#{Path[".", "syntax_error.cr"]}:1:3': unexpected token: EOF")
stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xff at position 1, malformed UTF-8")
stderr.to_s.should contain("file '#{Path[".", "invalid_byte_sequence_error.cr"]}' is not a valid Crystal source file: Unexpected byte 0xFE at position 0, malformed UTF-8")
end
end
end
17 changes: 15 additions & 2 deletions spec/compiler/lexer/lexer_spec.cr
Original file line number Diff line number Diff line change
@@ -150,8 +150,21 @@ describe "Lexer" do
:pointerof, :sizeof, :instance_sizeof, :offsetof, :as, :as?, :typeof, :for, :in,
:with, :self, :super, :private, :protected, :asm, :uninitialized, :nil?,
:annotation, :verbatim]
it_lexes_idents ["ident", "something", "with_underscores", "with_1", "foo?", "bar!", "fooBar",
"❨╯°□°❩╯︵┻━┻"]
it_lexes_idents ["ident", "something", "with_underscores", "_start_underscore", "with_1", "foo?", "bar!", "fooBar"]
# One sample identifier per Unicode general category that is allowed inside an
# identifier. Apart from the first row (a lone letter), each sample is the letter
# "a" followed by one character of the category named in the trailing comment.
it_lexes_idents [
  "ä",       # L
  "a\u0300", # Mn
  "aः",      # Mc
  "a\u0660", # Nd (ARABIC-INDIC DIGIT ZERO, written as an escape so it survives re-encoding)
  "a_",      # Pc
  "aⅧ",      # Nl
]

assert_syntax_error "\u200B", "unknown token: '\\u200B'"
assert_syntax_error "ident\u200B", "unknown token: '\\u200B'"
assert_syntax_error ":\u200B", %(unexpected token: ":")
assert_syntax_error ":ident\u200B", "unknown token: '\\u200B'"

it_lexes_idents ["def?", "if?", "else?", "elsif?", "end?", "true?", "false?", "class?", "while?",
"do?", "yield?", "return?", "unless?", "next?", "break?", "begin?"]
it_lexes_idents ["def!", "if!", "else!", "elsif!", "end!", "true!", "false!", "class!", "while!",
13 changes: 13 additions & 0 deletions spec/compiler/parser/parser_spec.cr
Original file line number Diff line number Diff line change
@@ -156,6 +156,19 @@ module Crystal
it_parses "a = 1", Assign.new("a".var, 1.int32)
it_parses "a = b = 2", Assign.new("a".var, Assign.new("b".var, 2.int32))

# check control characters: They're allowed inside literals, but not in identifiers.
['\u200B', '\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char|
it_parses %('#{char}'), CharLiteral.new(char)
assert_syntax_error %(ident#{char}), "unknown token: #{char.inspect}"
it_parses %("#{char}"), StringLiteral.new("#{char}")
it_parses %(%w(#{char})), ArrayLiteral.new([StringLiteral.new "#{char}"] of ASTNode, of: Path.new("String", global: true))
assert_syntax_error %(:#{char}), %(unexpected token: ":")
it_parses %(:"#{char}"), SymbolLiteral.new "#{char}"
it_parses %(%i(#{char})), ArrayLiteral.new([SymbolLiteral.new "#{char}"] of ASTNode, of: Path.new("Symbol", global: true))
it_parses %(##{char}), Nop.new
it_parses %(macro foo\n##{char}\nend), Macro.new("foo", body: MacroLiteral.new("##{char}\n"))
end

it_parses "a, b = 1, 2", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32, 2.int32] of ASTNode)
it_parses "a, b = 1", MultiAssign.new(["a".var, "b".var] of ASTNode, [1.int32] of ASTNode)
it_parses "_, _ = 1, 2", MultiAssign.new([Underscore.new, Underscore.new] of ASTNode, [1.int32, 2.int32] of ASTNode)
14 changes: 11 additions & 3 deletions src/compiler/crystal/syntax/lexer.cr
Original file line number Diff line number Diff line change
@@ -54,6 +54,11 @@ module Crystal

def initialize(string, string_pool : StringPool? = nil)
@reader = Char::Reader.new(string)

if error = @reader.error
::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8")
end

@token = Token.new
@temp_token = Token.new
@line_number = 1
@@ -2753,7 +2758,7 @@ module Crystal
# Advances `@reader` by one character and returns that character.
# Raises `InvalidByteSequenceError` when the reader reports an error for the
# byte it just consumed. The method name suggests column bookkeeping is left to
# the caller; that part is not visible in this hunk.
def next_char_no_column_increment
char = @reader.next_char
if error = @reader.error
# NOTE(review): flattened diff hunk. The first `::raise` below is the line removed
# by this commit (lowercase hex byte); the second is its replacement, which prints
# the byte in uppercase hex via `upcase: true`.
::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16)} at position #{@reader.pos}, malformed UTF-8")
::raise InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16, upcase: true)} at position #{@reader.pos}, malformed UTF-8")
end
char
end
@@ -2855,11 +2860,14 @@ module Crystal
end

# Returns whether *char* may be the first character of an identifier.
#
# NOTE(review): flattened diff hunk. The first expression is the rule removed by
# this commit (ASCII letter, '_' or any code point above 0x9F); the second
# expression is the rule that replaces it (`char.letter?` or '_').
def self.ident_start?(char)
char.ascii_letter? || char == '_' || char.ord > 0x9F
char.letter? || char == '_'
end

# Returns whether *char* may appear after the first character of an identifier.
# Everything accepted by `ident_start?` is accepted here as well.
#
# NOTE(review): flattened diff hunk. The single-line expression is the rule removed
# by this commit (start characters plus ASCII digits). The multi-line expression
# after it is the replacement: start characters plus anything matched by the
# `Unicode.mark_nonspacing?`, `mark_spacing_combining?`, `number_digit?`,
# `number_letter?` or `punctuation_connector?` helpers.
def self.ident_part?(char)
ident_start?(char) || char.ascii_number?
ident_start?(char) ||
Unicode.mark_nonspacing?(char) || Unicode.mark_spacing_combining?(char) ||
Unicode.number_digit?(char) || Unicode.number_letter?(char) ||
Unicode.punctuation_connector?(char)
end

def self.ident?(name)
14 changes: 14 additions & 0 deletions src/unicode/data.cr
Original file line number Diff line number Diff line change
@@ -1922,6 +1922,20 @@ module Unicode
end
end

# Memoized range table for the Pc category; stays nil until first requested.
@@category_Pc : Array({Int32, Int32, Int32})?

# Returns the Pc range table, building it on the first call and caching it in
# `@@category_Pc` for later calls.
#
# Each `put` call adds one three-integer entry. Presumably these are
# {low, high, delta}, i.e. every delta-th code point from low through high;
# `put` itself is not visible here, so confirm against its definition.
private def self.category_Pc
@@category_Pc ||= begin
# Initial capacity matches the five entries added below.
data = Array({Int32, Int32, Int32}).new(5)
put(data, 95, 8255, 8160) # U+005F and U+203F (8160 apart)
put(data, 8256, 8276, 20) # U+2040 and U+2054 (20 apart)
put(data, 65075, 65076, 1) # U+FE33..U+FE34
put(data, 65101, 65103, 1) # U+FE4D..U+FE4F
put(data, 65343, 65343, 1) # U+FF3F
data
end
end

@@category_Zs : Array({Int32, Int32, Int32})?

private def self.category_Zs
25 changes: 25 additions & 0 deletions src/unicode/unicode.cr
Original file line number Diff line number Diff line change
@@ -197,6 +197,16 @@ module Unicode
in_any_category?(char.ord, category_Nd, category_Nl, category_No)
end

# :nodoc:
#
# Checks the code point of *char* against the `category_Nd` table only.
def self.number_digit?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Nd)
end

# :nodoc:
#
# Checks the code point of *char* against the `category_Nl` table only.
def self.number_letter?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Nl)
end

# :nodoc:
def self.control?(char : Char) : Bool
in_any_category?(char.ord, category_Cs, category_Co, category_Cn, category_Cf, category_Cc)
@@ -207,11 +217,26 @@ module Unicode
in_any_category?(char.ord, category_Zs, category_Zl, category_Zp)
end

# :nodoc:
#
# Checks the code point of *char* against the `category_Pc` table only.
def self.punctuation_connector?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Pc)
end

# :nodoc:
#
# Checks the code point of *char* against the three mark tables:
# `category_Mn`, `category_Me` and `category_Mc`.
def self.mark?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Mn, category_Me, category_Mc)
end

# :nodoc:
#
# Checks the code point of *char* against the `category_Mn` table only.
def self.mark_nonspacing?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Mn)
end

# :nodoc:
#
# Checks the code point of *char* against the `category_Mc` table only.
def self.mark_spacing_combining?(char : Char) : Bool
  codepoint = char.ord
  in_any_category?(codepoint, category_Mc)
end

private def self.search_ranges(haystack, needle)
value = haystack.bsearch { |low, high, delta| needle <= high }
if value && value[0] <= needle <= value[1]

0 comments on commit 1653cf3

Please sign in to comment.