Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maint: Restrict empty-matching regexes #1548

Merged
merged 5 commits into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions lib/rouge/lexers/elm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ module exposing port
rule %r/"/, Str::Double, :pop!
end

# Multiple line string with tripple double quotes, e.g. """ multi """
# Multiple line string with triple double quotes, e.g. """ multi """
state :multiline_string do
rule %r/\s*"""/, Str, :pop!
rule %r/.*/, Str
rule %r/\s*/, Str
rule %r/\\"/, Str::Escape
rule %r/"""/, Str, :pop!
rule %r/[^"]+/, Str
rule %r/"/, Str
end

end
end
end
3 changes: 2 additions & 1 deletion lib/rouge/lexers/ghc_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ class GHCCore < RegexLexer
mixin :function

# rest is Text
# TODO: this is really inefficient
rule %r/\s/m, Text
rule %r/.*/, Text
rule %r/./, Text
end

state :expression do
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/graphql.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class GraphQL < RegexLexer
end

rule %r/\bfragment\b/, Keyword, :fragment_definition

rule %r/\bscalar\b/, Keyword, :value

rule %r/\b(?:type|interface|enum)\b/, Keyword, :type_definition
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/isbl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def self.keywords

state :dotted do
mixin :whitespace
rule %r/[a-zа-яё_0-9]*/i do |m|
rule %r/[a-zа-яё_0-9]+/i do |m|
name = m[0]
if self.class.constants.include? name.downcase
token Name::Builtin
Expand All @@ -56,7 +56,7 @@ def self.keywords

state :type do
mixin :whitespace
rule %r/[a-zа-яё_0-9]*/i do |m|
rule %r/[a-zа-яё_0-9]+/i do |m|
name = m[0]
if self.class.interfaces.include? name.downcase
token Keyword::Type
Expand Down
5 changes: 3 additions & 2 deletions lib/rouge/lexers/javascript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -263,9 +263,10 @@ def self.id_regex

# template strings
state :template_string do
rule %r/\${/, Punctuation, :template_string_expr
rule %r/[$]{/, Punctuation, :template_string_expr
rule %r/`/, Str::Double, :pop!
rule %r/(\\\\|\\[\$`]|[^\$`]|\$(?!{))*/, Str::Double
rule %r/\\[$`]/, Str::Escape
rule %r/[$]/, Str::Double
end

state :template_string_expr do
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/jsl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class JSL < RegexLexer
rule %r/\\![btrnNf0\\"]/, Str::Escape
rule %r/\\/, Str::Double
rule %r/"/, Str::Double, :pop!
rule %r/[^\\"]*/m, Str::Double
rule %r/[^\\"]+/m, Str::Double
end
end
end
Expand Down
7 changes: 4 additions & 3 deletions lib/rouge/lexers/jsonnet.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def self.builtins

state :string do
rule %r/\\([\\\/bfnrt]|(u[0-9a-fA-F]{4}))/, Str::Escape
rule %r/\\./, Str::Escape
end

state :string_double do
Expand All @@ -137,15 +138,15 @@ def self.builtins

state :string_single do
mixin :string
rule %r/\\'/, Str::Escape
rule %r/'/, Str, :pop!
rule %r/[^\\']+/, Str
end

state :string_block do
mixin :string
rule %r/\|\|\|/, Str, :pop!
rule %r/.*/, Str
rule %r/[|][|][|]/, Str, :pop!
rule %r/[^|\\]+/, Str
rule %r/[|]/, Str
end
end
end
Expand Down
5 changes: 2 additions & 3 deletions lib/rouge/lexers/jsp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,8 @@ def initialize(*)

state :jsp_interp_literal_start do
rule %r/'/, Literal, :pop!
rule %r/[^']*/, Literal
rule %r/[^']+/, Literal
end

end
end
end
end
7 changes: 4 additions & 3 deletions lib/rouge/lexers/kotlin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,10 @@ class Kotlin < RegexLexer
end

state :comment do
rule %r'\s*/[*].*', Comment::Multiline, :comment
rule %r'.*[*]/', Comment::Multiline, :pop!
rule %r'.*', Comment::Multiline
rule %r'/[*]', Comment::Multiline, :comment
rule %r'[*]/', Comment::Multiline, :pop!
rule %r'[^/*]+', Comment::Multiline
rule %r'[/*]', Comment::Multiline
end
end
end
Expand Down
1 change: 0 additions & 1 deletion lib/rouge/lexers/opentype_feature_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def self.keywords
state :strings do
rule %r/"/, Str, :pop!
rule %r/[^"%\n]+/, Str
rule %r/(\([a-z0-9_]+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?/i, Str
pyrmont marked this conversation as resolved.
Show resolved Hide resolved
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/q.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def self.builtins
end

state :bottom do
rule %r/.*\z/m, Comment::Multiline
rule %r/.+\z/m, Comment::Multiline
end
end
end
Expand Down
14 changes: 7 additions & 7 deletions lib/rouge/lexers/rego.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,29 @@ class Rego < RegexLexer
state :basic do
rule %r/\s+/, Text
rule %r/#.*/, Comment::Single

rule %r/[\[\](){}|.,;!]/, Punctuation

rule %r/"[^"]*"/, Str::Double

rule %r/-?\d+\.\d+([eE][+-]?\d+)?/, Num::Float
rule %r/-?\d+([eE][+-]?\d+)?/, Num

rule %r/\\u[0-9a-fA-F]{4}/, Num::Hex
rule %r/\\["\/bfnrt]/, Str::Escape
end

state :atoms do
rule %r/(true|false|null)/, Keyword::Constant
rule %r/[[:word:]]*/, Str::Symbol
rule %r/[[:word:]]+/, Str::Symbol
end

state :operators do
rule %r/(=|!=|>=|<=|>|<|\+|-|\*|%|\/|\||&|:=)/, Operator
rule %r/(default|not|package|import|as|with|else|some)/, Operator
rule %r/[\/:?@^~]+/, Operator
end

state :root do
mixin :basic
mixin :operators
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/ruby.rb
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def self.detect?(text)
end

state :regex_flags do
rule %r/[mixounse]*/, Str::Regex, :pop!
rule %r/[mixounse]+/, Str::Regex, :pop!
end

# double-quoted string and symbol
Expand Down Expand Up @@ -267,7 +267,7 @@ def self.detect?(text)
end

state :test_heredoc do
rule %r/[^#\\\n]*$/ do |m|
rule %r/[^#\\\n]+$/ do |m|
tolerant, heredoc_name = @heredoc_queue.first
check = tolerant ? m[0].strip : m[0].rstrip

Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/smarty.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def self.builtins
end


rule(/.*?(?={[\/a-zA-Z0-9$#*"'])|.*/m) { delegate parent }
rule(/.+?(?={[\/a-zA-Z0-9$#*"'])/m) { delegate parent }
rule(/.+/m) { delegate parent }
end

Expand Down
1 change: 0 additions & 1 deletion lib/rouge/lexers/wollok.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class Wollok < RegexLexer

state :whitespaces_and_comments do
rule %r/\s+/m, Text::Whitespace
rule %r/$+/m, Text::Whitespace
rule %r(//.*$), Comment::Single
rule %r(/\*(.|\s)*?\*/)m, Comment::Multiline
end
Expand Down
57 changes: 56 additions & 1 deletion lib/rouge/regex_lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@ module Rouge
# A stateful lexer that uses sets of regular expressions to
# tokenize a string. Most lexers are instances of RegexLexer.
class RegexLexer < Lexer
# Error describing a regex that matches the empty string but was given
# no predicate. The offending pattern is kept so it can be shown in the
# error text.
class InvalidRegex < StandardError
  # @param re [Regexp] the pattern being reported
  def initialize(re)
    @re = re
  end

  # @return [String] human-readable description including the pattern
  def to_s
    shown = @re.inspect
    "regex #{shown} matches empty string, but has no predicate!"
  end
end

# Error describing a state that can no longer accept rules because its
# most recently added rule will always match.
class ClosedState < StandardError
  attr_reader :state

  # @param state the state object; it must respond to `name` and `rules`
  def initialize(state)
    @state = state
  end

  # @return the last rule registered on the state
  def rule
    @state.rules.last
  end

  # @return [String] explanation naming the state and the blocking rule,
  #   plus a hint when the rule's regex source contains a `*`
  def to_s
    offender = rule
    summary = "State :#{state.name} cannot continue after #{offender.inspect}, which will always match."
    return summary unless offender.re.source.include?('*')

    summary + " Consider replacing * with +."
  end
end

# A rule is a tuple of a regular expression to test, and a callback
# to perform if the test succeeds.
#
Expand Down Expand Up @@ -42,12 +73,13 @@ def inspect
end

class StateDSL
attr_reader :rules
attr_reader :rules, :name
# Set up a fresh state definition with no rules yet.
#
# @param name the state's name
# @param defn the block given for this state; it is only stored here
def initialize(name, &defn)
@name = name
@defn = defn
# rules are appended by `rule`
@rules = []
# NOTE(review): presumably flipped once the definition block has been
# evaluated -- confirm against the rest of the class
@loaded = false
# set to true by `close!`; `rule` raises ClosedState once this is set
@closed = false
end

def to_state(lexer_class)
Expand Down Expand Up @@ -95,10 +127,14 @@ def appended(&defn)
# {RegexLexer#token}, and {RegexLexer#delegate}. The first
# argument can be used to access the match groups.
def rule(re, tok=nil, next_state=nil, &callback)
raise ClosedState.new(self) if @closed

if tok.nil? && callback.nil?
raise "please pass `rule` a token to yield or a callback"
end

matches_empty = re =~ ''

callback ||= case next_state
when :pop!
proc do |stream|
Expand All @@ -123,6 +159,9 @@ def rule(re, tok=nil, next_state=nil, &callback)
@stack.push(state)
end
when nil
# cannot use an empty-matching regexp with no predicate
raise InvalidRegex.new(re) if matches_empty

proc do |stream|
puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
@output_stream.call(tok, stream[0])
Expand All @@ -132,6 +171,22 @@ def rule(re, tok=nil, next_state=nil, &callback)
end

rules << Rule.new(re, callback)

close! if matches_empty && !context_sensitive?(re)
end

# Heuristic: does the regex's source text contain a construct whose match
# depends on surrounding input? Works purely on the source string, so any
# literal `$` or `^` character counts (including one inside a character
# class or after a backslash).
#
# @param re [Regexp] the pattern to inspect
# @return [Boolean] true if a lookaround or anchor character is present
def context_sensitive?(re)
  pattern_text = re.source

  # lookahead / lookbehind openers: (?=  (?!  (?<=  (?<!
  lookaround = /[(][?]<?[!=]/
  # anchors count as lookahead/behind
  anchor = /[$^]/

  [lookaround, anchor].any? { |probe| pattern_text =~ probe }
end

# Mark this state as closed. `rule` checks this flag first and raises
# ClosedState when another rule is added afterwards.
def close!
@closed = true
end

# Mix in the rules from another state into this state. The rules
Expand Down