Skip to content

Commit

Permalink
Restrict the use of empty-matching regular expressions (#1548)
Browse files Browse the repository at this point in the history
This commit makes several changes to the way ‘empty’ regular expressions work in
Rouge.

1. The use of ‘empty’ regular expressions is restricted. An empty-matching regex
   may not be used without a predicate (such as `:pop!`, `:push`, or a block).

2. The use of an empty-matching regular expression with no lookahead/lookbehind
   or anchoring "closes" the state. It is invalid to add more rules, as they
   would never be reached.

3. Lexers that were using improper empty-matching regular expressions have been
   fixed.
  • Loading branch information
jneen authored Oct 13, 2020
1 parent b2dcb98 commit 6fdc78c
Show file tree
Hide file tree
Showing 15 changed files with 84 additions and 28 deletions.
10 changes: 5 additions & 5 deletions lib/rouge/lexers/elm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ module exposing port
rule %r/"/, Str::Double, :pop!
end

# Multiple line string with tripple double quotes, e.g. """ multi """
# Multiple line string with triple double quotes, e.g. """ multi """
state :multiline_string do
rule %r/\s*"""/, Str, :pop!
rule %r/.*/, Str
rule %r/\s*/, Str
rule %r/\\"/, Str::Escape
rule %r/"""/, Str, :pop!
rule %r/[^"]+/, Str
rule %r/"/, Str
end

end
end
end
3 changes: 2 additions & 1 deletion lib/rouge/lexers/ghc_core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ class GHCCore < RegexLexer
mixin :function

# rest is Text
# TODO: this is really inefficient
rule %r/\s/m, Text
rule %r/.*/, Text
rule %r/./, Text
end

state :expression do
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/graphql.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class GraphQL < RegexLexer
end

rule %r/\bfragment\b/, Keyword, :fragment_definition

rule %r/\bscalar\b/, Keyword, :value

rule %r/\b(?:type|interface|enum)\b/, Keyword, :type_definition
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/isbl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def self.keywords

state :dotted do
mixin :whitespace
rule %r/[a-zа-яё_0-9]*/i do |m|
rule %r/[a-zа-яё_0-9]+/i do |m|
name = m[0]
if self.class.constants.include? name.downcase
token Name::Builtin
Expand All @@ -56,7 +56,7 @@ def self.keywords

state :type do
mixin :whitespace
rule %r/[a-zа-яё_0-9]*/i do |m|
rule %r/[a-zа-яё_0-9]+/i do |m|
name = m[0]
if self.class.interfaces.include? name.downcase
token Keyword::Type
Expand Down
5 changes: 3 additions & 2 deletions lib/rouge/lexers/javascript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -265,9 +265,10 @@ def self.id_regex

# template strings
state :template_string do
rule %r/\${/, Punctuation, :template_string_expr
rule %r/[$]{/, Punctuation, :template_string_expr
rule %r/`/, Str::Double, :pop!
rule %r/(\\\\|\\[\$`]|[^\$`]|\$(?!{))*/, Str::Double
rule %r/\\[$`]/, Str::Escape
rule %r/[$]/, Str::Double
end

state :template_string_expr do
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/jsl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class JSL < RegexLexer
rule %r/\\![btrnNf0\\"]/, Str::Escape
rule %r/\\/, Str::Double
rule %r/"/, Str::Double, :pop!
rule %r/[^\\"]*/m, Str::Double
rule %r/[^\\"]+/m, Str::Double
end
end
end
Expand Down
7 changes: 4 additions & 3 deletions lib/rouge/lexers/jsonnet.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def self.builtins

state :string do
rule %r/\\([\\\/bfnrt]|(u[0-9a-fA-F]{4}))/, Str::Escape
rule %r/\\./, Str::Escape
end

state :string_double do
Expand All @@ -137,15 +138,15 @@ def self.builtins

state :string_single do
mixin :string
rule %r/\\'/, Str::Escape
rule %r/'/, Str, :pop!
rule %r/[^\\']+/, Str
end

state :string_block do
mixin :string
rule %r/\|\|\|/, Str, :pop!
rule %r/.*/, Str
rule %r/[|][|][|]/, Str, :pop!
rule %r/[^|\\]+/, Str
rule %r/[|]/, Str
end
end
end
Expand Down
5 changes: 2 additions & 3 deletions lib/rouge/lexers/jsp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,8 @@ def initialize(*)

state :jsp_interp_literal_start do
rule %r/'/, Literal, :pop!
rule %r/[^']*/, Literal
rule %r/[^']+/, Literal
end

end
end
end
end
7 changes: 4 additions & 3 deletions lib/rouge/lexers/kotlin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,10 @@ class Kotlin < RegexLexer
end

state :comment do
rule %r'\s*/[*].*', Comment::Multiline, :comment
rule %r'.*[*]/', Comment::Multiline, :pop!
rule %r'.*', Comment::Multiline
rule %r'/[*]', Comment::Multiline, :comment
rule %r'[*]/', Comment::Multiline, :pop!
rule %r'[^/*]+', Comment::Multiline
rule %r'[/*]', Comment::Multiline
end
end
end
Expand Down
1 change: 0 additions & 1 deletion lib/rouge/lexers/opentype_feature_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def self.keywords
state :strings do
rule %r/"/, Str, :pop!
rule %r/[^"%\n]+/, Str
rule %r/(\([a-z0-9_]+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?/i, Str
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/q.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def self.builtins
end

state :bottom do
rule %r/.*\z/m, Comment::Multiline
rule %r/.+\z/m, Comment::Multiline
end
end
end
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/ruby.rb
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def self.detect?(text)
end

state :regex_flags do
rule %r/[mixounse]*/, Str::Regex, :pop!
rule %r/[mixounse]+/, Str::Regex, :pop!
end

# double-quoted string and symbol
Expand Down Expand Up @@ -267,7 +267,7 @@ def self.detect?(text)
end

state :test_heredoc do
rule %r/[^#\\\n]*$/ do |m|
rule %r/[^#\\\n]+$/ do |m|
tolerant, heredoc_name = @heredoc_queue.first
check = tolerant ? m[0].strip : m[0].rstrip

Expand Down
2 changes: 1 addition & 1 deletion lib/rouge/lexers/smarty.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def self.builtins
end


rule(/.*?(?={[\/a-zA-Z0-9$#*"'])|.*/m) { delegate parent }
rule(/.+?(?={[\/a-zA-Z0-9$#*"'])/m) { delegate parent }
rule(/.+/m) { delegate parent }
end

Expand Down
1 change: 0 additions & 1 deletion lib/rouge/lexers/wollok.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class Wollok < RegexLexer

state :whitespaces_and_comments do
rule %r/\s+/m, Text::Whitespace
rule %r/$+/m, Text::Whitespace
rule %r(//.*$), Comment::Single
rule %r(/\*(.|\s)*?\*/)m, Comment::Multiline
end
Expand Down
57 changes: 56 additions & 1 deletion lib/rouge/regex_lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@ module Rouge
# A stateful lexer that uses sets of regular expressions to
# tokenize a string. Most lexers are instances of RegexLexer.
class RegexLexer < Lexer
# Error describing a rule whose regular expression can match the empty
# string while the rule has no predicate (no next-state action such as
# `:pop!` / `:push`, and no callback block).
class InvalidRegex < StandardError
  # @param re [Regexp] the offending regular expression
  def initialize(re)
    @re = re
  end

  # @return [String] a human-readable description naming the regex
  def to_s
    format('regex %s matches empty string, but has no predicate!', @re.inspect)
  end
end

# Error describing an attempt to add a rule to a state that has already
# been closed, i.e. whose last rule will always match.
class ClosedState < StandardError
  # The state object that refused the additional rule.
  attr_reader :state

  # @param state [#name, #rules] the closed state
  def initialize(state)
    @state = state
  end

  # @return the most recently added rule of the closed state, which is
  #   the one that closed it
  def rule
    @state.rules.last
  end

  # @return [String] an explanation naming the state and the closing rule,
  #   plus a hint when the rule's regex source contains a `*`
  def to_s
    offender = rule
    message = "State :#{state.name} cannot continue after #{offender.inspect}, which will always match."
    return message unless offender.re.source.include?('*')

    "#{message} Consider replacing * with +."
  end
end

# A rule is a tuple of a regular expression to test, and a callback
# to perform if the test succeeds.
#
Expand Down Expand Up @@ -42,12 +73,13 @@ def inspect
end

class StateDSL
attr_reader :rules
attr_reader :rules, :name
# Sets up an empty, not-yet-closed state definition.
#
# @param name the state's name (exposed through the `name` reader)
# @param defn the block that defines the state's rules; it is stored
#   here, not executed
def initialize(name, &defn)
@name = name
@defn = defn
# rules are appended by `rule` in definition order
@rules = []
# NOTE(review): presumably flips to true once @defn has been evaluated;
# the code that does so is outside this view - confirm
@loaded = false
# set by `close!`; while true, `rule` raises ClosedState
@closed = false
end

def to_state(lexer_class)
Expand Down Expand Up @@ -95,10 +127,14 @@ def appended(&defn)
# {RegexLexer#token}, and {RegexLexer#delegate}. The first
# argument can be used to access the match groups.
def rule(re, tok=nil, next_state=nil, &callback)
raise ClosedState.new(self) if @closed

if tok.nil? && callback.nil?
raise "please pass `rule` a token to yield or a callback"
end

matches_empty = re =~ ''

callback ||= case next_state
when :pop!
proc do |stream|
Expand All @@ -123,6 +159,9 @@ def rule(re, tok=nil, next_state=nil, &callback)
@stack.push(state)
end
when nil
# cannot use an empty-matching regexp with no predicate
raise InvalidRegex.new(re) if matches_empty

proc do |stream|
puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
@output_stream.call(tok, stream[0])
Expand All @@ -132,6 +171,22 @@ def rule(re, tok=nil, next_state=nil, &callback)
end

rules << Rule.new(re, callback)

close! if matches_empty && !context_sensitive?(re)
end

# Heuristically reports whether a regular expression's match depends on
# the text around the match position (lookahead, lookbehind or an
# anchor). An empty-matching regex that is context sensitive does not
# match unconditionally, so it must not close the state.
#
# The check inspects the regex source textually; it can report a false
# positive (for example for an escaped `\$`), which only means the state
# is left open.
#
# @param re [Regexp] the rule's regular expression
# @return [Boolean] true when the source contains a lookaround or anchor
def context_sensitive?(re)
  source = re.source

  # lookahead / lookbehind groups: (?=...) (?!...) (?<=...) (?<!...)
  return true if source =~ /[(][?]<?[!=]/

  # anchors count as lookahead/behind
  return true if source =~ /[$^]/

  # Backslash anchors (\A \z \Z \G \b \B) are zero-width assertions as
  # well. The leading pattern skips pairs of backslashes so an escaped
  # backslash followed by a plain letter (e.g. `\\b`) is not mistaken
  # for an anchor.
  return true if source =~ /(?<!\\)(?:\\\\)*\\[AbBGzZ]/

  false
end

# Marks this state as closed. After this, `rule` raises ClosedState
# instead of adding further rules. `rule` calls this itself after adding
# a rule whose regex matches the empty string and is not context
# sensitive.
def close!
@closed = true
end

# Mix in the rules from another state into this state. The rules
Expand Down

0 comments on commit 6fdc78c

Please sign in to comment.