Skip to content
This repository has been archived by the owner on Sep 8, 2023. It is now read-only.

Commit

Permalink
Improve support for Unicode identifiers in various lexers (rouge-ruby…
Browse files Browse the repository at this point in the history
…#1537)

Most of Rouge's lexers use rules that only match ASCII characters. This
is often not strictly correct as many languages support the use of
non-ASCII characters in their identifiers.

This commit adds support for non-ASCII characters to the CSS, HTML,
JavaScript, Julia, XML and YAML lexers. The regular expressions used
are deliberately more permissive than a strictly correct implementation
would be: ease of maintenance has been prioritised over syntactic
correctness.

Co-authored-by: Michael Camilleri <[email protected]>
  • Loading branch information
2 people authored and mattt committed May 19, 2021
1 parent 5845b2f commit d3592ba
Show file tree
Hide file tree
Showing 12 changed files with 65 additions and 17 deletions.
4 changes: 3 additions & 1 deletion lib/rouge/lexers/css.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ class CSS < RegexLexer
filenames '*.css'
mimetypes 'text/css'

# NOTE(review): this hunk is rendered without +/- markers, so the ASCII-only
# assignment below is presumably the removed line and the \p{...} assignment
# its replacement — confirm against the repository. Read as plain Ruby, the
# second assignment simply overwrites the first.
identifier = /[a-zA-Z0-9_-]+/
# Documentation: https://www.w3.org/TR/CSS21/syndata.html#characters

# Unicode-aware identifier: starts with a letter (\p{L}), `_` or `-`, followed
# by any number of word characters (\p{Word}), format characters (\p{Cf}) or `-`.
identifier = /[\p{L}_-][\p{Word}\p{Cf}-]*/
# Optionally negative number: digits with an optional fractional part, or a
# bare fractional part such as `.5`.
number = /-?(?:[0-9]+(\.[0-9]+)?|\.[0-9]+)/

def self.attributes
Expand Down
12 changes: 6 additions & 6 deletions lib/rouge/lexers/html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ def self.detect?(text)
rule %r(</), Name::Tag, :tag_end
rule %r/</, Name::Tag, :tag_start

rule %r(<\s*[a-zA-Z0-9:-]+), Name::Tag, :tag # opening tags
rule %r(<\s*/\s*[a-zA-Z0-9:-]+\s*>), Name::Tag # closing tags
rule %r(<\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*), Name::Tag, :tag # opening tags
rule %r(<\s*/\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*\s*>), Name::Tag # closing tags
end

state :tag_end do
mixin :tag_end_end
rule %r/[a-zA-Z0-9:-]+/ do
rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
token Name::Tag
goto :tag_end_end
end
Expand All @@ -67,7 +67,7 @@ def self.detect?(text)
state :tag_start do
rule %r/\s+/, Text

rule %r/[a-zA-Z0-9:-]+/ do
rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
token Name::Tag
goto :tag
end
Expand All @@ -83,8 +83,8 @@ def self.detect?(text)

# Lexer state for the inside of a tag, i.e. the attribute list up to the
# closing `>` or `/>`.
#
# NOTE(review): this hunk is rendered without +/- markers, so the two
# ASCII-only attribute rules are presumably the removed lines and the two
# \p{...} rules their replacements — confirm against the repository.
state :tag do
# whitespace between attributes (may span lines)
rule %r/\s+/m, Text
# ASCII-only: attribute name followed by `=`; the value is lexed in :attr
rule %r/[a-zA-Z0-9_:\[\]()*.-]+\s*=\s*/m, Name::Attribute, :attr
# ASCII-only: attribute name with no value
rule %r/[a-zA-Z0-9_:#*-]+/, Name::Attribute
# Unicode-aware: attribute name followed by `=`; the value is lexed in :attr.
# The name may also contain `[`, `]`, `(`, `)`, `*` and `.`.
rule %r/[\p{L}:_\[\]()*.-][\p{Word}\p{Cf}:.·\[\]()*-]*\s*=\s*/m, Name::Attribute, :attr
# Unicode-aware: attribute name with no value; `*` and `#` are accepted
rule %r/[\p{L}:_*#-][\p{Word}\p{Cf}:.·*#-]*/, Name::Attribute
# `>` or `/>` ends the tag and pops this state
rule %r(/?\s*>)m, Name::Tag, :pop!
end

Expand Down
4 changes: 3 additions & 1 deletion lib/rouge/lexers/javascript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Javascript < RegexLexer
mimetypes 'application/javascript', 'application/x-javascript',
'text/javascript', 'text/x-javascript'

# Pseudo-documentation: https://stackoverflow.com/questions/1661197/what-characters-are-valid-for-javascript-variable-names

def self.detect?(text)
return 1 if text.shebang?('node')
return 1 if text.shebang?('jsc')
Expand Down Expand Up @@ -138,7 +140,7 @@ def self.builtins
end

# Returns the regular expression used to match a JavaScript identifier.
#
# NOTE(review): this hunk is rendered without +/- markers, so the first
# (ASCII-only) literal is presumably the removed line — confirm against the
# repository. Read as plain Ruby, only the last expression is returned.
#
# NOTE(review): `$` is allowed as the first character but is not part of the
# continuation class, so `a$b` matches only `a` — confirm whether intended.
def self.id_regex
# ASCII-only pattern; not the last expression, so its value is discarded
/[$a-z_][a-z0-9_]*/io
# Unicode-aware pattern (the return value): a letter (\p{L}), letter-number
# (\p{Nl}), `$` or `_`, followed by any number of \p{Word} characters
/[\p{L}\p{Nl}$_][\p{Word}]*/io
end

id = self.id_regex
Expand Down
6 changes: 4 additions & 2 deletions lib/rouge/lexers/julia.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class Julia < RegexLexer
filenames '*.jl'
mimetypes 'text/x-julia', 'application/x-julia'

# Documentation: https://docs.julialang.org/en/v1/manual/variables/#Allowed-Variable-Names-1

# Content-based detection hook.
#
# Returns true when `text.shebang? 'julia'` is truthy; otherwise falls off the
# end of the method and returns nil.
# NOTE(review): `shebang?` is defined elsewhere; presumably it inspects the
# first line of the text for a `#!` interpreter line — confirm.
def self.detect?(text)
return true if text.shebang? 'julia'
end
Expand Down Expand Up @@ -252,13 +254,13 @@ def self.detect?(text)


# Lexer state that tokenizes one function name (or operator) and then pops;
# every rule here ends with :pop!.
#
# NOTE(review): this hunk is rendered without +/- markers, so the ASCII-only
# name rule is presumably the removed line and the \p{...} rule its
# replacement — confirm against the repository.
state :funcname do
# ASCII-only name: a letter or `_` followed by \w characters
rule %r/[a-zA-Z_]\w*/, Name::Function, :pop!
# Unicode-aware name: starts with a letter, letter-number, symbol (\p{S}) or
# `_`; continues with word characters, symbols, other punctuation (\p{Po}) or `!`
rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Function, :pop!
# one or two characters that are not whitespace, \w or `{`, wrapped in parentheses
rule %r/\([^\s\w{]{1,2}\)/, Operator, :pop!
# the same one or two characters without the parentheses
rule %r/[^\s\w{]{1,2}/, Operator, :pop!
end

# Lexer state that tokenizes one type name as Name::Class and then pops.
#
# NOTE(review): this hunk is rendered without +/- markers, so the ASCII-only
# rule is presumably the removed line and the \p{...} rule its replacement —
# confirm against the repository.
state :typename do
# ASCII-only name: a letter or `_` followed by \w characters
rule %r/[a-zA-Z_]\w*/, Name::Class, :pop!
# Unicode-aware name: starts with a letter, letter-number, symbol (\p{S}) or
# `_`; continues with word characters, symbols, other punctuation (\p{Po}) or `!`
rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Class, :pop!
end

state :stringescape do
Expand Down
8 changes: 5 additions & 3 deletions lib/rouge/lexers/xml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ class XML < RegexLexer
mimetypes 'text/xml', 'application/xml', 'image/svg+xml',
'application/rss+xml', 'application/atom+xml'

# Documentation: https://www.w3.org/TR/xml11/#charsets and https://www.w3.org/TR/xml11/#sec-suggested-names

def self.detect?(text)
return false if text.doctype?(/html/)
return true if text =~ /\A<\?xml\b/
Expand All @@ -27,10 +29,10 @@ def self.detect?(text)
rule %r/<![^>]*>/, Comment::Preproc

# open tags
rule %r(<\s*[\w:.-]+)m, Name::Tag, :tag
rule %r(<\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*)m, Name::Tag, :tag

# closing tags
rule %r(<\s*/\s*[\w:.-]+\s*>)m, Name::Tag
rule %r(<\s*/\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*>)m, Name::Tag
end

state :comment do
Expand All @@ -41,7 +43,7 @@ def self.detect?(text)

# Lexer state for the inside of an XML tag, i.e. the attribute list up to the
# closing `>` or `/>`.
#
# NOTE(review): this hunk is rendered without +/- markers, so the `\w`-based
# attribute rule is presumably the removed line and the \p{...} rule its
# replacement — confirm against the repository.
state :tag do
# whitespace between attributes (may span lines)
rule %r/\s+/m, Text
# previous pattern: attribute name followed by `=`; the value is lexed in :attr
rule %r/[\w.:-]+\s*=/m, Name::Attribute, :attr
# Unicode-aware pattern: the name starts with a letter, `:` or `_`, then
# continues with word characters, format characters (\p{Cf}), `:`, `.`, `·` or `-`
rule %r/[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*=/m, Name::Attribute, :attr
# `>` or `/>` ends the tag and pops this state
rule %r(/?\s*>), Name::Tag, :pop!
end

Expand Down
8 changes: 5 additions & 3 deletions lib/rouge/lexers/yaml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class YAML < RegexLexer
aliases 'yml'
filenames '*.yaml', '*.yml'

# Documentation: https://yaml.org/spec/1.2/spec.html

def self.detect?(text)
# look for the %YAML directive
return true if text =~ /\A\s*%YAML/m
Expand Down Expand Up @@ -165,15 +167,15 @@ def set_indent(match, opts={})
)x, Keyword::Type

# an anchor
rule %r/&[\w-]+/, Name::Label
rule %r/&[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Label

# an alias
rule %r/\*[\w-]+/, Name::Variable
rule %r/\*[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Variable
end

state :block_nodes do
# implicit key
rule %r/((?:\w[\w -]*)?)(:)(?=\s|$)/ do |m|
rule %r/((?:[\p{L}\p{Nl}\p{Nd}_][\p{L}\p{Nl}\p{Nd}\p{Blank}_-]*)?)(:)(?=\s|$)/ do |m|
groups Name::Attribute, Punctuation::Indicator
set_indent m[0], :implicit => true
end
Expand Down
7 changes: 6 additions & 1 deletion spec/visual/samples/css
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ ul#nav li.new {
font-size: 29px ! important;
}

a[target="_blank"] {
a[target="_blank"] {
background-color: yellow;
}

/* Unicode example */
œuvre 书名[语言="français"] {
color: blue;
}
5 changes: 5 additions & 0 deletions spec/visual/samples/html
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,8 @@ Hello tagless world!
<custom-element #ref></custom-element>
<custom-element [target]="expression"></custom-element>
<custom-element (target)="expression"></custom-element>

<!-- Unicode example -->
<œuvre>
<书名 语言="français">Les Misérables</书名>
</œuvre>
7 changes: 7 additions & 0 deletions spec/visual/samples/javascript
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,10 @@ var myOct = 0o67;

let x = /abc/u;
let x = /abc/y;

// Unicode example
class Œuvre {
résumer(语言 = "français") {
书名 = "Les Misérables";
}
}
11 changes: 11 additions & 0 deletions spec/visual/samples/julia
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,14 @@ end
# the author.
#
# "Learn Julia in Y Minutes" is licensed under http://creativecommons.org/licenses/by-sa/3.0/legalcode

# Unicode example
mutable struct Œuvre end
⇵ = uppercase

function résumer_œuvre(书名::Œuvre="Les Misérables")
语言 = "français"
for ϕ ∈ 1:1
⇵(语言) # "FRANÇAIS"
end
end
4 changes: 4 additions & 0 deletions spec/visual/samples/xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@

</xsl:stylesheet>

<!-- Unicode example -->
<œuvre>
<书名 语言="français">Les Misérables</书名>
</œuvre>
6 changes: 6 additions & 0 deletions spec/visual/samples/yaml
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,9 @@ Stack:
code: |-
foo = bar

# Unicode example
œuvre:
书名: Les Misérables
语言: français
référence: &réf_01
alias: *λ-01

0 comments on commit d3592ba

Please sign in to comment.