Skip to content

Commit

Permalink
Fix #4248: Unicode code point escapes (#4498)
Browse files Browse the repository at this point in the history
  • Loading branch information
Julian Rosse authored and lydell committed Apr 20, 2017
1 parent bfce054 commit 96b6c5f
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 24 deletions.
75 changes: 62 additions & 13 deletions lib/coffee-script/lexer.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 46 additions & 11 deletions src/lexer.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,14 @@ exports.Lexer = class Lexer
indent = attempt if indent is null or 0 < attempt.length < indent.length
indentRegex = /// \n#{indent} ///g if indent
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace indentRegex, '\n' if indentRegex
value = value.replace LEADING_BLANK_LINE, '' if i is 0
value = value.replace TRAILING_BLANK_LINE, '' if i is $
value
else
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace SIMPLE_STRING_OMIT, (match, offset) ->
if (i is 0 and offset is 0) or
(i is $ and offset + match.length is value.length)
Expand Down Expand Up @@ -318,6 +318,7 @@ exports.Lexer = class Lexer
when match = REGEX.exec @chunk
[regex, body, closed] = match
@validateEscapes body, isRegex: yes, offsetInChunk: 1
body = @formatRegex body, delimiter: '/'
index = regex.length
[..., prev] = @tokens
if prev
Expand Down Expand Up @@ -632,7 +633,7 @@ exports.Lexer = class Lexer
tokensToPush = value
when 'NEOSTRING'
# Convert 'NEOSTRING' into 'STRING'.
converted = fn token[1], i
converted = fn.call this, token[1], i
# Optimize out empty strings. We ensure that the tokens stream always
# starts with a string token, though, to make sure that the result
# really is a string.
Expand Down Expand Up @@ -762,11 +763,37 @@ exports.Lexer = class Lexer
'**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
'BIN?', 'THROW', 'EXTENDS']

formatString: (str) ->
str.replace STRING_OMIT, '$1'
formatString: (str, options) ->
@replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options

formatHeregex: (str) ->
str.replace HEREGEX_OMIT, '$1$2'
@formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'

formatRegex: (str, options) ->
@replaceUnicodeCodePointEscapes str, options

unicodeCodePointToUnicodeEscapes: (codePoint) ->
toUnicodeEscape = (val) ->
str = val.toString 16
"\\u#{repeat '0', 4 - str.length}#{str}"
return toUnicodeEscape(codePoint) if codePoint < 0x10000
# surrogate pair
high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800
low = (codePoint - 0x10000) % 0x400 + 0xDC00
"#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"

# Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes
replaceUnicodeCodePointEscapes: (str, options) ->
str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
return escapedBackslash if escapedBackslash

codePointDecimal = parseInt codePointHex, 16
if codePointDecimal > 0x10ffff
@error "unicode code point escapes greater than \\u{10ffff} are not allowed",
offset: offset + options.delimiter.length
length: codePointHex.length + 4

@unicodeCodePointToUnicodeEscapes codePointDecimal

# Validates escapes in strings and regexes.
validateEscapes: (str, options = {}) ->
Expand All @@ -777,13 +804,13 @@ exports.Lexer = class Lexer
STRING_INVALID_ESCAPE
match = invalidEscapeRegex.exec str
return unless match
[[], before, octal, hex, unicode] = match
[[], before, octal, hex, unicodeCodePoint, unicode] = match
message =
if octal
"octal escape sequences are not allowed"
else
"invalid escape sequence"
invalidEscape = "\\#{octal or hex or unicode}"
invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
@error "#{message} #{invalidEscape}",
offset: (options.offsetInChunk ? 0) + match.index + before.length
length: invalidEscape.length
Expand Down Expand Up @@ -970,7 +997,7 @@ REGEX = /// ^
///

REGEX_FLAGS = /^\w*/
VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/
VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/

HEREGEX = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* ///

Expand All @@ -994,18 +1021,26 @@ STRING_INVALID_ESCAPE = ///
\\ (
?: (0[0-7]|[1-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///
REGEX_INVALID_ESCAPE = ///
( (?:^|[^\\]) (?:\\\\)* ) # make sure the escape isn’t escaped
\\ (
?: (0[0-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///
UNICODE_CODE_POINT_ESCAPE = ///
( \\\\ ) # make sure the escape isn’t escaped
|
\\u\{ ( [\da-fA-F]+ ) \}
///g
LEADING_BLANK_LINE = /^[^\n\S]*\n/
TRAILING_BLANK_LINE = /\n[^\n\S]*$/
Expand Down
62 changes: 62 additions & 0 deletions test/error_messages.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", ->
a for b, {c} in d
^^^
'''

test "#4248: Unicode code point escapes", ->
assertErrorFormat '''
"a
#{b} \\u{G02}
c"
''', '''
[stdin]:2:8: error: invalid escape sequence \\u{G02}
#{b} \\u{G02}
^\^^^^^^
'''
assertErrorFormat '''
/a\\u{}b/
''', '''
[stdin]:1:3: error: invalid escape sequence \\u{}
/a\\u{}b/
^\^^^
'''
assertErrorFormat '''
///a \\u{01abc///
''', '''
[stdin]:1:6: error: invalid escape sequence \\u{01abc
///a \\u{01abc///
^\^^^^^^^
'''

assertErrorFormat '''
/\\u{123} \\u{110000}/
''', '''
[stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed
/\\u{123} \\u{110000}/
\ ^\^^^^^^^^^
'''

assertErrorFormat '''
///abc\\\\\\u{123456}///u
''', '''
[stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed
///abc\\\\\\u{123456}///u
\ \^\^^^^^^^^^
'''

assertErrorFormat '''
"""
\\u{123}
a
\\u{00110000}
#{ 'b' }
"""
''', '''
[stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed
\\u{00110000}
^\^^^^^^^^^^^
'''

assertErrorFormat '''
'\\u{a}\\u{1111110000}'
''', '''
[stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed
'\\u{a}\\u{1111110000}'
\ ^\^^^^^^^^^^^^^
'''
35 changes: 35 additions & 0 deletions test/regexps.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
# * Regexen
# * Heregexen

# Helper function
toJS = (str) ->
CoffeeScript.compile str, bare: yes
.replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace


test "basic regular expression literals", ->
ok 'a'.match(/a/)
ok 'a'.match /a/
Expand Down Expand Up @@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", ->
ok ///#{a}\
///.test 'a\u2029'
ok ///#{a}\0
1///.test 'a\x001'

test "#4248: Unicode code point escapes", ->
ok /a\u{1ab}c/u.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///u.test 'a\u{1ab}c'
ok /a\u{12345}c/u.test 'a\ud808\udf45c'

# and now without u flag
ok /a\u{1ab}c/.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
ok /a\u{12345}c/.test 'a\ud808\udf45c'

# rewrite code point escapes
input = """
/\\u{bcdef}\\u{abc}/u
"""
output = """
/\\udab3\\uddef\\u0abc/u;
"""
eq toJS(input), output

input = """
///#{ 'a' }\\u{bcdef}///
"""
output = """
/a\\udab3\\uddef/;
"""
eq toJS(input), output
Loading

0 comments on commit 96b6c5f

Please sign in to comment.