Fix unicode escape in identifiers (#1102)

boa-dev · May 22, 2021 · 08f232f · 08f232f
1 parent e187489
commit 08f232f
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 39 deletions.
diff --git a/boa/src/syntax/ast/keyword.rs b/boa/src/syntax/ast/keyword.rs
@@ -199,6 +199,16 @@ pub enum Keyword {
     /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends
     Extends,
 
+    /// The `false` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+    False,
+
     /// The `finally` keyword.
     ///
     /// More information:
@@ -301,6 +311,16 @@ pub enum Keyword {
     /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new
     New,
 
+    /// The `null` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-NullLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null
+    Null,
+
     /// The `of` keyword.
     ///
     /// More information:
@@ -369,6 +389,16 @@ pub enum Keyword {
     /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions
     Throw,
 
+    /// The `true` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+    True,
+
     /// The `try` keyword.
     ///
     /// More information:
@@ -479,6 +509,7 @@ impl Keyword {
             Self::Enum => "enum",
             Self::Extends => "extends",
             Self::Export => "export",
+            Self::False => "false",
             Self::Finally => "finally",
             Self::For => "for",
             Self::Function => "function",
@@ -488,12 +519,14 @@ impl Keyword {
             Self::Import => "import",
             Self::Let => "let",
             Self::New => "new",
+            Self::Null => "null",
             Self::Of => "of",
             Self::Return => "return",
             Self::Super => "super",
             Self::Switch => "switch",
             Self::This => "this",
             Self::Throw => "throw",
+            Self::True => "true",
             Self::Try => "try",
             Self::TypeOf => "typeof",
             Self::Var => "var",
@@ -552,6 +585,7 @@ impl FromStr for Keyword {
             "enum" => Ok(Self::Enum),
             "extends" => Ok(Self::Extends),
             "export" => Ok(Self::Export),
+            "false" => Ok(Self::False),
             "finally" => Ok(Self::Finally),
             "for" => Ok(Self::For),
             "function" => Ok(Self::Function),
@@ -561,12 +595,14 @@ impl FromStr for Keyword {
             "import" => Ok(Self::Import),
             "let" => Ok(Self::Let),
             "new" => Ok(Self::New),
+            "null" => Ok(Self::Null),
             "of" => Ok(Self::Of),
             "return" => Ok(Self::Return),
             "super" => Ok(Self::Super),
             "switch" => Ok(Self::Switch),
             "this" => Ok(Self::This),
             "throw" => Ok(Self::Throw),
+            "true" => Ok(Self::True),
             "try" => Ok(Self::Try),
             "typeof" => Ok(Self::TypeOf),
             "var" => Ok(Self::Var),

diff --git a/boa/src/syntax/lexer/cursor.rs b/boa/src/syntax/lexer/cursor.rs
@@ -130,6 +130,7 @@ where
     /// predicate on the ascii char
     ///
     /// The buffer is not incremented.
+    #[allow(dead_code)]
     #[inline]
     pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
     where
@@ -191,6 +192,7 @@ where
     /// It also stops when there is no next character.
     ///
     /// Note that all characters up until the stop character are added to the buffer, including the character right before.
+    #[allow(dead_code)]
     pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
     where
         F: Fn(u32) -> bool,

diff --git a/boa/src/syntax/lexer/identifier.rs b/boa/src/syntax/lexer/identifier.rs
@@ -5,7 +5,7 @@ use crate::{
     profiler::BoaProfiler,
     syntax::{
         ast::{Keyword, Position, Span},
-        lexer::{Token, TokenKind},
+        lexer::{StringLiteral, Token, TokenKind},
     },
 };
 use boa_unicode::UnicodeProperties;
@@ -86,43 +86,100 @@ impl<R> Tokenizer<R> for Identifier {
     {
         let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
 
-        let mut init_buf = [0u8; 4];
-        let mut buf = Vec::new();
-        self.init.encode_utf8(&mut init_buf);
-        buf.extend(init_buf.iter().take(self.init.len_utf8()));
-
-        cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;
-
-        let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
-        let tk = match token_str {
-            "true" => TokenKind::BooleanLiteral(true),
-            "false" => TokenKind::BooleanLiteral(false),
-            "null" => TokenKind::NullLiteral,
-            slice => {
-                if let Ok(keyword) = slice.parse() {
-                    if cursor.strict_mode() && keyword == Keyword::With {
-                        return Err(Error::Syntax(
-                            "using 'with' statement not allowed in strict mode".into(),
-                            start_pos,
-                        ));
-                    }
-                    TokenKind::Keyword(keyword)
-                } else {
-                    if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) {
-                        return Err(Error::Syntax(
-                            format!(
-                                "using future reserved keyword '{}' not allowed in strict mode",
-                                slice
-                            )
-                            .into(),
-                            start_pos,
-                        ));
-                    }
-                    TokenKind::identifier(slice)
-                }
+        let (identifier_name, contains_escaped_chars) =
+            Self::take_identifier_name(cursor, start_pos, self.init)?;
+
+        let token_kind = if let Ok(keyword) = identifier_name.parse() {
+            if contains_escaped_chars {
+                return Err(Error::Syntax(
+                    "unicode escaped characters are not allowed in keyword".into(),
+                    start_pos,
+                ));
+            }
+
+            if cursor.strict_mode() && keyword == Keyword::With {
+                return Err(Error::Syntax(
+                    "using 'with' statement not allowed in strict mode".into(),
+                    start_pos,
+                ));
+            }
+
+            match keyword {
+                Keyword::True => TokenKind::BooleanLiteral(true),
+                Keyword::False => TokenKind::BooleanLiteral(false),
+                Keyword::Null => TokenKind::NullLiteral,
+                _ => TokenKind::Keyword(keyword),
             }
+        } else {
+            if cursor.strict_mode()
+                && STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str())
+            {
+                return Err(Error::Syntax(
+                    format!(
+                        "using future reserved keyword '{}' not allowed in strict mode",
+                        identifier_name
+                    )
+                    .into(),
+                    start_pos,
+                ));
+            }
+            TokenKind::identifier(identifier_name.into_boxed_str())
         };
 
-        Ok(Token::new(tk, Span::new(start_pos, cursor.pos())))
+        Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
+    }
+}
+
+impl Identifier {
+    #[inline]
+    fn take_identifier_name<R>( // Reads one identifier name beginning with `init`, decoding any `\u` escapes it contains.
+        cursor: &mut Cursor<R>,
+        start_pos: Position, // position reported in errors about the first character
+        init: char, // first character of the token, already taken by the caller
+    ) -> Result<(String, bool), Error> // Ok: (decoded name, true if at least one `\u` escape was decoded)
+    where
+        R: Read,
+    {
+        let mut contains_escaped_chars = false;
+        let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? { // name begins with a `\u` escape
+            let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;
+
+            if Self::is_identifier_start(ch) { // the decoded code point must itself be a valid identifier start
+                contains_escaped_chars = true;
+                String::from(char::try_from(ch).unwrap()) // NOTE(review): unwrap panics if `ch` is not a valid `char`; presumably ruled out by the check above - confirm
+            } else {
+                return Err(Error::Syntax("invalid identifier start".into(), start_pos));
+            }
+        } else {
+            // The caller guarantees that `init` is a valid identifier start
+            String::from(init)
+        };
+
+        loop { // append identifier-part characters until something else is peeked
+            let ch = match cursor.peek_char()? {
+                Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => { // presumably `peek_n(2)` packs the first byte in the low 8 bits, so this tests the byte after `\` - confirm
+                    let pos = cursor.pos(); // position of the `\`, used for errors about this escape
+                    let _ = cursor.next_byte(); // skip the byte peeked as `\`; the result is discarded
+                    let _ = cursor.next_byte(); // skip the byte peeked as `u`; the result is discarded
+                    let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?;
+
+                    if Self::is_identifier_part(ch) { // an escaped code point must be a valid identifier part
+                        contains_escaped_chars = true;
+                        ch
+                    } else {
+                        return Err(Error::Syntax("invalid identifier part".into(), pos));
+                    }
+                }
+                Some(ch) if Self::is_identifier_part(ch) => { // unescaped identifier-part character
+                    let _ = cursor.next_char()?; // consume the character that was only peeked so far
+                    ch
+                },
+                _ => break, // anything else, including `None`, ends the identifier name
+            };
+
+            identifier_name.push(char::try_from(ch).unwrap()); // NOTE(review): same panic caveat as for the start character - confirm
+        }
+
+        Ok((identifier_name, contains_escaped_chars))
     }
 }
diff --git a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs
@@ -246,12 +246,15 @@ impl<R> Lexer<R> {
                 '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
                     Operator::new(next_ch as u8).lex(&mut self.cursor, start)
                 }
-                _ if c.is_digit(10) => {
-                    NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
+                '\\' if self.cursor.peek()? == Some(b'u') => {
+                    Identifier::new(c).lex(&mut self.cursor, start)
                 }
                 _ if Identifier::is_identifier_start(c as u32) => {
                     Identifier::new(c).lex(&mut self.cursor, start)
                 }
+                _ if c.is_digit(10) => {
+                    NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
+                }
                 _ => {
                     let details = format!(
                         "unexpected '{}' at line {}, column {}",

diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs
@@ -73,7 +73,7 @@ fn check_multi_line_comment() {
 
 #[test]
 fn check_identifier() {
-    let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
+    let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}";
     let mut lexer = Lexer::new(s.as_bytes());
 
     let expected = [
@@ -86,6 +86,9 @@ fn check_identifier() {
         TokenKind::identifier("Ѐ"),
         TokenKind::identifier("ЀЀ"),
         TokenKind::identifier("x\u{200C}\u{200D}"),
+        TokenKind::identifier("x"),
+        TokenKind::identifier("xx"),
+        TokenKind::identifier("xxx"),
     ];
 
     expect_tokens(&mut lexer, &expected);