From bded0ae580918b901eda7c65186486fc244f0a50 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Sun, 17 Jan 2021 17:29:47 -0800 Subject: [PATCH 01/23] Refactor StringLiteral --- boa/src/syntax/lexer/string.rs | 346 ++++++++++++++++++------------- boa/src/syntax/lexer/template.rs | 6 +- boa/src/syntax/lexer/tests.rs | 6 +- 3 files changed, 203 insertions(+), 155 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 6f20599811e..52d1b8c59cf 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -58,171 +58,219 @@ impl Tokenizer for StringLiteral { let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let (lit, span) = - unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; + Self::unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; Ok(Token::new(TokenKind::string_literal(lit), span)) } } -pub(super) fn unescape_string( - cursor: &mut Cursor, - start_pos: Position, - terminator: StringTerminator, - strict_mode: bool, -) -> Result<(String, Span), Error> -where - R: Read, -{ - let mut buf = Vec::new(); - loop { - let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); - - match next_chr { - Some('\'') if terminator == StringTerminator::SingleQuote => { - break; - } - Some('"') if terminator == StringTerminator::DoubleQuote => { - break; - } - Some('\\') => { - let _timer = - BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing"); +impl StringLiteral { + pub(super) fn unescape_string( + cursor: &mut Cursor, + start_pos: Position, + terminator: StringTerminator, + strict_mode: bool, + ) -> Result<(String, Span), Error> + where + R: Read, + { + let mut buf = Vec::new(); + loop { + let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); - let escape = cursor.peek()?.ok_or_else(|| { - Error::from(io::Error::new( - ErrorKind::UnexpectedEof, - "unterminated escape sequence in literal", - )) - })?; - - if escape <= 0x7f { - let _ = cursor.next_byte()?; - match escape { - b'\n' => (), - b'n' => buf.push('\n' as u16), - b'r' => buf.push('\r' as u16), - b't' => buf.push('\t' as u16), - b'b' => buf.push('\x08' as u16), - b'f' => buf.push('\x0c' as u16), - b'0' => buf.push('\0' as u16), - b'x' => { - let mut code_point_utf8_bytes = [0u8; 2]; - cursor.fill_bytes(&mut code_point_utf8_bytes)?; - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Hexadecimal character escape sequence"); - let code_point = - u16::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax( - "invalid Hexadecimal escape sequence", - cursor.pos(), - ) - })?; - - buf.push(code_point); - } - b'u' => { - // Support \u{X..X} (Unicode Codepoint) - if cursor.next_is(b'{')? { - // TODO: use bytes for a bit better performance (using stack) - let mut code_point_buf = Vec::with_capacity(6); - cursor.take_until(b'}', &mut code_point_buf)?; - - let code_point_str = - unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; - // We know this is a single unicode codepoint, convert to u32 - let code_point = - u32::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax( - "malformed Unicode character escape sequence", - cursor.pos(), - ) - })?; - - // UTF16Encoding of a numeric code point value - if code_point > 0x10_FFFF { - return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos())); - } else if code_point <= 65535 { - buf.push(code_point as u16); - } else { - let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; - let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; - buf.push(cu1); - buf.push(cu2); - } - } else { - // Collect each character after \u e.g \uD83D will give "D83D" - let mut code_point_utf8_bytes = [0u8; 4]; - cursor.fill_bytes(&mut code_point_utf8_bytes)?; - - // Convert to u16 - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Unicode character escape sequence"); - let code_point = - u16::from_str_radix(code_point_str, 16).map_err(|_| { - Error::syntax( - "invalid Unicode escape sequence", - cursor.pos(), - ) - })?; - - buf.push(code_point); + match next_chr { + Some('\'') if terminator == StringTerminator::SingleQuote => { + break; + } + Some('"') if terminator == StringTerminator::DoubleQuote => { + break; + } + Some('\\') => { + let _timer = BoaProfiler::global() + .start_event("StringLiteral - escape sequence", "Lexing"); + + let escape = cursor.peek()?.ok_or_else(|| { + Error::from(io::Error::new( + ErrorKind::UnexpectedEof, + "unterminated escape sequence in literal", + )) + })?; + + if escape <= 0x7f { + let _ = cursor.next_byte()?; + match escape { + b'\n' => (), + b'n' => buf.push('\n' as u16), + b'r' => buf.push('\r' as u16), + b't' => buf.push('\t' as u16), + b'b' => buf.push('\x08' as u16), + b'f' => buf.push('\x0c' as u16), + b'0' => buf.push('\0' as u16), + b'x' => { + Self::hex_escape_sequence(cursor, Some(&mut buf))?; } - } - n if char::is_digit(char::from(n), 8) => { - if strict_mode { - return Err(Error::syntax( - "octal escape sequences are deprecated", - cursor.pos(), - )); + b'u' => { + Self::unicode_escape_sequence(cursor, Some(&mut buf))?; } - let mut o = char::from(n).to_digit(8).unwrap(); - - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { - let _ = cursor.next_byte()?; - o = o * 8 + char::from(n).to_digit(8).unwrap(); - if n <= b'3' { - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { - let _ = cursor.next_byte(); - o = o * 8 + char::from(n).to_digit(8).unwrap(); - } - _ => (), - } - } - } - _ => (), + n if char::is_digit(char::from(n), 8) => { + Self::legacy_octal_escape_sequence( + cursor, + Some(&mut buf), + strict_mode, + n, + )?; } - buf.push(o as u16); - } - _ => buf.push(escape as u16), - }; + _ => buf.push(escape as u16), + }; + } } - } - Some(next_ch) => { - if next_ch.len_utf16() == 1 { - buf.push(next_ch as u16); - } else { - let mut code_point_bytes_buf = [0u16; 2]; - let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); + Some(next_ch) => { + if next_ch.len_utf16() == 1 { + buf.push(next_ch as u16); + } else { + let mut code_point_bytes_buf = [0u16; 2]; + let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); - buf.extend(code_point_bytes.iter()); + buf.extend(code_point_bytes.iter()); + } + } + None if terminator != StringTerminator::End => { + return Err(Error::from(io::Error::new( + ErrorKind::UnexpectedEof, + "unterminated string literal", + ))); + } + None => { + break; } } - None if terminator != StringTerminator::End => { - return Err(Error::from(io::Error::new( - ErrorKind::UnexpectedEof, - "unterminated string literal", - ))); + } + + Ok(( + String::from_utf16_lossy(buf.as_slice()), + Span::new(start_pos, cursor.pos()), + )) + } + + #[inline] + pub(super) fn unicode_escape_sequence( + cursor: &mut Cursor, + code_units_buf: Option<&mut Vec>, + ) -> Result + where + R: Read, + { + // Support \u{X..X} (Unicode CodePoint) + if cursor.next_is(b'{')? { + // TODO: use bytes for a bit better performance (using stack) + let mut code_point_buf = Vec::with_capacity(6); + cursor.take_until(b'}', &mut code_point_buf)?; + + let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; + // We know this is a single unicode codepoint, convert to u32 + let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| { + Error::syntax("malformed Unicode character escape sequence", cursor.pos()) + })?; + + // UTF16Encoding of a numeric code point value + if code_point > 0x10_FFFF { + return Err(Error::syntax( + "Unicode codepoint must not be greater than 0x10FFFF in escape sequence", + cursor.pos(), + )); + } else if let Some(code_units_buf) = code_units_buf { + if code_point <= 65535 { + code_units_buf.push(code_point as u16); + } else { + let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; + let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; + code_units_buf.push(cu1); + code_units_buf.push(cu2); + } } - None => { - break; + + Ok(code_point) + } else { + // Hex4Digits + // Collect each character after \u e.g \uD83D will give "D83D" + let mut code_point_utf8_bytes = [0u8; 4]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + + // Convert to u16 + let code_point_str = str::from_utf8(&code_point_utf8_bytes) + .expect("malformed Unicode character escape sequence"); + let code_point = u16::from_str_radix(code_point_str, 16) + .map_err(|_| Error::syntax("invalid Unicode escape sequence", cursor.pos()))?; + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point); } + + Ok(code_point as u32) + } + } + + #[inline] + fn hex_escape_sequence( + cursor: &mut Cursor, + code_units_buf: Option<&mut Vec>, + ) -> Result + where + R: Read, + { + let mut code_point_utf8_bytes = [0u8; 2]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + let code_point_str = str::from_utf8(&code_point_utf8_bytes) + .expect("malformed Hexadecimal character escape sequence"); + let code_point = u16::from_str_radix(&code_point_str, 16) + .map_err(|_| Error::syntax("invalid Hexadecimal escape sequence", cursor.pos()))?; + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point); } + + Ok(code_point as u32) } - Ok(( - String::from_utf16_lossy(buf.as_slice()), - Span::new(start_pos, cursor.pos()), - )) + #[inline] + fn legacy_octal_escape_sequence( + cursor: &mut Cursor, + code_units_buf: Option<&mut Vec>, + strict_mode: bool, + init: u8, + ) -> Result + where + R: Read, + { + if strict_mode { + return Err(Error::syntax( + "octal escape sequences are deprecated", + cursor.pos(), + )); + } + let mut code_point = char::from(init).to_digit(8).unwrap(); + + match cursor.peek()? { + Some(c) if char::is_digit(char::from(c), 8) => { + let _ = cursor.next_byte()?; + code_point = code_point * 8 + char::from(init).to_digit(8).unwrap(); + if init <= b'3' { + match cursor.peek()? { + Some(c) if char::is_digit(char::from(c), 8) => { + let _ = cursor.next_byte(); + code_point = code_point * 8 + char::from(init).to_digit(8).unwrap(); + } + _ => (), + } + } + } + _ => (), + } + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point as u16); + } + + Ok(code_point) + } } diff --git a/boa/src/syntax/lexer/template.rs b/boa/src/syntax/lexer/template.rs index a34ba025238..23171e333a8 100644 --- a/boa/src/syntax/lexer/template.rs +++ b/boa/src/syntax/lexer/template.rs @@ -3,7 +3,7 @@ use super::{Cursor, Error, Tokenizer}; use crate::{ profiler::BoaProfiler, - syntax::lexer::string::{unescape_string, StringTerminator}, + syntax::lexer::string::{StringLiteral, StringTerminator}, syntax::{ ast::{Position, Span}, lexer::{Token, TokenKind}, @@ -44,7 +44,7 @@ impl Tokenizer for TemplateLiteral { match next_chr { '`' => { let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = unescape_string( + let (cooked, _) = StringLiteral::unescape_string( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, @@ -58,7 +58,7 @@ impl Tokenizer for TemplateLiteral { '$' if cursor.peek()? == Some(b'{') => { let _ = cursor.next_byte()?; let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = unescape_string( + let (cooked, _) = StringLiteral::unescape_string( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index f54b8f4b338..4f3e400b401 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -6,7 +6,7 @@ use super::token::Numeric; use super::*; use super::{Error, Position}; use crate::syntax::ast::Keyword; -use crate::syntax::lexer::string::{unescape_string, StringTerminator}; +use crate::syntax::lexer::string::{StringLiteral, StringTerminator}; use std::str; fn span(start: (u32, u32), end: (u32, u32)) -> Span { @@ -864,7 +864,7 @@ fn unicode_escape_with_braces_() { let mut cursor = Cursor::new(s.as_bytes()); - if let Ok((s, _)) = unescape_string( + if let Ok((s, _)) = StringLiteral::unescape_string( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -880,7 +880,7 @@ fn unicode_escape_with_braces_() { fn unescape_string_with_single_escape() { let s = r#"\Б"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); - let (s, _) = unescape_string( + let (s, _) = StringLiteral::unescape_string( &mut cursor, Position::new(1, 1), StringTerminator::End, From 54c6ffec7291a0c77519dbecba4f3c4602ba2676 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Sun, 17 Jan 2021 18:50:24 -0800 Subject: [PATCH 02/23] Fix octal escape in string literal --- boa/src/syntax/lexer/string.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 52d1b8c59cf..7fc0e21351a 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -112,12 +112,12 @@ impl StringLiteral { b'u' => { Self::unicode_escape_sequence(cursor, Some(&mut buf))?; } - n if char::is_digit(char::from(n), 8) => { + byte if (b'0'..b'8').contains(&byte) => { Self::legacy_octal_escape_sequence( cursor, Some(&mut buf), strict_mode, - n, + byte, )?; } _ => buf.push(escape as u16), @@ -237,7 +237,7 @@ impl StringLiteral { cursor: &mut Cursor, code_units_buf: Option<&mut Vec>, strict_mode: bool, - init: u8, + init_byte: u8, ) -> Result where R: Read, @@ -248,23 +248,26 @@ impl StringLiteral { cursor.pos(), )); } - let mut code_point = char::from(init).to_digit(8).unwrap(); + // Grammar: OctalDigit + let mut code_point = (init_byte - b'0') as u32; - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { + // Grammar: ZeroToThree OctalDigit + // Grammar: FourToSeven OctalDigit + if let Some(byte) = cursor.peek()? { + if (b'0'..b'8').contains(&byte) { let _ = cursor.next_byte()?; - code_point = code_point * 8 + char::from(init).to_digit(8).unwrap(); - if init <= b'3' { - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { - let _ = cursor.next_byte(); - code_point = code_point * 8 + char::from(init).to_digit(8).unwrap(); + code_point = (code_point * 8) + (byte - b'0') as u32; + + if (b'0'..b'4').contains(&init_byte) { + // Grammar: ZeroToThree OctalDigit OctalDigit + if let Some(byte) = cursor.peek()? { + if (b'0'..b'8').contains(&byte) { + let _ = cursor.next_byte()?; + code_point = (code_point * 8) + (byte - b'0') as u32; } - _ => (), } } } - _ => (), } if let Some(code_units_buf) = code_units_buf { From 4c9a78fa55848bb80dbd67a00e9e3e104ba0b185 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Sun, 17 Jan 2021 19:29:33 -0800 Subject: [PATCH 03/23] Add tests --- boa/src/syntax/lexer/tests.rs | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index 4f3e400b401..dca682a011f 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -890,6 +890,50 @@ fn unescape_string_with_single_escape() { assert_eq!(s, "Б"); } +#[test] +fn legacy_octal_escape() { + let test_cases = [ + (r#"\3"#, "\u{3}"), + (r#"\03"#, "\u{3}"), + (r#"\003"#, "\u{3}"), + (r#"\0003"#, "\u{0}3"), + (r#"\43"#, "#"), + (r#"\043"#, "#"), + (r#"\101"#, "A"), + ]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::unescape_string( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } +} + +#[test] +fn zero_escape() { + let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::unescape_string( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } +} + mod carriage_return { use super::*; From 06513e953108a7a0977dbd449cad1d5a56b5f700 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Sun, 17 Jan 2021 19:29:45 -0800 Subject: [PATCH 04/23] Fix zero escape --- boa/src/syntax/lexer/string.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 7fc0e21351a..489c364768c 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -105,7 +105,13 @@ impl StringLiteral { b't' => buf.push('\t' as u16), b'b' => buf.push('\x08' as u16), b'f' => buf.push('\x0c' as u16), - b'0' => buf.push('\0' as u16), + b'0' if cursor + .peek()? + .filter(|next_byte| (*next_byte as char).is_digit(10)) + .is_none() => + { + buf.push('\0' as u16) + } b'x' => { Self::hex_escape_sequence(cursor, Some(&mut buf))?; } From 0cae4439ecfb411bb16e140d013b603c1c5a7d6b Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 00:50:46 -0800 Subject: [PATCH 05/23] Fix zero escape lookahead --- boa/src/syntax/lexer/string.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 489c364768c..da8f73636c1 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -107,7 +107,8 @@ impl StringLiteral { b'f' => buf.push('\x0c' as u16), b'0' if cursor .peek()? - .filter(|next_byte| (*next_byte as char).is_digit(10)) + .and_then(|next_byte| char::try_from(next_byte).ok()) + .filter(|next_ch| next_ch.is_digit(10)) .is_none() => { buf.push('\0' as u16) @@ -197,7 +198,7 @@ impl StringLiteral { Ok(code_point) } else { - // Hex4Digits + // Grammar: Hex4Digits // Collect each character after \u e.g \uD83D will give "D83D" let mut code_point_utf8_bytes = [0u8; 4]; cursor.fill_bytes(&mut code_point_utf8_bytes)?; From e783fe464c9a2b563530dad802c21c2cea99824d Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 10:40:09 -0800 Subject: [PATCH 06/23] Rename variables --- boa/src/syntax/lexer/string.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index da8f73636c1..2fccd56eebc 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -76,9 +76,9 @@ impl StringLiteral { { let mut buf = Vec::new(); loop { - let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); + let next_ch = cursor.next_char()?.map(char::try_from).transpose().unwrap(); - match next_chr { + match next_ch { Some('\'') if terminator == StringTerminator::SingleQuote => { break; } @@ -135,10 +135,10 @@ impl StringLiteral { if next_ch.len_utf16() == 1 { buf.push(next_ch as u16); } else { - let mut code_point_bytes_buf = [0u16; 2]; - let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); + let mut code_units_buf = [0u16; 2]; + let code_units_buf = next_ch.encode_utf16(&mut code_units_buf); - buf.extend(code_point_bytes.iter()); + buf.extend(code_units_buf.iter()); } } None if terminator != StringTerminator::End => { From 1f6b7b2b28ddcf60117e8ff3b8d575a34df21cb4 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 10:43:43 -0800 Subject: [PATCH 07/23] Rename helper functions --- boa/src/syntax/lexer/string.rs | 16 ++++++++-------- boa/src/syntax/lexer/template.rs | 4 ++-- boa/src/syntax/lexer/tests.rs | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 2fccd56eebc..eed996f7068 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -58,14 +58,14 @@ impl Tokenizer for StringLiteral { let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let (lit, span) = - Self::unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; + Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?; Ok(Token::new(TokenKind::string_literal(lit), span)) } } impl StringLiteral { - pub(super) fn unescape_string( + pub(super) fn take_string_characters( cursor: &mut Cursor, start_pos: Position, terminator: StringTerminator, @@ -114,13 +114,13 @@ impl StringLiteral { buf.push('\0' as u16) } b'x' => { - Self::hex_escape_sequence(cursor, Some(&mut buf))?; + Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; } b'u' => { - Self::unicode_escape_sequence(cursor, Some(&mut buf))?; + Self::take_unicode_escape_sequence(cursor, Some(&mut buf))?; } byte if (b'0'..b'8').contains(&byte) => { - Self::legacy_octal_escape_sequence( + Self::take_legacy_octal_escape_sequence( cursor, Some(&mut buf), strict_mode, @@ -160,7 +160,7 @@ impl StringLiteral { } #[inline] - pub(super) fn unicode_escape_sequence( + pub(super) fn take_unicode_escape_sequence( cursor: &mut Cursor, code_units_buf: Option<&mut Vec>, ) -> Result @@ -218,7 +218,7 @@ impl StringLiteral { } #[inline] - fn hex_escape_sequence( + fn take_hex_escape_sequence( cursor: &mut Cursor, code_units_buf: Option<&mut Vec>, ) -> Result @@ -240,7 +240,7 @@ impl StringLiteral { } #[inline] - fn legacy_octal_escape_sequence( + fn take_legacy_octal_escape_sequence( cursor: &mut Cursor, code_units_buf: Option<&mut Vec>, strict_mode: bool, diff --git a/boa/src/syntax/lexer/template.rs b/boa/src/syntax/lexer/template.rs index 23171e333a8..ecec7a7387f 100644 --- a/boa/src/syntax/lexer/template.rs +++ b/boa/src/syntax/lexer/template.rs @@ -44,7 +44,7 @@ impl Tokenizer for TemplateLiteral { match next_chr { '`' => { let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = StringLiteral::unescape_string( + let (cooked, _) = StringLiteral::take_string_characters( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, @@ -58,7 +58,7 @@ impl Tokenizer for TemplateLiteral { '$' if cursor.peek()? == Some(b'{') => { let _ = cursor.next_byte()?; let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = StringLiteral::unescape_string( + let (cooked, _) = StringLiteral::take_string_characters( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index dca682a011f..61e8962f190 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -864,7 +864,7 @@ fn unicode_escape_with_braces_() { let mut cursor = Cursor::new(s.as_bytes()); - if let Ok((s, _)) = StringLiteral::unescape_string( + if let Ok((s, _)) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -880,7 +880,7 @@ fn unicode_escape_with_braces_() { fn unescape_string_with_single_escape() { let s = r#"\Б"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); - let (s, _) = StringLiteral::unescape_string( + let (s, _) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -904,7 +904,7 @@ fn legacy_octal_escape() { for (s, expected) in test_cases.iter() { let mut cursor = Cursor::new(s.as_bytes()); - let (s, _) = StringLiteral::unescape_string( + let (s, _) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -922,7 +922,7 @@ fn zero_escape() { for (s, expected) in test_cases.iter() { let mut cursor = Cursor::new(s.as_bytes()); - let (s, _) = StringLiteral::unescape_string( + let (s, _) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, From 067f2a1c27bd3c11e41608530bb416cc5e876a01 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 10:48:28 -0800 Subject: [PATCH 08/23] Refactor match arms --- boa/src/syntax/lexer/string.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index eed996f7068..008f7857a56 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -85,6 +85,9 @@ impl StringLiteral { Some('"') if terminator == StringTerminator::DoubleQuote => { break; } + None if terminator == StringTerminator::End => { + break; + } Some('\\') => { let _timer = BoaProfiler::global() .start_event("StringLiteral - escape sequence", "Lexing"); @@ -141,15 +144,12 @@ impl StringLiteral { buf.extend(code_units_buf.iter()); } } - None if terminator != StringTerminator::End => { + None => { return Err(Error::from(io::Error::new( ErrorKind::UnexpectedEof, "unterminated string literal", ))); } - None => { - break; - } } } From 6cca4d9b274d1b8394e0dcc31dfd932ff51aeb57 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 19:02:53 -0800 Subject: [PATCH 09/23] Fix escape line terminator sequence --- boa/src/syntax/lexer/string.rs | 98 ++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 008f7857a56..f97837748f5 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -65,6 +65,20 @@ impl Tokenizer for StringLiteral { } impl StringLiteral { + /// Checks if a character is LineTerminator as per ECMAScript standards. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#prod-LineTerminator + #[inline] + pub(super) fn is_line_terminator(ch: char) -> bool { + matches!( + ch, + '\u{000A}' /* */ | '\u{000D}' /* */ | '\u{2028}' /* */ | '\u{2029}' /* */ + ) + } + pub(super) fn take_string_characters( cursor: &mut Cursor, start_pos: Position, @@ -92,47 +106,51 @@ impl StringLiteral { let _timer = BoaProfiler::global() .start_event("StringLiteral - escape sequence", "Lexing"); - let escape = cursor.peek()?.ok_or_else(|| { - Error::from(io::Error::new( - ErrorKind::UnexpectedEof, - "unterminated escape sequence in literal", - )) - })?; + let escape_ch = cursor + .next_char()? + .and_then(|byte| char::try_from(byte).ok()) + .ok_or_else(|| { + Error::from(io::Error::new( + ErrorKind::UnexpectedEof, + "unterminated escape sequence in literal", + )) + })?; - if escape <= 0x7f { - let _ = cursor.next_byte()?; - match escape { - b'\n' => (), - b'n' => buf.push('\n' as u16), - b'r' => buf.push('\r' as u16), - b't' => buf.push('\t' as u16), - b'b' => buf.push('\x08' as u16), - b'f' => buf.push('\x0c' as u16), - b'0' if cursor - .peek()? - .and_then(|next_byte| char::try_from(next_byte).ok()) - .filter(|next_ch| next_ch.is_digit(10)) - .is_none() => - { - buf.push('\0' as u16) - } - b'x' => { - Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; - } - b'u' => { - Self::take_unicode_escape_sequence(cursor, Some(&mut buf))?; - } - byte if (b'0'..b'8').contains(&byte) => { - Self::take_legacy_octal_escape_sequence( - cursor, - Some(&mut buf), - strict_mode, - byte, - )?; - } - _ => buf.push(escape as u16), - }; - } + match escape_ch { + 'b' => buf.push('\x08' as u16), + 'f' => buf.push('\x0c' as u16), + 'n' => buf.push('\n' as u16), + 'r' => buf.push('\r' as u16), + 't' => buf.push('\t' as u16), + '0' if cursor + .peek()? + .and_then(|next_byte| char::try_from(next_byte).ok()) + .filter(|next_ch| next_ch.is_digit(10)) + .is_none() => + { + buf.push('\0' as u16) + } + 'x' => { + Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; + } + 'u' => { + Self::take_unicode_escape_sequence(cursor, Some(&mut buf))?; + } + _ if escape_ch.is_digit(10) => { + Self::take_legacy_octal_escape_sequence( + cursor, + Some(&mut buf), + strict_mode, + escape_ch as u8, + )?; + } + _ if Self::is_line_terminator(escape_ch) => { + // Check match LineContinuation + // Grammar: \ LineTerminatorSequence + // do nothing, continue lexing + } + _ => buf.push(escape_ch as u16), + }; } Some(next_ch) => { if next_ch.len_utf16() == 1 { From 83e86475250ab6498c815d266d4179380b9279e8 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 19:11:50 -0800 Subject: [PATCH 10/23] Fix single character escape --- boa/src/syntax/lexer/string.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index f97837748f5..6849791f5a6 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -117,11 +117,15 @@ impl StringLiteral { })?; match escape_ch { - 'b' => buf.push('\x08' as u16), - 'f' => buf.push('\x0c' as u16), - 'n' => buf.push('\n' as u16), - 'r' => buf.push('\r' as u16), - 't' => buf.push('\t' as u16), + 'b' => buf.push('\u{0008}' as u16 /* */), + 't' => buf.push('\u{0009}' as u16 /* */), + 'n' => buf.push('\u{000A}' as u16 /* */), + 'v' => buf.push('\u{000B}' as u16 /* */), + 'f' => buf.push('\u{000C}' as u16 /* */), + 'r' => buf.push('\u{000D}' as u16 /* */), + '"' => buf.push('\u{0022}' as u16 /* " */), + '\'' => buf.push('\u{0027}' as u16 /* ' */), + '\\' => buf.push('\u{005C}' as u16 /* \ */), '0' if cursor .peek()? .and_then(|next_byte| char::try_from(next_byte).ok()) @@ -145,9 +149,9 @@ impl StringLiteral { )?; } _ if Self::is_line_terminator(escape_ch) => { - // Check match LineContinuation + // Match LineContinuation // Grammar: \ LineTerminatorSequence - // do nothing, continue lexing + // LineContinuation is the empty String. Do nothing and continue lexing. } _ => buf.push(escape_ch as u16), }; From 5fd86a1a67301b253b7d7137ad81772ea7336c51 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:04:22 -0800 Subject: [PATCH 11/23] Fix escape followed by unicode char --- boa/src/syntax/lexer/string.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 6849791f5a6..fd203763649 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -132,7 +132,7 @@ impl StringLiteral { .filter(|next_ch| next_ch.is_digit(10)) .is_none() => { - buf.push('\0' as u16) + buf.push('\u{0000}' as u16 /* NULL */) } 'x' => { Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; @@ -153,17 +153,20 @@ impl StringLiteral { // Grammar: \ LineTerminatorSequence // LineContinuation is the empty String. Do nothing and continue lexing. } - _ => buf.push(escape_ch as u16), + _ => { + if escape_ch.len_utf16() == 1 { + buf.push(escape_ch as u16); + } else { + buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter()); + } + } }; } Some(next_ch) => { if next_ch.len_utf16() == 1 { buf.push(next_ch as u16); } else { - let mut code_units_buf = [0u16; 2]; - let code_units_buf = next_ch.encode_utf16(&mut code_units_buf); - - buf.extend(code_units_buf.iter()); + buf.extend(next_ch.encode_utf16(&mut [0u16; 2]).iter()); } } None => { From b176702fae9c91ed3acc2e7dc910bf7f74fee2d0 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:21:37 -0800 Subject: [PATCH 12/23] Add NonOctalDecimalEscapeSequence --- boa/src/syntax/lexer/string.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index fd203763649..2e02b374427 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -140,7 +140,18 @@ impl StringLiteral { 'u' => { Self::take_unicode_escape_sequence(cursor, Some(&mut buf))?; } - _ if escape_ch.is_digit(10) => { + '8' | '9' => { + // Grammar: NonOctalDecimalEscapeSequence + if strict_mode { + return Err(Error::syntax( + "\\8 and \\9 are not allowed in strict mode.", + cursor.pos(), + )); + } else { + buf.push(escape_ch as u16); + } + } + _ if escape_ch.is_digit(8) => { Self::take_legacy_octal_escape_sequence( cursor, Some(&mut buf), From 858a74d9927dadbd171fe89608d3cf7739af9158 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:22:05 -0800 Subject: [PATCH 13/23] Fix comment --- boa/src/syntax/lexer/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 2e02b374427..88894b9290c 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -160,7 +160,7 @@ impl StringLiteral { )?; } _ if Self::is_line_terminator(escape_ch) => { - // Match LineContinuation + // Grammar: LineContinuation // Grammar: \ LineTerminatorSequence // LineContinuation is the empty String. Do nothing and continue lexing. } From e4bf635b94e37c1f0e0dd0439df082246cbf18fa Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:24:59 -0800 Subject: [PATCH 14/23] Refactor --- boa/src/syntax/lexer/string.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 88894b9290c..b8b27fd3412 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -117,22 +117,22 @@ impl StringLiteral { })?; match escape_ch { - 'b' => buf.push('\u{0008}' as u16 /* */), - 't' => buf.push('\u{0009}' as u16 /* */), - 'n' => buf.push('\u{000A}' as u16 /* */), - 'v' => buf.push('\u{000B}' as u16 /* */), - 'f' => buf.push('\u{000C}' as u16 /* */), - 'r' => buf.push('\u{000D}' as u16 /* */), - '"' => buf.push('\u{0022}' as u16 /* " */), - '\'' => buf.push('\u{0027}' as u16 /* ' */), - '\\' => buf.push('\u{005C}' as u16 /* \ */), + 'b' => buf.push(0x0008 /* */), + 't' => buf.push(0x0009 /* */), + 'n' => buf.push(0x000A /* */), + 'v' => buf.push(0x000B /* */), + 'f' => buf.push(0x000C /* */), + 'r' => buf.push(0x000D /* */), + '"' => buf.push(0x0022 /* " */), + '\'' => buf.push(0x0027 /* ' */), + '\\' => buf.push(0x005C /* \ */), '0' if cursor .peek()? .and_then(|next_byte| char::try_from(next_byte).ok()) .filter(|next_ch| next_ch.is_digit(10)) .is_none() => { - buf.push('\u{0000}' as u16 /* NULL */) + buf.push(0x0000 /* NULL */) } 'x' => { Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; From 835a5bb88cd818fe2c259ef5242af2e78bdbca71 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:26:22 -0800 Subject: [PATCH 15/23] Modify error message --- boa/src/syntax/lexer/string.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index b8b27fd3412..6cf1e991b88 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -144,7 +144,7 @@ impl StringLiteral { // Grammar: NonOctalDecimalEscapeSequence if strict_mode { return Err(Error::syntax( - "\\8 and \\9 are not allowed in strict mode.", + "\\8 and \\9 are not allowed in strict mode", cursor.pos(), )); } else { @@ -287,7 +287,7 @@ impl StringLiteral { { if strict_mode { return Err(Error::syntax( - "octal escape sequences are deprecated", + "octal escape sequences are not allowed in strict mode", cursor.pos(), )); } From d67cec2dbff4cf9f6282ea5d5a6bdfde2ea49e60 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 20:56:19 -0800 Subject: [PATCH 16/23] Add tests --- boa/src/syntax/lexer/tests.rs | 55 +++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index 61e8962f190..496ca44174a 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -914,6 +914,17 @@ fn legacy_octal_escape() { assert_eq!(s, *expected); } + + for (s, _) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + true, + ) + .expect_err("Octal-escape in strict mode not rejected as expected"); + } } #[test] @@ -934,6 +945,50 @@ fn zero_escape() { } } +#[test] +fn non_octal_decimal_escape() { + let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } + + for (s, _) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + true, + ) + .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected"); + } +} + +#[test] +fn line_continuation() { + let s = "hello \\\nworld"; + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, "hello world"); +} + mod carriage_return { use super::*; From f59a002816aa88711ee9ad383e44e23dd4efd2f6 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 21:02:07 -0800 Subject: [PATCH 17/23] Rename tests --- boa/src/syntax/lexer/tests.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index 496ca44174a..e9d01d46915 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() { } #[test] -fn codepoint_with_no_braces() { +fn string_codepoint_with_no_braces() { let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]); assert!(lexer.next().is_ok()); } @@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() { } #[test] -fn non_english_str() { +fn string_unicode() { let str = r#"'中文';"#; let mut lexer = Lexer::new(str.as_bytes()); @@ -828,7 +828,7 @@ fn non_english_str() { } #[test] -fn unicode_escape_with_braces() { +fn string_unicode_escape_with_braces() { let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]); let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())]; @@ -859,7 +859,7 @@ fn unicode_escape_with_braces() { } #[test] -fn unicode_escape_with_braces_() { +fn take_string_characters_unicode_escape_with_braces_2() { let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); @@ -877,7 +877,7 @@ fn unicode_escape_with_braces_() { } #[test] -fn unescape_string_with_single_escape() { +fn take_string_characters_with_single_escape() { let s = r#"\Б"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); let (s, _) = StringLiteral::take_string_characters( @@ -891,7 +891,7 @@ fn unescape_string_with_single_escape() { } #[test] -fn legacy_octal_escape() { +fn take_string_characters_legacy_octal_escape() { let test_cases = [ (r#"\3"#, "\u{3}"), (r#"\03"#, "\u{3}"), @@ -928,7 +928,7 @@ fn legacy_octal_escape() { } #[test] -fn zero_escape() { +fn take_string_characters_zero_escape() { let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")]; for (s, expected) in test_cases.iter() { @@ -946,7 +946,7 @@ fn zero_escape() { } #[test] -fn non_octal_decimal_escape() { +fn take_string_characters_non_octal_decimal_escape() { let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")]; for (s, expected) in test_cases.iter() { @@ -975,7 +975,7 @@ fn non_octal_decimal_escape() { } #[test] -fn line_continuation() { +fn take_string_characters_line_continuation() { let s = "hello \\\nworld"; let mut cursor = Cursor::new(s.as_bytes()); let (s, _) = StringLiteral::take_string_characters( From 983d786d6b67cfe65aa16c2f3cabca2d14c42f34 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Mon, 18 Jan 2021 21:30:16 -0800 Subject: [PATCH 18/23] Add test for error --- boa/src/syntax/lexer/string.rs | 33 +++++++++++++++++++-------------- boa/src/syntax/lexer/tests.rs | 20 ++++++++++++++++---- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 6cf1e991b88..6449d554c90 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -90,9 +90,10 @@ impl StringLiteral { { let mut buf = Vec::new(); loop { - let next_ch = cursor.next_char()?.map(char::try_from).transpose().unwrap(); + let ch_start_pos = cursor.pos(); + let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap(); - match next_ch { + match ch { Some('\'') if terminator == StringTerminator::SingleQuote => { break; } @@ -135,17 +136,17 @@ impl StringLiteral { buf.push(0x0000 /* NULL */) } 'x' => { - Self::take_hex_escape_sequence(cursor, Some(&mut buf))?; + Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?; } 'u' => { - Self::take_unicode_escape_sequence(cursor, Some(&mut buf))?; + Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?; } '8' | '9' => { // Grammar: NonOctalDecimalEscapeSequence if strict_mode { return Err(Error::syntax( "\\8 and \\9 are not allowed in strict mode", - cursor.pos(), + ch_start_pos, )); } else { buf.push(escape_ch as u16); @@ -154,6 +155,7 @@ impl StringLiteral { _ if escape_ch.is_digit(8) => { Self::take_legacy_octal_escape_sequence( cursor, + ch_start_pos, Some(&mut buf), strict_mode, escape_ch as u8, @@ -173,11 +175,11 @@ impl StringLiteral { } }; } - Some(next_ch) => { - if next_ch.len_utf16() == 1 { - buf.push(next_ch as u16); + Some(ch) => { + if ch.len_utf16() == 1 { + buf.push(ch as u16); } else { - buf.extend(next_ch.encode_utf16(&mut [0u16; 2]).iter()); + buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter()); } } None => { @@ -198,6 +200,7 @@ impl StringLiteral { #[inline] pub(super) fn take_unicode_escape_sequence( cursor: &mut Cursor, + start_pos: Position, code_units_buf: Option<&mut Vec>, ) -> Result where @@ -212,14 +215,14 @@ impl StringLiteral { let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; // We know this is a single unicode codepoint, convert to u32 let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax("malformed Unicode character escape sequence", cursor.pos()) + Error::syntax("malformed Unicode character escape sequence", start_pos) })?; // UTF16Encoding of a numeric code point value if code_point > 0x10_FFFF { return Err(Error::syntax( "Unicode codepoint must not be greater than 0x10FFFF in escape sequence", - cursor.pos(), + start_pos, )); } else if let Some(code_units_buf) = code_units_buf { if code_point <= 65535 { @@ -243,7 +246,7 @@ impl StringLiteral { let code_point_str = str::from_utf8(&code_point_utf8_bytes) .expect("malformed Unicode character escape sequence"); let code_point = u16::from_str_radix(code_point_str, 16) - .map_err(|_| Error::syntax("invalid Unicode escape sequence", cursor.pos()))?; + .map_err(|_| Error::syntax("invalid Unicode escape sequence", start_pos))?; if let Some(code_units_buf) = code_units_buf { code_units_buf.push(code_point); @@ -256,6 +259,7 @@ impl StringLiteral { #[inline] fn take_hex_escape_sequence( cursor: &mut Cursor, + start_pos: Position, code_units_buf: Option<&mut Vec>, ) -> Result where @@ -266,7 +270,7 @@ impl StringLiteral { let code_point_str = str::from_utf8(&code_point_utf8_bytes) .expect("malformed Hexadecimal character escape sequence"); let code_point = u16::from_str_radix(&code_point_str, 16) - .map_err(|_| Error::syntax("invalid Hexadecimal escape sequence", cursor.pos()))?; + .map_err(|_| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; if let Some(code_units_buf) = code_units_buf { code_units_buf.push(code_point); @@ -278,6 +282,7 @@ impl StringLiteral { #[inline] fn take_legacy_octal_escape_sequence( cursor: &mut Cursor, + start_pos: Position, code_units_buf: Option<&mut Vec>, strict_mode: bool, init_byte: u8, @@ -288,7 +293,7 @@ impl StringLiteral { if strict_mode { return Err(Error::syntax( "octal escape sequences are not allowed in strict mode", - cursor.pos(), + start_pos, )); } // Grammar: OctalDigit diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index e9d01d46915..7ef4a34bc04 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -917,13 +917,19 @@ fn take_string_characters_legacy_octal_escape() { for (s, _) in test_cases.iter() { let mut cursor = Cursor::new(s.as_bytes()); - StringLiteral::take_string_characters( + + if let Error::Syntax(_, pos) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, true, ) - .expect_err("Octal-escape in strict mode not rejected as expected"); + .expect_err("Octal-escape in strict mode not rejected as expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } } } @@ -964,13 +970,19 @@ fn take_string_characters_non_octal_decimal_escape() { for (s, _) in test_cases.iter() { let mut cursor = Cursor::new(s.as_bytes()); - StringLiteral::take_string_characters( + + if let Error::Syntax(_, pos) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, true, ) - .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected"); + .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } } } From 17803d0e8f418ef63086f287173edd3562d70bb2 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Tue, 19 Jan 2021 12:12:49 -0800 Subject: [PATCH 19/23] Add comments for unsafe bytes to str --- boa/src/syntax/lexer/string.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 6449d554c90..99c1964617c 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -212,8 +212,9 @@ impl StringLiteral { let mut code_point_buf = Vec::with_capacity(6); cursor.take_until(b'}', &mut code_point_buf)?; + // Safty: invalid UTF-8 bytes will be handled by returning Err in the following `u32::from_str_radix` let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; - // We know this is a single unicode codepoint, convert to u32 + // The `code_point_str` should represent a single unicode codepoint, convert to u32 let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| { Error::syntax("malformed Unicode character escape sequence", start_pos) })?; From 99a6096fb11293f0275261249c1dd52fcceafb00 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Tue, 19 Jan 2021 12:52:38 -0800 Subject: [PATCH 20/23] Update boa/src/syntax/lexer/string.rs Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com> --- boa/src/syntax/lexer/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 99c1964617c..9fc4d888042 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -212,7 +212,7 @@ impl StringLiteral { let mut code_point_buf = Vec::with_capacity(6); cursor.take_until(b'}', &mut code_point_buf)?; - // Safty: invalid UTF-8 bytes will be handled by returning Err in the following `u32::from_str_radix` + // Safety: invalid UTF-8 bytes will be handled by returning Err in the following `u32::from_str_radix` let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; // The `code_point_str` should represent a single unicode codepoint, convert to u32 let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| { From b9d7b02d902d91e7aaca13767be35b105db55f12 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Tue, 19 Jan 2021 13:50:02 -0800 Subject: [PATCH 21/23] Minor refactor --- boa/src/syntax/lexer/string.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 9fc4d888042..a6c9e198e48 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -129,8 +129,7 @@ impl StringLiteral { '\\' => buf.push(0x005C /* \ */), '0' if cursor .peek()? - .and_then(|next_byte| char::try_from(next_byte).ok()) - .filter(|next_ch| next_ch.is_digit(10)) + .filter(|next_byte| (b'0'..=b'9').contains(next_byte)) .is_none() => { buf.push(0x0000 /* NULL */) @@ -303,14 +302,14 @@ impl StringLiteral { // Grammar: ZeroToThree OctalDigit // Grammar: FourToSeven OctalDigit if let Some(byte) = cursor.peek()? { - if (b'0'..b'8').contains(&byte) { + if (b'0'..=b'7').contains(&byte) { let _ = cursor.next_byte()?; code_point = (code_point * 8) + (byte - b'0') as u32; - if (b'0'..b'4').contains(&init_byte) { + if (b'0'..=b'3').contains(&init_byte) { // Grammar: ZeroToThree OctalDigit OctalDigit if let Some(byte) = cursor.peek()? { - if (b'0'..b'8').contains(&byte) { + if (b'0'..=b'7').contains(&byte) { let _ = cursor.next_byte()?; code_point = (code_point * 8) + (byte - b'0') as u32; } From dcf668e001fd46e254a6dd113c8bdc5966dc319c Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Tue, 19 Jan 2021 15:32:29 -0800 Subject: [PATCH 22/23] Remove unsafe bytes to str --- boa/src/syntax/lexer/string.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index a6c9e198e48..f0a55e8cf4d 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -211,12 +211,15 @@ impl StringLiteral { let mut code_point_buf = Vec::with_capacity(6); cursor.take_until(b'}', &mut code_point_buf)?; - // Safety: invalid UTF-8 bytes will be handled by returning Err in the following `u32::from_str_radix` - let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; - // The `code_point_str` should represent a single unicode codepoint, convert to u32 - let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax("malformed Unicode character escape sequence", start_pos) - })?; + let code_point = str::from_utf8(code_point_buf.as_slice()) + .ok() + .and_then(|code_point_str| { + // The `code_point_str` should represent a single unicode codepoint, convert to u32 + u32::from_str_radix(&code_point_str, 16).ok() + }) + .ok_or_else(|| { + Error::syntax("malformed Unicode character escape sequence", start_pos) + })?; // UTF16Encoding of a numeric code point value if code_point > 0x10_FFFF { From 6fa0642376a3c18c36f5732442a68328a03fb2e1 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Tue, 19 Jan 2021 15:39:09 -0800 Subject: [PATCH 23/23] Fix panic when reading invalid utf-8 chars --- boa/src/syntax/lexer/string.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index f0a55e8cf4d..b4542a70d41 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -246,10 +246,10 @@ impl StringLiteral { cursor.fill_bytes(&mut code_point_utf8_bytes)?; // Convert to u16 - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Unicode character escape sequence"); - let code_point = u16::from_str_radix(code_point_str, 16) - .map_err(|_| Error::syntax("invalid Unicode escape sequence", start_pos))?; + let code_point = str::from_utf8(&code_point_utf8_bytes) + .ok() + .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok()) + .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; if let Some(code_units_buf) = code_units_buf { code_units_buf.push(code_point); @@ -270,10 +270,10 @@ impl StringLiteral { { let mut code_point_utf8_bytes = [0u8; 2]; cursor.fill_bytes(&mut code_point_utf8_bytes)?; - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Hexadecimal character escape sequence"); - let code_point = u16::from_str_radix(&code_point_str, 16) - .map_err(|_| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; + let code_point = str::from_utf8(&code_point_utf8_bytes) + .ok() + .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok()) + .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; if let Some(code_units_buf) = code_units_buf { code_units_buf.push(code_point);