Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix lexing escapes in string literal and minor refactor #1079

Merged
merged 23 commits into from
Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
356 changes: 207 additions & 149 deletions boa/src/syntax/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,171 +58,229 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");

let (lit, span) =
unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Self::unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?;

Ok(Token::new(TokenKind::string_literal(lit), span))
}
}

pub(super) fn unescape_string<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
strict_mode: bool,
) -> Result<(String, Span), Error>
where
R: Read,
{
let mut buf = Vec::new();
loop {
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();

match next_chr {
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
Some('\\') => {
let _timer =
BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
impl StringLiteral {
pub(super) fn unescape_string<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
strict_mode: bool,
) -> Result<(String, Span), Error>
where
R: Read,
{
let mut buf = Vec::new();
loop {
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();

let escape = cursor.peek()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal",
))
})?;

if escape <= 0x7f {
let _ = cursor.next_byte()?;
match escape {
b'\n' => (),
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' => buf.push('\0' as u16),
b'x' => {
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
let code_point =
u16::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Hexadecimal escape sequence",
cursor.pos(),
)
})?;

buf.push(code_point);
}
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;

let code_point_str =
unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
// We know this is a single unicode codepoint, convert to u32
let code_point =
u32::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
})?;

// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
}
} else {
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;

// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
let code_point =
u16::from_str_radix(code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?;

buf.push(code_point);
match next_chr {
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
Some('\\') => {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");

let escape = cursor.peek()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal",
))
})?;

if escape <= 0x7f {
let _ = cursor.next_byte()?;
match escape {
b'\n' => (),
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' if cursor
.peek()?
.and_then(|next_byte| char::try_from(next_byte).ok())
.filter(|next_ch| next_ch.is_digit(10))
.is_none() =>
{
buf.push('\0' as u16)
}
}
n if char::is_digit(char::from(n), 8) => {
if strict_mode {
return Err(Error::syntax(
"octal escape sequences are deprecated",
cursor.pos(),
));
b'x' => {
Self::hex_escape_sequence(cursor, Some(&mut buf))?;
}
let mut o = char::from(n).to_digit(8).unwrap();

match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte()?;
o = o * 8 + char::from(n).to_digit(8).unwrap();
if n <= b'3' {
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte();
o = o * 8 + char::from(n).to_digit(8).unwrap();
}
_ => (),
}
}
}
_ => (),
b'u' => {
Self::unicode_escape_sequence(cursor, Some(&mut buf))?;
}
buf.push(o as u16);
}
_ => buf.push(escape as u16),
};
byte if (b'0'..b'8').contains(&byte) => {
Self::legacy_octal_escape_sequence(
cursor,
Some(&mut buf),
strict_mode,
byte,
)?;
}
_ => buf.push(escape as u16),
};
}
}
}
Some(next_ch) => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
Some(next_ch) => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);

buf.extend(code_point_bytes.iter());
buf.extend(code_point_bytes.iter());
}
}
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
None => {
break;
}
}
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}

Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
}

#[inline]
pub(super) fn unicode_escape_sequence<R>(
cursor: &mut Cursor<R>,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
// Support \u{X..X} (Unicode CodePoint)
if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;

let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
jevancc marked this conversation as resolved.
Show resolved Hide resolved
// We know this is a single unicode codepoint, convert to u32
let code_point = u32::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax("malformed Unicode character escape sequence", cursor.pos())
})?;

// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax(
"Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
cursor.pos(),
));
} else if let Some(code_units_buf) = code_units_buf {
if code_point <= 65535 {
code_units_buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
code_units_buf.push(cu1);
code_units_buf.push(cu2);
}
}
None => {
break;

Ok(code_point)
} else {
// Grammar: Hex4Digits
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;

// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
let code_point = u16::from_str_radix(code_point_str, 16)
.map_err(|_| Error::syntax("invalid Unicode escape sequence", cursor.pos()))?;

if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
}

Ok(code_point as u32)
}
}

Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
#[inline]
fn hex_escape_sequence<R>(
cursor: &mut Cursor<R>,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
let code_point = u16::from_str_radix(&code_point_str, 16)
.map_err(|_| Error::syntax("invalid Hexadecimal escape sequence", cursor.pos()))?;

if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
}

Ok(code_point as u32)
}

/// Lexes a legacy octal escape sequence whose first digit, `init_byte`, has
/// already been read by the caller.
///
/// Up to two further octal digits are consumed when `init_byte` is `0`-`3`,
/// and at most one otherwise. When `code_units_buf` is `Some`, the resulting
/// value is appended to it as a UTF-16 code unit. The value is also returned.
///
/// # Errors
///
/// Returns a syntax error in strict mode. Errors reported by the cursor are
/// propagated.
#[inline]
fn legacy_octal_escape_sequence<R>(
    cursor: &mut Cursor<R>,
    code_units_buf: Option<&mut Vec<u16>>,
    strict_mode: bool,
    init_byte: u8,
) -> Result<u32, Error>
where
    R: Read,
{
    if strict_mode {
        return Err(Error::syntax(
            "octal escape sequences are deprecated",
            cursor.pos(),
        ));
    }

    // Grammar: OctalDigit
    let mut value = (init_byte - b'0') as u32;

    // Grammar: ZeroToThree OctalDigit OctalDigit  -> two more digits allowed
    // Grammar: FourToSeven OctalDigit             -> one more digit allowed
    let extra_digits = if matches!(init_byte, b'0'..=b'3') { 2 } else { 1 };

    for _ in 0..extra_digits {
        match cursor.peek()? {
            Some(digit) if matches!(digit, b'0'..=b'7') => {
                let _ = cursor.next_byte()?;
                value = value * 8 + (digit - b'0') as u32;
            }
            // The first non-octal byte (or end of input) ends the sequence.
            _ => break,
        }
    }

    if let Some(units) = code_units_buf {
        units.push(value as u16);
    }

    Ok(value)
}
}
Loading