From ae05f562477b006a326bfde722f828b7c32bc835 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Sun, 17 Dec 2023 04:36:06 +0100 Subject: [PATCH 1/6] Remove unused lexer code --- core/parser/src/lexer/cursor.rs | 43 ---------------------------- core/parser/src/lexer/tests.rs | 50 --------------------------------- 2 files changed, 93 deletions(-) diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index b191781136e..d6f05098878 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -128,21 +128,6 @@ where }) } - /// Applies the predicate to the next UTF-8 character and returns the result. - /// Returns false if there is no next character, otherwise returns the result from the - /// predicate on the ascii char - /// - /// The buffer is not incremented. - #[cfg(test)] - pub(super) fn next_is_char_pred(&mut self, pred: &F) -> io::Result - where - F: Fn(u32) -> bool, - { - let _timer = Profiler::global().start_event("cursor::next_is_char_pred()", "Lexing"); - - Ok(self.peek_char()?.map_or(false, pred)) - } - /// Fills the buffer with all bytes until the stop byte is found. /// Returns error when reaching the end of the buffer. /// @@ -186,34 +171,6 @@ where } } - /// Fills the buffer with characters until the first character for which the predicate (pred) is false. - /// It also stops when there is no next character. - /// - /// Note that all characters up until the stop character are added to the buffer, including the character right before. - #[cfg(test)] - pub(super) fn take_while_char_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> - where - F: Fn(u32) -> bool, - { - let _timer = Profiler::global().start_event("cursor::take_while_char_pred()", "Lexing"); - - loop { - if !self.next_is_char_pred(pred)? { - return Ok(()); - } else if let Some(ch) = self.peek_char()? { - for _ in 0..utf8_len(ch) { - buf.push( - self.next_byte()? - .expect("already checked that the next character exists"), - ); - } - } else { - // next_is_pred will return false if the next value is None so the None case should already be handled. - unreachable!(); - } - } - } - /// It will fill the buffer with bytes. /// /// This expects for the buffer to be fully filled. If it's not, it will fail with an diff --git a/core/parser/src/lexer/tests.rs b/core/parser/src/lexer/tests.rs index ff363d26213..5925166facb 100644 --- a/core/parser/src/lexer/tests.rs +++ b/core/parser/src/lexer/tests.rs @@ -826,56 +826,6 @@ fn take_while_ascii_pred_non_ascii_stop() { assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde"); } -#[test] -fn take_while_char_pred_simple() { - let mut cur = Cursor::new(&b"abcdefghijk"[..]); - - let mut buf: Vec = Vec::new(); - - cur.take_while_char_pred(&mut buf, &|c| { - c == 'a' as u32 || c == 'b' as u32 || c == 'c' as u32 - }) - .unwrap(); - - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); -} - -#[test] -fn take_while_char_pred_immediate_stop() { - let mut cur = Cursor::new(&b"abcdefghijk"[..]); - - let mut buf: Vec = Vec::new(); - - cur.take_while_char_pred(&mut buf, &|_| false).unwrap(); - - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); -} - -#[test] -fn take_while_char_pred_entire_str() { - let mut cur = Cursor::new(&b"abcdefghijk"[..]); - - let mut buf: Vec = Vec::new(); - - cur.take_while_char_pred(&mut buf, &|_| true).unwrap(); - - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); -} - -#[test] -fn take_while_char_pred_utf8_char() { - let mut cur = Cursor::new("abc😀defghijk".as_bytes()); - - let mut buf: Vec = Vec::new(); - - cur.take_while_char_pred(&mut buf, &|c| { - char::try_from(c).map_or(false, |c| c == 'a' || c == 'b' || c == 'c' || c == '😀') - }) - .unwrap(); - - assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc😀"); -} - #[test] fn illegal_following_numeric_literal() { // Checks as per https://tc39.es/ecma262/#sec-literals-numeric-literals that a NumericLiteral cannot From 5089493a23a6fa803b6c1c51a04d4f6303c53a5a Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Sat, 23 Dec 2023 00:39:42 +0100 Subject: [PATCH 2/6] Remove utf8 byte handling in lexer --- core/engine/src/tests/operators.rs | 2 +- core/parser/src/lexer/comment.rs | 2 +- core/parser/src/lexer/cursor.rs | 281 ++++---------------- core/parser/src/lexer/identifier.rs | 8 +- core/parser/src/lexer/mod.rs | 60 +++-- core/parser/src/lexer/number.rs | 142 +++++----- core/parser/src/lexer/operator.rs | 51 ++-- core/parser/src/lexer/private_identifier.rs | 2 +- core/parser/src/lexer/regex.rs | 81 +++--- core/parser/src/lexer/spread.rs | 4 +- core/parser/src/lexer/string.rs | 122 ++++++--- core/parser/src/lexer/template.rs | 23 +- 12 files changed, 338 insertions(+), 440 deletions(-) diff --git a/core/engine/src/tests/operators.rs b/core/engine/src/tests/operators.rs index 6d688e1041b..1ec6f657dd6 100644 --- a/core/engine/src/tests/operators.rs +++ b/core/engine/src/tests/operators.rs @@ -334,7 +334,7 @@ fn assignment_to_non_assignable_ctd() { TestAction::assert_native_error( src, JsNativeErrorKind::Syntax, - "Invalid left-hand side in assignment at line 1, col 13", + "Invalid left-hand side in assignment at line 1, col 12", ) }), ); diff --git a/core/parser/src/lexer/comment.rs b/core/parser/src/lexer/comment.rs index 6b23e19ed7e..5588682dcb9 100644 --- a/core/parser/src/lexer/comment.rs +++ b/core/parser/src/lexer/comment.rs @@ -74,7 +74,7 @@ impl Tokenizer for MultiLineComment { while let Some(ch) = cursor.next_char()? { let tried_ch = char::try_from(ch); match tried_ch { - Ok(c) if c == '*' && cursor.next_is(b'/')? => { + Ok(c) if c == '*' && cursor.next_if(0x2F /* / */)? => { return Ok(Token::new( if new_line { TokenKind::LineTerminator diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index d6f05098878..11261eafc38 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -10,6 +10,7 @@ pub(super) struct Cursor { pos: Position, module: bool, strict: bool, + peeked: [Option; 4], } impl Cursor { @@ -19,7 +20,7 @@ impl Cursor { } /// Advances the position to the next column. - pub(super) fn next_column(&mut self) { + fn next_column(&mut self) { let current_line = self.pos.line_number(); let next_column = self.pos.column_number() + 1; self.pos = Position::new(current_line, next_column); @@ -64,6 +65,7 @@ where pos: Position::new(1, 1), strict: false, module: false, + peeked: [None; 4], } } @@ -74,41 +76,47 @@ where pos, strict: false, module: false, + peeked: [None; 4], } } - /// Peeks the next byte. - pub(super) fn peek(&mut self) -> Result, Error> { - let _timer = Profiler::global().start_event("cursor::peek()", "Lexing"); - - self.iter.peek_byte() - } - /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). - pub(super) fn peek_n(&mut self, n: u8) -> Result<&[u8], Error> { + pub(super) fn peek_n(&mut self, n: u8) -> Result<&[Option; 4], Error> { let _timer = Profiler::global().start_event("cursor::peek_n()", "Lexing"); - self.iter.peek_n_bytes(n) + let peeked = self.peeked.iter().filter(|c| c.is_some()).count(); + let needs_peek = n as usize - peeked; + + for i in 0..needs_peek { + let next = self.iter.next_char()?; + self.peeked[i + peeked] = next; + } + + Ok(&self.peeked) } /// Peeks the next UTF-8 character in u32 code point. pub(super) fn peek_char(&mut self) -> Result, Error> { let _timer = Profiler::global().start_event("cursor::peek_char()", "Lexing"); - self.iter.peek_char() + if let Some(c) = self.peeked[0] { + return Ok(Some(c)); + } + + let next = self.iter.next_char()?; + self.peeked[0] = next; + Ok(next) } - /// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented. - pub(super) fn next_is(&mut self, byte: u8) -> io::Result { - let _timer = Profiler::global().start_event("cursor::next_is()", "Lexing"); + pub(super) fn next_if(&mut self, c: u32) -> io::Result { + let _timer = Profiler::global().start_event("cursor::next_if()", "Lexing"); - Ok(match self.peek()? { - Some(next) if next == byte => { - self.next_byte()?; - true - } - _ => false, - }) + if self.peek_char()? == Some(c) { + self.next_char()?; + Ok(true) + } else { + Ok(false) + } } /// Applies the predicate to the next character and returns the result. @@ -120,10 +128,14 @@ where where F: Fn(char) -> bool, { - let _timer = Profiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing"); + let _timer = Profiler::global().start_event("cursor::next_is_pred()", "Lexing"); - Ok(match self.peek()? { - Some(byte) if (0..=0x7F).contains(&byte) => pred(char::from(byte)), + Ok(match self.peek_char()? { + Some(byte) if (0..=0x7F).contains(&byte) => + { + #[allow(clippy::cast_possible_truncation)] + pred(char::from(byte as u8)) + } Some(_) | None => false, }) } @@ -132,14 +144,14 @@ where /// Returns error when reaching the end of the buffer. /// /// Note that all bytes up until the stop byte are added to the buffer, including the byte right before. - pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec) -> io::Result<()> { + pub(super) fn take_until(&mut self, stop: u32, buf: &mut Vec) -> io::Result<()> { let _timer = Profiler::global().start_event("cursor::take_until()", "Lexing"); loop { - if self.next_is(stop)? { + if self.next_if(stop)? { return Ok(()); - } else if let Some(byte) = self.next_byte()? { - buf.push(byte); + } else if let Some(c) = self.next_char()? { + buf.push(c); } else { return Err(io::Error::new( ErrorKind::UnexpectedEof, @@ -162,8 +174,9 @@ where loop { if !self.next_is_ascii_pred(pred)? { return Ok(()); - } else if let Some(byte) = self.next_byte()? { - buf.push(byte); + } else if let Some(byte) = self.next_char()? { + #[allow(clippy::cast_possible_truncation)] + buf.push(byte as u8); } else { // next_is_pred will return false if the next value is None so the None case should already be handled. unreachable!(); @@ -171,61 +184,25 @@ where } } - /// It will fill the buffer with bytes. - /// - /// This expects for the buffer to be fully filled. If it's not, it will fail with an - /// `UnexpectedEof` I/O error. - pub(super) fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { - let _timer = Profiler::global().start_event("cursor::fill_bytes()", "Lexing"); - - self.iter.fill_bytes(buf) - } - - /// Retrieves the next byte. - pub(crate) fn next_byte(&mut self) -> Result, Error> { - let _timer = Profiler::global().start_event("cursor::next_byte()", "Lexing"); - - let byte = self.iter.next_byte()?; - - match byte { - Some(b'\r') => { - // Try to take a newline if it's next, for windows "\r\n" newlines - // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some(b'\n') { - let _next = self.iter.next_byte(); - } - self.next_line(); - } - Some(b'\n') => self.next_line(), - Some(0xE2) => { - // Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) - let next_bytes = self.peek_n(2)?; - if next_bytes == [0x80, 0xA8] || next_bytes == [0x80, 0xA9] { - self.next_line(); - } else { - // 0xE2 is a utf8 first byte - self.next_column(); - } - } - Some(b) if utf8_is_first_byte(b) => self.next_column(), - _ => {} - } - - Ok(byte) - } - /// Retrieves the next UTF-8 character. pub(crate) fn next_char(&mut self) -> Result, Error> { let _timer = Profiler::global().start_event("cursor::next_char()", "Lexing"); - let ch = self.iter.next_char()?; + let ch = if let Some(c) = self.peeked[0] { + self.peeked[0] = None; + self.peeked.rotate_left(1); + Some(c) + } else { + self.iter.next_char()? + }; match ch { Some(0xD) => { // Try to take a newline if it's next, for windows "\r\n" newlines // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some(0xA) { - let _next = self.iter.next_byte(); + if self.peek_char()? == Some(0xA) { + self.peeked[0] = None; + self.peeked.rotate_left(1); } self.next_line(); } @@ -243,21 +220,12 @@ where #[derive(Debug)] struct InnerIter { iter: Bytes, - num_peeked_bytes: u8, - peeked_bytes: [u8; 4], - #[allow(clippy::option_option)] - peeked_char: Option>, } impl InnerIter { /// Creates a new inner iterator. const fn new(iter: Bytes) -> Self { - Self { - iter, - num_peeked_bytes: 0, - peeked_bytes: [0; 4], - peeked_char: None, - } + Self { iter } } } @@ -265,134 +233,13 @@ impl InnerIter where R: Read, { - /// It will fill the buffer with checked ascii bytes. - /// - /// This expects for the buffer to be fully filled. If it's not, it will fail with an - /// `UnexpectedEof` I/O error. - fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { - for byte in &mut *buf { - *byte = self.next_byte()?.ok_or_else(|| { - io::Error::new( - io::ErrorKind::UnexpectedEof, - "unexpected EOF when filling buffer", - ) - })?; - } - Ok(()) - } - - /// Increments the iter by n bytes. - fn increment(&mut self, n: u32) -> Result<(), Error> { - for _ in 0..n { - if (self.next_byte()?).is_none() { - break; - } - } - Ok(()) - } - - /// Peeks the next byte. - pub(super) fn peek_byte(&mut self) -> Result, Error> { - if self.num_peeked_bytes > 0 { - let byte = self.peeked_bytes[0]; - Ok(Some(byte)) - } else { - match self.iter.next().transpose()? { - Some(byte) => { - self.num_peeked_bytes = 1; - self.peeked_bytes[0] = byte; - Ok(Some(byte)) - } - None => Ok(None), - } - } - } - - /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). - pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result<&[u8], Error> { - while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 { - match self.iter.next().transpose()? { - Some(byte) => { - self.peeked_bytes[usize::from(self.num_peeked_bytes)] = byte; - self.num_peeked_bytes += 1; - } - None => break, - }; - } - Ok(&self.peeked_bytes[..usize::from(u8::min(n, self.num_peeked_bytes))]) - } - - /// Peeks the next unchecked character in u32 code point. - pub(super) fn peek_char(&mut self) -> Result, Error> { - if let Some(ch) = self.peeked_char { - Ok(ch) - } else { - // Decode UTF-8 - let (x, y, z, w) = match self.peek_n_bytes(4)? { - [b, ..] if *b < 128 => { - let char = u32::from(*b); - self.peeked_char = Some(Some(char)); - return Ok(Some(char)); - } - [] => { - self.peeked_char = None; - return Ok(None); - } - bytes => ( - bytes[0], - bytes.get(1).copied(), - bytes.get(2).copied(), - bytes.get(3).copied(), - ), - }; - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte(x, 2); - let y = y.unwrap_or_default(); - let mut ch = utf8_acc_cont_byte(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid - let z = z.unwrap_or_default(); - let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = w.unwrap_or_default(); - ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); - } - }; - - self.peeked_char = Some(Some(ch)); - Ok(Some(ch)) - } - } - /// Retrieves the next byte fn next_byte(&mut self) -> io::Result> { - self.peeked_char = None; - if self.num_peeked_bytes > 0 { - let byte = self.peeked_bytes[0]; - self.num_peeked_bytes -= 1; - self.peeked_bytes.rotate_left(1); - Ok(Some(byte)) - } else { - self.iter.next().transpose() - } + self.iter.next().transpose() } /// Retrieves the next unchecked char in u32 code point. fn next_char(&mut self) -> io::Result> { - if let Some(ch) = self.peeked_char.take() { - if let Some(c) = ch { - self.increment(utf8_len(c))?; - } - return Ok(ch); - } - // Decode UTF-8 let x = match self.next_byte()? { Some(b) if b < 128 => return Ok(Some(u32::from(b))), @@ -439,24 +286,6 @@ fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | u32::from(byte & CONT_MASK) } -/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the -/// bits `11`). -const fn utf8_is_first_byte(byte: u8) -> bool { - byte <= 0x7F || (byte >> 6) == 0x11 -} - fn unwrap_or_0(opt: Option) -> u8 { opt.unwrap_or(0) } - -const fn utf8_len(ch: u32) -> u32 { - if ch <= 0x7F { - 1 - } else if ch <= 0x7FF { - 2 - } else if ch <= 0xFFFF { - 3 - } else { - 4 - } -} diff --git a/core/parser/src/lexer/identifier.rs b/core/parser/src/lexer/identifier.rs index a1ff167e904..86b6e950092 100644 --- a/core/parser/src/lexer/identifier.rs +++ b/core/parser/src/lexer/identifier.rs @@ -100,7 +100,7 @@ impl Identifier { let _timer = Profiler::global().start_event("Identifier::take_identifier_name", "Lexing"); let mut contains_escaped_chars = false; - let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? { + let mut identifier_name = if init == '\\' && cursor.next_if(0x75 /* u */)? { let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?; if Self::is_identifier_start(ch) { @@ -119,10 +119,10 @@ impl Identifier { loop { let ch = match cursor.peek_char()? { - Some(0x005C /* \ */) if cursor.peek_n(2)?.get(1) == Some(&0x75) /* u */ => { + Some(0x005C /* \ */) if cursor.peek_n(2)?[1] == Some(0x75) /* u */ => { let pos = cursor.pos(); - let _next = cursor.next_byte(); - let _next = cursor.next_byte(); + let _next = cursor.next_char(); + let _next = cursor.next_char(); let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?; if Self::is_identifier_part(ch) { diff --git a/core/parser/src/lexer/mod.rs b/core/parser/src/lexer/mod.rs index 192c24d88bc..1d1e55bca1b 100644 --- a/core/parser/src/lexer/mod.rs +++ b/core/parser/src/lexer/mod.rs @@ -129,14 +129,16 @@ impl Lexer { { let _timer = Profiler::global().start_event("lex_slash_token", "Lexing"); - if let Some(c) = self.cursor.peek()? { + if let Some(c) = self.cursor.peek_char()? { match c { - b'/' => { - self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/' + // / + 0x002F => { + self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/' SingleLineComment.lex(&mut self.cursor, start, interner) } - b'*' => { - self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*' + // * + 0x002A => { + self.cursor.next_char()?.expect("* token vanished"); // Consume the '*' MultiLineComment.lex(&mut self.cursor, start, interner) } ch => { @@ -144,9 +146,10 @@ impl Lexer { InputElement::Div | InputElement::TemplateTail => { // Only div punctuator allowed, regex not. - if ch == b'=' { + // = + if ch == 0x003D { // Indicates this is an AssignDiv. - self.cursor.next_byte()?.expect("= token vanished"); // Consume the '=' + self.cursor.next_char()?.expect("= token vanished"); // Consume the '=' Ok(Token::new( Punctuator::AssignDiv.into(), Span::new(start, self.cursor.pos()), @@ -186,10 +189,11 @@ impl Lexer { let _next = self.cursor.next_char(); } - if self.cursor.peek_n(3)? == [b'-', b'-', b'>'] { - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); + // --> + if self.cursor.peek_n(3)?[..3] == [Some(0x2D), Some(0x2D), Some(0x3E)] { + let _next = self.cursor.next_char(); + let _next = self.cursor.next_char(); + let _next = self.cursor.next_char(); let start = self.cursor.pos(); SingleLineComment.lex(&mut self.cursor, start, interner)?; @@ -224,13 +228,13 @@ impl Lexer { //handle hashbang here so the below match block still throws error on //# if position isn't (1, 1) - if start.column_number() == 1 && start.line_number() == 1 && next_ch == 0x23 { - if let Some(hashbang_peek) = self.cursor.peek()? { - if hashbang_peek == 0x21 { - let _token = HashbangComment.lex(&mut self.cursor, start, interner); - return self.next(interner); - } - } + if start.column_number() == 1 + && start.line_number() == 1 + && next_ch == 0x23 + && self.cursor.peek_char()? == Some(0x21) + { + let _token = HashbangComment.lex(&mut self.cursor, start, interner); + return self.next(interner); }; if let Ok(c) = char::try_from(next_ch) { @@ -250,7 +254,12 @@ impl Lexer { Span::new(start, self.cursor.pos()), )), '.' => { - if self.cursor.peek()?.as_ref().map(u8::is_ascii_digit) == Some(true) { + if self + .cursor + .peek_char()? + .filter(|c| (48..=57).contains(c)) + .is_some() + { NumberLiteral::new(b'.').lex(&mut self.cursor, start, interner) } else { SpreadLiteral::new().lex(&mut self.cursor, start, interner) @@ -287,10 +296,13 @@ impl Lexer { '#' => PrivateIdentifier::new().lex(&mut self.cursor, start, interner), '/' => self.lex_slash_token(start, interner), #[cfg(feature = "annex-b")] - '<' if !self.module() && self.cursor.peek_n(3)? == [b'!', b'-', b'-'] => { - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); + // `) if the `annex-b` feature is enabled. pub(crate) fn skip_html_close(&mut self, interner: &mut Interner) -> Result<(), Error> where - R: Read, + R: ReadChar, { if cfg!(not(feature = "annex-b")) || self.module() { return Ok(()); @@ -210,7 +210,7 @@ impl Lexer { // We intentionally don't implement Iterator trait as Result