From 30698e4f893c386b9d1c8315fbcca0ab5b2241da Mon Sep 17 00:00:00 2001 From: Victorien Elvinger Date: Tue, 3 Sep 2024 23:57:19 +0200 Subject: [PATCH] perf(css_parser): avoid expensive check when parsing an id (#3774) --- crates/biome_css_parser/src/lexer/mod.rs | 23 +++++++++++++---------- crates/biome_js_parser/src/lexer/mod.rs | 6 +++--- crates/biome_json_parser/src/lexer/mod.rs | 4 ++-- crates/biome_unicode_table/src/bytes.rs | 7 +++++-- crates/biome_unicode_table/src/lib.rs | 15 +++------------ 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/crates/biome_css_parser/src/lexer/mod.rs b/crates/biome_css_parser/src/lexer/mod.rs index 67f052040323..1ab710860676 100644 --- a/crates/biome_css_parser/src/lexer/mod.rs +++ b/crates/biome_css_parser/src/lexer/mod.rs @@ -10,7 +10,8 @@ use biome_parser::lexer::{ }; use biome_rowan::SyntaxKind; use biome_unicode_table::{ - is_css_id_continue, is_css_id_start, lookup_byte, Dispatch, Dispatch::*, + is_css_non_ascii, lookup_byte, + Dispatch::{self, *}, }; use std::char::REPLACEMENT_CHARACTER; @@ -319,7 +320,7 @@ impl<'src> CssLexer<'src> { LSS => self.consume_lss(), - IDT if self.peek_byte() == Some(b'=') => { + IDT | DOL if self.peek_byte() == Some(b'=') => { self.advance(1); self.consume_byte(T!["$="]) } @@ -461,7 +462,7 @@ impl<'src> CssLexer<'src> { return match dispatch { // TLD byte covers `url(~package/tilde.css)`; // HAS byte covers `url(#IDofSVGpath);` - IDT | UNI | PRD | SLH | ZER | DIG | TLD | HAS => self.consume_url_raw_value(), + IDT | DOL | UNI | PRD | SLH | ZER | DIG | TLD | HAS => self.consume_url_raw_value(), _ => self.consume_token(current), }; } @@ -990,16 +991,16 @@ impl<'src> CssLexer<'src> { /// and `None` if it is not. fn consume_ident_part(&mut self, current: u8) -> Option<char> { let chr = match lookup_byte(current) { - MIN | DIG | ZER => { + IDT | MIN | DIG | ZER => { self.advance(1); // SAFETY: We know that the current byte is a hyphen or a number. 
current as char } // name code point - UNI | IDT => { + UNI => { // SAFETY: We know that the current byte is a valid unicode code point let chr = self.current_char_unchecked(); - if is_css_id_continue(chr) { + if is_css_non_ascii(chr) { self.advance(chr.len_utf8()); chr } else { @@ -1273,26 +1274,28 @@ impl<'src> CssLexer<'src> { return false; }; match lookup_byte(next) { - MIN | DIG | ZER => true, + IDT | MIN | DIG | ZER => true, // If the third code point is a name-start code point // return true. - UNI | IDT if is_css_id_continue(self.char_unchecked_at(2)) => true, + UNI => is_css_non_ascii(self.char_unchecked_at(2)), // or the third and fourth code points are a valid escape // return true. BSL => self.is_valid_escape_at(3), _ => false, } } + IDT => true, // If the second code point is a name-start code point // return true. - UNI | IDT if is_css_id_start(self.peek_char_unchecked()) => true, + UNI => is_css_non_ascii(self.peek_char_unchecked()), // or the second and third code points are a valid escape // return true. BSL => self.is_valid_escape_at(2), _ => false, } } - UNI | IDT if is_css_id_start(self.current_char_unchecked()) => true, + IDT => true, + UNI => is_css_non_ascii(self.current_char_unchecked()), // U+005C REVERSE SOLIDUS (\) // If the first and second code points are a valid escape, return true. Otherwise, // return false. 
diff --git a/crates/biome_js_parser/src/lexer/mod.rs b/crates/biome_js_parser/src/lexer/mod.rs index 84e2cf2e97e7..251a4758646f 100644 --- a/crates/biome_js_parser/src/lexer/mod.rs +++ b/crates/biome_js_parser/src/lexer/mod.rs @@ -852,7 +852,7 @@ impl<'src> JsLexer<'src> { let b = unsafe { self.current_unchecked() }; match lookup_byte(b) { - IDT | DIG | ZER => Some((b as char, false)), + IDT | DOL | DIG | ZER => Some((b as char, false)), // FIXME: This should use ID_Continue, not XID_Continue UNI => { let chr = self.current_char_unchecked(); @@ -920,7 +920,7 @@ impl<'src> JsLexer<'src> { false } } - IDT => true, + IDT | DOL => true, _ => false, } } @@ -1880,7 +1880,7 @@ impl<'src> JsLexer<'src> { ERROR_TOKEN } } - IDT => self.resolve_identifier(byte as char), + IDT | DOL => self.resolve_identifier(byte as char), DIG => { self.read_number(false); self.verify_number_end() diff --git a/crates/biome_json_parser/src/lexer/mod.rs b/crates/biome_json_parser/src/lexer/mod.rs index ea5656ab1e01..930f82e3fbf8 100644 --- a/crates/biome_json_parser/src/lexer/mod.rs +++ b/crates/biome_json_parser/src/lexer/mod.rs @@ -307,7 +307,7 @@ impl<'src> Lexer<'src> { match dispatched { WHS => self.consume_newline_or_whitespaces(), QOT => self.lex_string_literal(current), - IDT => self.lex_identifier(current), + IDT | DOL => self.lex_identifier(current), COM => self.eat_byte(T![,]), MIN | DIG | ZER => self.lex_number(current), COL => self.eat_byte(T![:]), @@ -689,7 +689,7 @@ impl<'src> Lexer<'src> { while let Some(byte) = self.current_byte() { self.current_char_unchecked(); match lookup_byte(byte) { - IDT | DIG | ZER => { + IDT | DOL | DIG | ZER => { keyword = keyword.next_character(byte); self.advance(1) } diff --git a/crates/biome_unicode_table/src/bytes.rs b/crates/biome_unicode_table/src/bytes.rs index 1f5fa58bb2ac..37b6b830cf7b 100644 --- a/crates/biome_unicode_table/src/bytes.rs +++ b/crates/biome_unicode_table/src/bytes.rs @@ -16,9 +16,12 @@ pub enum Dispatch { /// Single `'` or 
Double quote `"` QOT, - /// ASCII identifier, or `$`, `_` + /// ASCII letter or `_` IDT, + /// Dollar sign `$` + DOL, + /// Hash `#` HAS, @@ -115,7 +118,7 @@ pub(crate) static DISPATCHER: [Dispatch; 256] = [ //0 1 2 3 4 5 6 7 8 9 A B C D E F // ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, WHS, WHS, WHS, WHS, WHS, ERR, ERR, // 0 ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 - WHS, EXL, QOT, HAS, IDT, PRC, AMP, QOT, PNO, PNC, MUL, PLS, COM, MIN, PRD, SLH, // 2 + WHS, EXL, QOT, HAS, DOL, PRC, AMP, QOT, PNO, PNC, MUL, PLS, COM, MIN, PRD, SLH, // 2 ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3 AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4 IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, BSL, BTC, CRT, IDT, // 5 diff --git a/crates/biome_unicode_table/src/lib.rs b/crates/biome_unicode_table/src/lib.rs index b70cbb9422f9..f7639eb16adf 100644 --- a/crates/biome_unicode_table/src/lib.rs +++ b/crates/biome_unicode_table/src/lib.rs @@ -12,15 +12,12 @@ pub fn is_html_id_start(c: char) -> bool { ID_Start(c) } -/// Tests if `c` is a valid start of a CSS identifier +/// Is `c` a CSS non-ascii character. #[inline] -pub fn is_css_id_start(c: char) -> bool { +pub fn is_css_non_ascii(c: char) -> bool { matches!( c as u32, - 0x41..=0x5a // A-Z - | 0x5f // `_` - | 0x61..=0x7a // a-z - | 0xB7 + 0xB7 | 0xc0..=0xd6 | 0xd8..=0xf6 | 0xf8..=0x37D @@ -38,12 +35,6 @@ pub fn is_css_id_start(c: char) -> bool { ) } -/// Tests if `c` is a valid continuation of a CSS identifier. -#[inline] -pub fn is_css_id_continue(c: char) -> bool { - matches!(c, '0'..='9' | '-') || is_css_id_start(c) -} - /// Tests if `c` is a valid start of a js identifier #[inline] pub fn is_js_id_start(c: char) -> bool {