From 40b27728653df13a4e7dd6d344965d0b65a72a65 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 9 Feb 2024 16:03:27 -0500 Subject: [PATCH] Remove unnecessary string cloning from the parser (#9884) Closes https://github.com/astral-sh/ruff/issues/9869. --- Cargo.lock | 13 +- Cargo.toml | 1 + .../rules/hardcoded_bind_all_interfaces.rs | 4 +- crates/ruff_linter/src/rules/flynt/helpers.rs | 4 +- crates/ruff_python_ast/src/comparable.rs | 2 +- crates/ruff_python_ast/src/nodes.rs | 8 +- crates/ruff_python_parser/Cargo.toml | 5 +- crates/ruff_python_parser/src/lib.rs | 4 +- crates/ruff_python_parser/src/python.lalrpop | 4 +- crates/ruff_python_parser/src/python.rs | 6 +- crates/ruff_python_parser/src/string.rs | 271 ++++++++++++------ 11 files changed, 215 insertions(+), 107 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5703ae61ca9f5..97511968ff8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "bstr" -version = "1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.3.9", + "regex-automata 0.4.3", "serde", ] @@ -1921,12 +1921,6 @@ dependencies = [ "regex-syntax 0.6.29", ] -[[package]] -name = "regex-automata" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" - [[package]] name = "regex-automata" version = "0.4.3" @@ -2342,6 +2336,7 @@ version = "0.0.0" dependencies = [ "anyhow", "bitflags 2.4.1", + "bstr", "insta", "is-macro", "itertools 0.12.1", diff --git a/Cargo.toml b/Cargo.toml index a783bbebef3e2..c4f4492c18e80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ argfile = { version = "0.1.6" } assert_cmd = { version = "2.0.13" } bincode = { version = "1.3.3" } bitflags = { version = "2.4.1" } +bstr = { version = "1.9.0" } cachedir = { version = "0.3.1" } chrono = { version = "0.4.33", default-features = false, features = ["clock"] } clap = { version = "4.4.18", features = ["derive"] } diff --git a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs index 38295b71316a2..0e4301ee44c07 100644 --- a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs +++ b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs @@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces { pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) { let is_bind_all_interface = match string { StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0", - StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0", + StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => { + &**value == "0.0.0.0" + } StringLike::BytesLiteral(_) => return, }; diff --git a/crates/ruff_linter/src/rules/flynt/helpers.rs b/crates/ruff_linter/src/rules/flynt/helpers.rs index 7a6af204d13f9..640f922d6faa2 100644 --- a/crates/ruff_linter/src/rules/flynt/helpers.rs +++ b/crates/ruff_linter/src/rules/flynt/helpers.rs @@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement { /// Convert a string to a [`ast::FStringElement::Literal`]. pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement { ast::FStringElement::Literal(ast::FStringLiteralElement { - value: s.to_owned(), + value: s.to_string().into_boxed_str(), range: TextRange::default(), }) } @@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option { match expr { Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => { Some(ast::FStringElement::Literal(ast::FStringLiteralElement { - value: value.to_string(), + value: value.to_string().into_boxed_str(), range: *range, })) } diff --git a/crates/ruff_python_ast/src/comparable.rs b/crates/ruff_python_ast/src/comparable.rs index bc6327f01dca0..344bb615ce95e 100644 --- a/crates/ruff_python_ast/src/comparable.rs +++ b/crates/ruff_python_ast/src/comparable.rs @@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> { impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> { fn from(bytes_literal: &'a ast::BytesLiteral) -> Self { Self { - value: bytes_literal.value.as_slice(), + value: &bytes_literal.value, } } } diff --git a/crates/ruff_python_ast/src/nodes.rs b/crates/ruff_python_ast/src/nodes.rs index cfb8355c69f05..b6581eef40524 100644 --- a/crates/ruff_python_ast/src/nodes.rs +++ b/crates/ruff_python_ast/src/nodes.rs @@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement { #[derive(Clone, Debug, PartialEq)] pub struct FStringLiteralElement { pub range: TextRange, - pub value: String, + pub value: Box, } impl Ranged for FStringLiteralElement { @@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement { type Target = str; fn deref(&self) -> &Self::Target { - self.value.as_str() + &self.value } } @@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner { #[derive(Clone, Debug, Default, PartialEq)] pub struct BytesLiteral { pub range: TextRange, - pub value: Vec, + pub value: Box<[u8]>, } impl Ranged for BytesLiteral { @@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral { type Target = [u8]; fn deref(&self) -> &Self::Target { - self.value.as_slice() + &self.value } } diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 6bcdf6c902172..886bb07fec0b6 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } bitflags = { workspace = true } +bstr = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { workspace = true, default-features = false } memchr = { workspace = true } -unicode-ident = { workspace = true } -unicode_names2 = { workspace = true } rustc-hash = { workspace = true } static_assertions = { workspace = true } +unicode-ident = { workspace = true } +unicode_names2 = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 2f95c684e87d9..7c9c5402fb442 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind}; use crate::lexer::LexResult; -mod function; -// Skip flattening lexer to distinguish from full ruff_python_parser mod context; +mod function; mod invalid; +// Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; mod parser; mod soft_keywords; diff --git a/crates/ruff_python_parser/src/python.lalrpop b/crates/ruff_python_parser/src/python.lalrpop index 2d628ae74a805..f61ae2c2b4eff 100644 --- a/crates/ruff_python_parser/src/python.lalrpop +++ b/crates/ruff_python_parser/src/python.lalrpop @@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = { StringLiteral: StringType = { =>? { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) } }; @@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = { FStringReplacementField, =>? { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } }; diff --git a/crates/ruff_python_parser/src/python.rs b/crates/ruff_python_parser/src/python.rs index 1372b6e4fb260..95de336aa7614 100644 --- a/crates/ruff_python_parser/src/python.rs +++ b/crates/ruff_python_parser/src/python.rs @@ -1,5 +1,5 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c +// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use ruff_python_ast::{self as ast, Int, IpyEscapeKind}; use crate::{ @@ -36369,7 +36369,7 @@ fn __action217< { { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) } } @@ -36419,7 +36419,7 @@ fn __action220< { { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 5b15474cf2dd6..fb536537216a0 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,7 +1,9 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. +use bstr::ByteSlice; + use ruff_python_ast::{self as ast, Expr}; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; +use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; @@ -32,34 +34,40 @@ impl From for Expr { } } -struct StringParser<'a> { - rest: &'a str, +enum EscapedChar { + Literal(char), + Escape(char), +} + +struct StringParser { + source: Box, + cursor: usize, kind: StringKind, - location: TextSize, + offset: TextSize, range: TextRange, } -impl<'a> StringParser<'a> { - fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { +impl StringParser { + fn new(source: Box, kind: StringKind, offset: TextSize, range: TextRange) -> Self { Self { - rest: source, + source, + cursor: 0, kind, - location: start, + offset, range, } } #[inline] - fn skip_bytes(&mut self, bytes: usize) -> &'a str { - let skipped_str = &self.rest[..bytes]; - self.rest = &self.rest[bytes..]; - self.location += skipped_str.text_len(); + fn skip_bytes(&mut self, bytes: usize) -> &str { + let skipped_str = &self.source[self.cursor..self.cursor + bytes]; + self.cursor += bytes; skipped_str } #[inline] fn get_pos(&self) -> TextSize { - self.location + self.offset + TextSize::try_from(self.cursor).unwrap() } /// Returns the next byte in the string, if there is one. @@ -69,25 +77,23 @@ impl<'a> StringParser<'a> { /// When the next byte is a part of a multi-byte character. #[inline] fn next_byte(&mut self) -> Option { - self.rest.as_bytes().first().map(|&byte| { - self.rest = &self.rest[1..]; - self.location += TextSize::new(1); + self.source[self.cursor..].as_bytes().first().map(|&byte| { + self.cursor += 1; byte }) } #[inline] fn next_char(&mut self) -> Option { - self.rest.chars().next().map(|c| { - self.rest = &self.rest[c.len_utf8()..]; - self.location += c.text_len(); + self.source[self.cursor..].chars().next().map(|c| { + self.cursor += c.len_utf8(); c }) } #[inline] fn peek_byte(&self) -> Option { - self.rest.as_bytes().first().copied() + self.source[self.cursor..].as_bytes().first().copied() } fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { @@ -135,7 +141,7 @@ impl<'a> StringParser<'a> { }; let start_pos = self.get_pos(); - let Some(close_idx) = self.rest.find('}') else { + let Some(close_idx) = self.source[self.cursor..].find('}') else { return Err(LexicalError::new( LexicalErrorType::StringError, self.get_pos(), @@ -149,7 +155,8 @@ impl<'a> StringParser<'a> { .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) } - fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + /// Parse an escaped character, returning the new character. + fn parse_escaped_char(&mut self) -> Result, LexicalError> { let Some(first_char) = self.next_char() else { return Err(LexicalError::new( LexicalErrorType::StringError, @@ -174,13 +181,13 @@ impl<'a> StringParser<'a> { 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, // Special cases where the escape sequence is not a single character - '\n' => return Ok(()), + '\n' => return Ok(None), '\r' => { if self.peek_byte() == Some(b'\n') { self.next_byte(); } - return Ok(()); + return Ok(None); } _ => { if self.kind.is_any_bytes() && !first_char.is_ascii() { @@ -194,21 +201,42 @@ impl<'a> StringParser<'a> { )); } - string.push('\\'); - - first_char + return Ok(Some(EscapedChar::Escape(first_char))); } }; - string.push(new_char); - - Ok(()) + Ok(Some(EscapedChar::Literal(new_char))) } - fn parse_fstring_middle(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { + fn parse_fstring_middle(mut self) -> Result { + // Fast-path: if the f-string doesn't contain any escape sequences, return the literal. + let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else { + return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { + value: self.source, + range: self.range, + })); + }; + + let mut value = String::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence (or curly brace) to the string. + let before_with_slash_or_brace = self.skip_bytes(index + 1); + let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match &self.source.as_bytes()[self.cursor - 1] { + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need increase the location by 2 instead of 1. + b'{' => { + self.offset += TextSize::from(1); + value.push('{'); + } + b'}' => { + self.offset += TextSize::from(1); + value.push('}'); + } // We can encounter a `\` as the last character in a `FStringMiddle` // token which is valid in this context. For example, // @@ -229,71 +257,152 @@ impl<'a> StringParser<'a> { // This is still an invalid escape sequence, but we don't want to // raise a syntax error as is done by the CPython parser. It might // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { - self.parse_escaped_char(&mut value)?; + b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } + } } - // If there are any curly braces inside a `FStringMiddle` token, - // then they were escaped (i.e. `{{` or `}}`). This means that - // we need increase the location by 2 instead of 1. - ch @ ('{' | '}') => { - self.location += ch.text_len(); - value.push(ch); + ch => { + value.push(char::from(*ch)); } - ch => value.push(ch), } + + let Some(next_index) = + memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + index = next_index; } + Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { - value, + value: value.into_boxed_str(), range: self.range, })) } - fn parse_bytes(&mut self) -> Result { - let mut content = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - self.parse_escaped_char(&mut content)?; - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - content.push(ch); + fn parse_bytes(mut self) -> Result { + if let Some(index) = self.source.as_bytes().find_non_ascii_byte() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.offset + TextSize::try_from(index).unwrap(), + )); + } + + if self.kind.is_raw() { + // For raw strings, no escaping is necessary. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_boxed_bytes(), + range: self.range, + })); + } + + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_boxed_bytes(), + range: self.range, + })); + }; + + // If the string contains escape sequences, we need to parse them. + let mut value = Vec::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.extend_from_slice(before.as_bytes()); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c as u8), + Some(EscapedChar::Escape(c)) => { + value.push(b'\\'); + value.push(c as u8); } } + + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.extend_from_slice(rest.as_bytes()); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Bytes(ast::BytesLiteral { - value: content.chars().map(|c| c as u8).collect::>(), + value: value.into_boxed_slice(), range: self.range, })) } - fn parse_string(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); + fn parse_string(mut self) -> Result { if self.kind.is_raw() { - value.push_str(self.skip_bytes(self.rest.len())); - } else { - loop { - let Some(escape_idx) = self.rest.find('\\') else { - value.push_str(self.skip_bytes(self.rest.len())); - break; - }; + // For raw strings, no escaping is necessary. + return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + } - let before_with_slash = self.skip_bytes(escape_idx + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. + return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + }; - value.push_str(before); - self.parse_escaped_char(&mut value)?; + // If the string contains escape sequences, we need to parse them. + let mut value = String::with_capacity(self.source.len()); + + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } } + + let Some(next_escape) = self.source[self.cursor..].find('\\') else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Str(ast::StringLiteral { value: value.into_boxed_str(), unicode: self.kind.is_unicode(), @@ -301,7 +410,7 @@ impl<'a> StringParser<'a> { })) } - fn parse(&mut self) -> Result { + fn parse(self) -> Result { if self.kind.is_any_bytes() { self.parse_bytes() } else { @@ -311,7 +420,7 @@ impl<'a> StringParser<'a> { } pub(crate) fn parse_string_literal( - source: &str, + source: Box, kind: StringKind, triple_quoted: bool, range: TextRange, @@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal( } pub(crate) fn parse_fstring_literal_element( - source: &str, + source: Box, is_raw: bool, range: TextRange, ) -> Result { @@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings( if has_bytes && byte_literal_count < strings.len() { return Err(LexicalError::new( LexicalErrorType::OtherError( - "cannot mix bytes and nonbytes literals" + "cannot mix bytes and non-bytes literals" .to_string() .into_boxed_str(), ),