From 0871805c7d2d5ecdc7e2af4b82ee33809bd371d5 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Wed, 7 Feb 2024 17:04:34 -0500 Subject: [PATCH 1/5] Remove unnecessary string cloning from the parser --- crates/ruff_python_parser/src/ascii.rs | 345 +++++++++++++++++++ crates/ruff_python_parser/src/lib.rs | 1 + crates/ruff_python_parser/src/python.lalrpop | 4 +- crates/ruff_python_parser/src/python.rs | 6 +- crates/ruff_python_parser/src/string.rs | 270 ++++++++++----- 5 files changed, 541 insertions(+), 85 deletions(-) create mode 100644 crates/ruff_python_parser/src/ascii.rs diff --git a/crates/ruff_python_parser/src/ascii.rs b/crates/ruff_python_parser/src/ascii.rs new file mode 100644 index 0000000000000..87614dc98a0e8 --- /dev/null +++ b/crates/ruff_python_parser/src/ascii.rs @@ -0,0 +1,345 @@ +#![allow( + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_ptr_alignment, + clippy::inline_always, + clippy::ptr_as_ptr, + unsafe_code +)] + +//! Source: + +// The following ~400 lines of code exists for exactly one purpose, which is +// to optimize this code: +// +// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len()) +// +// Yes... Overengineered is a word that comes to mind, but this is effectively +// a very similar problem to memchr, and virtually nobody has been able to +// resist optimizing the crap out of that (except for perhaps the BSD and MUSL +// folks). In particular, this routine makes a very common case (ASCII) very +// fast, which seems worth it. We do stop short of adding AVX variants of the +// code below in order to retain our sanity and also to avoid needing to deal +// with runtime target feature detection. RESIST! +// +// In order to understand the SIMD version below, it would be good to read this +// comment describing how my memchr routine works: +// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106 +// +// The primary difference with memchr is that for ASCII, we can do a bit less +// work. In particular, we don't need to detect the presence of a specific +// byte, but rather, whether any byte has its most significant bit set. That +// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to +// _mm_movemask_epi8. + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const USIZE_BYTES: usize = core::mem::size_of::(); +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; + +// This is a mask where the most significant bit of each byte in the usize +// is set. We test this bit to determine whether a character is ASCII or not. +// Namely, a single byte is regarded as an ASCII codepoint if and only if it's +// most significant bit is not set. +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const ASCII_MASK_U64: u64 = 0x8080_8080_8080_8080; +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const ASCII_MASK: usize = ASCII_MASK_U64 as usize; + +/// Returns the index of the first non ASCII byte in the given slice. +/// +/// If slice only contains ASCII bytes, then the length of the slice is +/// returned. 
+pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize { + #[cfg(any(miri, not(target_arch = "x86_64")))] + { + first_non_ascii_byte_fallback(slice) + } + + #[cfg(all(not(miri), target_arch = "x86_64"))] + { + first_non_ascii_byte_sse2(slice) + } +} + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { + let align = USIZE_BYTES - 1; + let start_ptr = slice.as_ptr(); + let end_ptr = slice[slice.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if slice.len() < USIZE_BYTES { + return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); + } + + let chunk = read_unaligned_usize(ptr); + let mask = chunk & ASCII_MASK; + if mask != 0 { + return first_non_ascii_byte_mask(mask); + } + + ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); + if slice.len() >= FALLBACK_LOOP_SIZE { + while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *ptr.cast::(); + let b = *ptr_add(ptr, USIZE_BYTES).cast::(); + if (a | b) & ASCII_MASK != 0 { + // What a kludge. We wrap the position finding code into + // a non-inlineable function, which makes the codegen in + // the tight loop above a bit better by avoiding a + // couple extra movs. We pay for it by two additional + // stores, but only in the case of finding a non-ASCII + // byte. + #[inline(never)] + unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize { + let a = *ptr.cast::(); + let b = *ptr_add(ptr, USIZE_BYTES).cast::(); + + let mut at = sub(ptr, start_ptr); + let maska = a & ASCII_MASK; + if maska != 0 { + return at + first_non_ascii_byte_mask(maska); + } + + at += USIZE_BYTES; + let maskb = b & ASCII_MASK; + debug_assert!(maskb != 0); + at + first_non_ascii_byte_mask(maskb) + } + return findpos(start_ptr, ptr); + } + ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); + } + } + first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) + } +} + +#[cfg(all(not(miri), target_arch = "x86_64"))] +fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { + use core::arch::x86_64::{ + __m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, + }; + + const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); + const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; + + let start_ptr = slice.as_ptr(); + let end_ptr = slice[slice.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if slice.len() < VECTOR_SIZE { + return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); + } + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(chunk); + if mask != 0 { + return mask.trailing_zeros() as usize; + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); + if slice.len() >= VECTOR_LOOP_SIZE { + while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + + let or1 = _mm_or_si128(a, b); + let or2 = _mm_or_si128(c, d); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask = 
_mm_movemask_epi8(a); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(b); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(c); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(d); + debug_assert!(mask != 0); + return at + mask.trailing_zeros() as usize; + } + ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); + } + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(chunk); + if mask != 0 { + return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; + } + ptr = ptr.add(VECTOR_SIZE); + } + first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) + } +} + +#[inline(always)] +unsafe fn first_non_ascii_byte_slow( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, +) -> usize { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr < end_ptr { + if *ptr > 0x7F { + return sub(ptr, start_ptr); + } + ptr = ptr.offset(1); + } + sub(end_ptr, start_ptr) +} + +/// Compute the position of the first ASCII byte in the given mask. +/// +/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is +/// 8 contiguous bytes of the slice being checked where *at least* one of those +/// bytes is not an ASCII byte. +/// +/// The position returned is always in the inclusive range [0, 7]. +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +fn first_non_ascii_byte_mask(mask: usize) -> usize { + #[cfg(target_endian = "little")] + { + mask.trailing_zeros() as usize / 8 + } + #[cfg(target_endian = "big")] + { + mask.leading_zeros() as usize / 8 + } +} + +/// Increment the given pointer by the given amount. +unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { + debug_assert!(amt < ::core::isize::MAX as usize); + ptr.add(amt) +} + +/// Decrement the given pointer by the given amount. +unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { + debug_assert!(amt < ::core::isize::MAX as usize); + ptr.offset((amt as isize).wrapping_neg()) +} + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { + use core::ptr; + + let mut n: usize = 0; + ptr::copy_nonoverlapping(ptr, std::ptr::addr_of_mut!(n) as *mut u8, USIZE_BYTES); + n +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} + +#[cfg(test)] +mod tests { + use super::*; + + // Our testing approach here is to try and exhaustively test every case. + // This includes the position at which a non-ASCII byte occurs in addition + // to the alignment of the slice that we're searching. 
+ + #[test] + fn positive_fallback_forward() { + for i in 0..517 { + let s = "a".repeat(i); + assert_eq!( + i, + first_non_ascii_byte_fallback(s.as_bytes()), + "i: {:?}, len: {:?}, s: {:?}", + i, + s.len(), + s + ); + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] + fn positive_sse2_forward() { + for i in 0..517 { + let b = "a".repeat(i).into_bytes(); + assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); + } + } + + #[test] + #[cfg(not(miri))] + fn negative_fallback_forward() { + for i in 0..517 { + for align in 0..65 { + let mut s = "a".repeat(i); + s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); + let s = s.get(align..).unwrap_or(""); + assert_eq!( + i.saturating_sub(align), + first_non_ascii_byte_fallback(s.as_bytes()), + "i: {:?}, align: {:?}, len: {:?}, s: {:?}", + i, + align, + s.len(), + s + ); + } + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] + fn negative_sse2_forward() { + for i in 0..517 { + for align in 0..65 { + let mut s = "a".repeat(i); + s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); + let s = s.get(align..).unwrap_or(""); + assert_eq!( + i.saturating_sub(align), + first_non_ascii_byte_sse2(s.as_bytes()), + "i: {:?}, align: {:?}, len: {:?}, s: {:?}", + i, + align, + s.len(), + s + ); + } + } + } +} diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 2f95c684e87d9..074746ec01ab8 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -121,6 +121,7 @@ use crate::lexer::LexResult; mod function; // Skip flattening lexer to distinguish from full ruff_python_parser +mod ascii; mod context; mod invalid; pub mod lexer; diff --git a/crates/ruff_python_parser/src/python.lalrpop b/crates/ruff_python_parser/src/python.lalrpop index 2d628ae74a805..f61ae2c2b4eff 100644 --- a/crates/ruff_python_parser/src/python.lalrpop +++ b/crates/ruff_python_parser/src/python.lalrpop @@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = { StringLiteral: StringType = { =>? { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) } }; @@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = { FStringReplacementField, =>? { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } }; diff --git a/crates/ruff_python_parser/src/python.rs b/crates/ruff_python_parser/src/python.rs index 1372b6e4fb260..95de336aa7614 100644 --- a/crates/ruff_python_parser/src/python.rs +++ b/crates/ruff_python_parser/src/python.rs @@ -1,5 +1,5 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c +// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use ruff_python_ast::{self as ast, Int, IpyEscapeKind}; use crate::{ @@ -36369,7 +36369,7 @@ fn __action217< { { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) 
} } @@ -36419,7 +36419,7 @@ fn __action220< { { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 5b15474cf2dd6..124ba688a3a7a 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,8 +1,9 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use ruff_python_ast::{self as ast, Expr}; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; +use ruff_text_size::{Ranged, TextRange, TextSize}; +use crate::ascii::first_non_ascii_byte; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; @@ -32,34 +33,40 @@ impl From for Expr { } } -struct StringParser<'a> { - rest: &'a str, +enum EscapedChar { + Literal(char), + Escape(char), +} + +struct StringParser { + source: Box, + cursor: usize, kind: StringKind, - location: TextSize, + offset: TextSize, range: TextRange, } -impl<'a> StringParser<'a> { - fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { +impl StringParser { + fn new(source: Box, kind: StringKind, offset: TextSize, range: TextRange) -> Self { Self { - rest: source, + source, + cursor: 0, kind, - location: start, + offset, range, } } #[inline] - fn skip_bytes(&mut self, bytes: usize) -> &'a str { - let skipped_str = &self.rest[..bytes]; - self.rest = &self.rest[bytes..]; - self.location += skipped_str.text_len(); + fn skip_bytes(&mut self, bytes: usize) -> &str { + let skipped_str = &self.source[self.cursor..self.cursor + bytes]; + self.cursor += bytes; skipped_str } #[inline] fn get_pos(&self) -> TextSize { - self.location + self.offset + TextSize::try_from(self.cursor).unwrap() } /// Returns the next byte in the string, if there is one. @@ -69,25 +76,23 @@ impl<'a> StringParser<'a> { /// When the next byte is a part of a multi-byte character. #[inline] fn next_byte(&mut self) -> Option { - self.rest.as_bytes().first().map(|&byte| { - self.rest = &self.rest[1..]; - self.location += TextSize::new(1); + self.source[self.cursor..].as_bytes().first().map(|&byte| { + self.cursor += 1; byte }) } #[inline] fn next_char(&mut self) -> Option { - self.rest.chars().next().map(|c| { - self.rest = &self.rest[c.len_utf8()..]; - self.location += c.text_len(); + self.source[self.cursor..].chars().next().map(|c| { + self.cursor += c.len_utf8(); c }) } #[inline] fn peek_byte(&self) -> Option { - self.rest.as_bytes().first().copied() + self.source[self.cursor..].as_bytes().first().copied() } fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { @@ -135,7 +140,7 @@ impl<'a> StringParser<'a> { }; let start_pos = self.get_pos(); - let Some(close_idx) = self.rest.find('}') else { + let Some(close_idx) = self.source[self.cursor..].find('}') else { return Err(LexicalError::new( LexicalErrorType::StringError, self.get_pos(), @@ -149,7 +154,8 @@ impl<'a> StringParser<'a> { .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) } - fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + /// Parse an escaped character, returning the new character. 
+ fn parse_escaped_char(&mut self) -> Result, LexicalError> { let Some(first_char) = self.next_char() else { return Err(LexicalError::new( LexicalErrorType::StringError, @@ -174,13 +180,13 @@ impl<'a> StringParser<'a> { 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, // Special cases where the escape sequence is not a single character - '\n' => return Ok(()), + '\n' => return Ok(None), '\r' => { if self.peek_byte() == Some(b'\n') { self.next_byte(); } - return Ok(()); + return Ok(None); } _ => { if self.kind.is_any_bytes() && !first_char.is_ascii() { @@ -194,21 +200,42 @@ impl<'a> StringParser<'a> { )); } - string.push('\\'); - - first_char + return Ok(Some(EscapedChar::Escape(first_char))); } }; - string.push(new_char); - - Ok(()) + Ok(Some(EscapedChar::Literal(new_char))) } - fn parse_fstring_middle(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { + fn parse_fstring_middle(mut self) -> Result { + // Fast-path: if the f-string doesn't contain any escape sequences, return the literal. + let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else { + return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { + value: self.source, + range: self.range, + })); + }; + + let mut value = String::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(index + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match &self.source.as_bytes()[self.cursor - 1] { + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need increase the location by 2 instead of 1. + b'{' => { + self.offset += TextSize::from(1); + value.push('{'); + } + b'}' => { + self.offset += TextSize::from(1); + value.push('}'); + } // We can encounter a `\` as the last character in a `FStringMiddle` // token which is valid in this context. For example, // @@ -229,71 +256,154 @@ impl<'a> StringParser<'a> { // This is still an invalid escape sequence, but we don't want to // raise a syntax error as is done by the CPython parser. It might // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { - self.parse_escaped_char(&mut value)?; + b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } + } } - // If there are any curly braces inside a `FStringMiddle` token, - // then they were escaped (i.e. `{{` or `}}`). This means that - // we need increase the location by 2 instead of 1. - ch @ ('{' | '}') => { - self.location += ch.text_len(); - value.push(ch); + ch => { + value.push(char::from(*ch)); } - ch => value.push(ch), } + + let Some(next_index) = + memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. 
+ let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + index = next_index; } + Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { value, range: self.range, })) } - fn parse_bytes(&mut self) -> Result { - let mut content = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - self.parse_escaped_char(&mut content)?; - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - content.push(ch); + fn parse_bytes(mut self) -> Result { + let index = first_non_ascii_byte(self.source.as_bytes()); + if index < self.source.len() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.offset + TextSize::try_from(index).unwrap(), + )); + } + + if self.kind.is_raw() { + // For raw strings, no escaping is necessary. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_bytes(), + range: self.range, + })); + } + + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_bytes(), + range: self.range, + })); + }; + + // If the string contains escape sequences, we need to parse them. + let mut value = Vec::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.extend_from_slice(before.as_bytes()); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c as u8), + Some(EscapedChar::Escape(c)) => { + value.push(b'\\'); + value.push(c as u8); } } + + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.extend_from_slice(rest.as_bytes()); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Bytes(ast::BytesLiteral { - value: content.chars().map(|c| c as u8).collect::>(), + value, range: self.range, })) } - fn parse_string(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); + fn parse_string(mut self) -> Result { if self.kind.is_raw() { - value.push_str(self.skip_bytes(self.rest.len())); - } else { - loop { - let Some(escape_idx) = self.rest.find('\\') else { - value.push_str(self.skip_bytes(self.rest.len())); - break; - }; + // For raw strings, no escaping is necessary. + return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + } - let before_with_slash = self.skip_bytes(escape_idx + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. 
+ return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + }; - value.push_str(before); - self.parse_escaped_char(&mut value)?; + // If the string contains escape sequences, we need to parse them. + let mut value = String::with_capacity(self.source.len()); + + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } } + + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Str(ast::StringLiteral { value: value.into_boxed_str(), unicode: self.kind.is_unicode(), @@ -301,7 +411,7 @@ impl<'a> StringParser<'a> { })) } - fn parse(&mut self) -> Result { + fn parse(self) -> Result { if self.kind.is_any_bytes() { self.parse_bytes() } else { @@ -311,7 +421,7 @@ impl<'a> StringParser<'a> { } pub(crate) fn parse_string_literal( - source: &str, + source: Box, kind: StringKind, triple_quoted: bool, range: TextRange, @@ -327,7 +437,7 @@ pub(crate) fn parse_string_literal( } pub(crate) fn parse_fstring_literal_element( - source: &str, + source: Box, is_raw: bool, range: TextRange, ) -> Result { @@ -360,7 +470,7 @@ pub(crate) fn concatenated_strings( if has_bytes && byte_literal_count < strings.len() { return Err(LexicalError::new( LexicalErrorType::OtherError( - "cannot mix bytes and nonbytes literals" + "cannot mix bytes and non-bytes literals" .to_string() .into_boxed_str(), ), From 46800aeadb9da78dd58a10ee44675c0510b2c115 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 16:25:48 -0500 Subject: [PATCH 2/5] Box other strings --- Cargo.lock | 13 +- Cargo.toml | 1 + .../rules/hardcoded_bind_all_interfaces.rs | 4 +- crates/ruff_linter/src/rules/flynt/helpers.rs | 4 +- crates/ruff_python_ast/src/comparable.rs | 2 +- crates/ruff_python_ast/src/nodes.rs | 8 +- crates/ruff_python_parser/Cargo.toml | 5 +- crates/ruff_python_parser/src/ascii.rs | 345 ------------------ crates/ruff_python_parser/src/lib.rs | 5 +- crates/ruff_python_parser/src/string.rs | 20 +- 10 files changed, 30 insertions(+), 377 deletions(-) delete mode 100644 crates/ruff_python_parser/src/ascii.rs diff --git a/Cargo.lock b/Cargo.lock index 5703ae61ca9f5..97511968ff8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "bstr" -version = "1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.3.9", + "regex-automata 0.4.3", "serde", ] @@ -1921,12 +1921,6 @@ dependencies = [ "regex-syntax 0.6.29", ] -[[package]] -name = "regex-automata" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" - [[package]] name = "regex-automata" version = "0.4.3" @@ -2342,6 +2336,7 @@ version = "0.0.0" dependencies = [ "anyhow", "bitflags 2.4.1", + "bstr", "insta", "is-macro", "itertools 0.12.1", diff --git a/Cargo.toml b/Cargo.toml index a783bbebef3e2..c4f4492c18e80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ argfile = { version = "0.1.6" } assert_cmd = { version = "2.0.13" } bincode = { version = "1.3.3" } bitflags = { version = "2.4.1" } +bstr = { version = "1.9.0" } cachedir = { version = "0.3.1" } chrono = { version = "0.4.33", default-features = false, features = ["clock"] } clap = { version = "4.4.18", features = ["derive"] } diff --git a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs index 38295b71316a2..0e4301ee44c07 100644 --- a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs +++ b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs @@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces { pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) { let is_bind_all_interface = match string { StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0", - StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0", + StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => { + &**value == "0.0.0.0" + } StringLike::BytesLiteral(_) => return, }; diff --git a/crates/ruff_linter/src/rules/flynt/helpers.rs b/crates/ruff_linter/src/rules/flynt/helpers.rs index 7a6af204d13f9..640f922d6faa2 100644 --- a/crates/ruff_linter/src/rules/flynt/helpers.rs +++ b/crates/ruff_linter/src/rules/flynt/helpers.rs @@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement { /// Convert a string to a [`ast::FStringElement::Literal`]. 
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement { ast::FStringElement::Literal(ast::FStringLiteralElement { - value: s.to_owned(), + value: s.to_string().into_boxed_str(), range: TextRange::default(), }) } @@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option { match expr { Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => { Some(ast::FStringElement::Literal(ast::FStringLiteralElement { - value: value.to_string(), + value: value.to_string().into_boxed_str(), range: *range, })) } diff --git a/crates/ruff_python_ast/src/comparable.rs b/crates/ruff_python_ast/src/comparable.rs index bc6327f01dca0..344bb615ce95e 100644 --- a/crates/ruff_python_ast/src/comparable.rs +++ b/crates/ruff_python_ast/src/comparable.rs @@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> { impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> { fn from(bytes_literal: &'a ast::BytesLiteral) -> Self { Self { - value: bytes_literal.value.as_slice(), + value: &bytes_literal.value, } } } diff --git a/crates/ruff_python_ast/src/nodes.rs b/crates/ruff_python_ast/src/nodes.rs index cfb8355c69f05..b6581eef40524 100644 --- a/crates/ruff_python_ast/src/nodes.rs +++ b/crates/ruff_python_ast/src/nodes.rs @@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement { #[derive(Clone, Debug, PartialEq)] pub struct FStringLiteralElement { pub range: TextRange, - pub value: String, + pub value: Box, } impl Ranged for FStringLiteralElement { @@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement { type Target = str; fn deref(&self) -> &Self::Target { - self.value.as_str() + &self.value } } @@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner { #[derive(Clone, Debug, Default, PartialEq)] pub struct BytesLiteral { pub range: TextRange, - pub value: Vec, + pub value: Box<[u8]>, } impl Ranged for BytesLiteral { @@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral { type Target = [u8]; fn deref(&self) -> &Self::Target { - self.value.as_slice() + &self.value } } diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 6bcdf6c902172..886bb07fec0b6 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } bitflags = { workspace = true } +bstr = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { workspace = true, default-features = false } memchr = { workspace = true } -unicode-ident = { workspace = true } -unicode_names2 = { workspace = true } rustc-hash = { workspace = true } static_assertions = { workspace = true } +unicode-ident = { workspace = true } +unicode_names2 = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/ascii.rs b/crates/ruff_python_parser/src/ascii.rs deleted file mode 100644 index 87614dc98a0e8..0000000000000 --- a/crates/ruff_python_parser/src/ascii.rs +++ /dev/null @@ -1,345 +0,0 @@ -#![allow( - clippy::cast_possible_truncation, - clippy::cast_possible_wrap, - clippy::cast_ptr_alignment, - clippy::inline_always, - clippy::ptr_as_ptr, - unsafe_code -)] - -//! Source: - -// The following ~400 lines of code exists for exactly one purpose, which is -// to optimize this code: -// -// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len()) -// -// Yes... 
Overengineered is a word that comes to mind, but this is effectively -// a very similar problem to memchr, and virtually nobody has been able to -// resist optimizing the crap out of that (except for perhaps the BSD and MUSL -// folks). In particular, this routine makes a very common case (ASCII) very -// fast, which seems worth it. We do stop short of adding AVX variants of the -// code below in order to retain our sanity and also to avoid needing to deal -// with runtime target feature detection. RESIST! -// -// In order to understand the SIMD version below, it would be good to read this -// comment describing how my memchr routine works: -// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106 -// -// The primary difference with memchr is that for ASCII, we can do a bit less -// work. In particular, we don't need to detect the presence of a specific -// byte, but rather, whether any byte has its most significant bit set. That -// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to -// _mm_movemask_epi8. - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const USIZE_BYTES: usize = core::mem::size_of::(); -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; - -// This is a mask where the most significant bit of each byte in the usize -// is set. We test this bit to determine whether a character is ASCII or not. -// Namely, a single byte is regarded as an ASCII codepoint if and only if it's -// most significant bit is not set. -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const ASCII_MASK_U64: u64 = 0x8080_8080_8080_8080; -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const ASCII_MASK: usize = ASCII_MASK_U64 as usize; - -/// Returns the index of the first non ASCII byte in the given slice. -/// -/// If slice only contains ASCII bytes, then the length of the slice is -/// returned. -pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize { - #[cfg(any(miri, not(target_arch = "x86_64")))] - { - first_non_ascii_byte_fallback(slice) - } - - #[cfg(all(not(miri), target_arch = "x86_64"))] - { - first_non_ascii_byte_sse2(slice) - } -} - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { - let align = USIZE_BYTES - 1; - let start_ptr = slice.as_ptr(); - let end_ptr = slice[slice.len()..].as_ptr(); - let mut ptr = start_ptr; - - unsafe { - if slice.len() < USIZE_BYTES { - return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); - } - - let chunk = read_unaligned_usize(ptr); - let mask = chunk & ASCII_MASK; - if mask != 0 { - return first_non_ascii_byte_mask(mask); - } - - ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); - debug_assert!(ptr > start_ptr); - debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); - if slice.len() >= FALLBACK_LOOP_SIZE { - while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let a = *ptr.cast::(); - let b = *ptr_add(ptr, USIZE_BYTES).cast::(); - if (a | b) & ASCII_MASK != 0 { - // What a kludge. We wrap the position finding code into - // a non-inlineable function, which makes the codegen in - // the tight loop above a bit better by avoiding a - // couple extra movs. We pay for it by two additional - // stores, but only in the case of finding a non-ASCII - // byte. 
- #[inline(never)] - unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize { - let a = *ptr.cast::(); - let b = *ptr_add(ptr, USIZE_BYTES).cast::(); - - let mut at = sub(ptr, start_ptr); - let maska = a & ASCII_MASK; - if maska != 0 { - return at + first_non_ascii_byte_mask(maska); - } - - at += USIZE_BYTES; - let maskb = b & ASCII_MASK; - debug_assert!(maskb != 0); - at + first_non_ascii_byte_mask(maskb) - } - return findpos(start_ptr, ptr); - } - ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); - } - } - first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) - } -} - -#[cfg(all(not(miri), target_arch = "x86_64"))] -fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { - use core::arch::x86_64::{ - __m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, - }; - - const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); - const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; - const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; - - let start_ptr = slice.as_ptr(); - let end_ptr = slice[slice.len()..].as_ptr(); - let mut ptr = start_ptr; - - unsafe { - if slice.len() < VECTOR_SIZE { - return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); - } - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(chunk); - if mask != 0 { - return mask.trailing_zeros() as usize; - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr); - debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); - if slice.len() >= VECTOR_LOOP_SIZE { - while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - - let or1 = _mm_or_si128(a, b); - let or2 = _mm_or_si128(c, d); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr, start_ptr); - let mask = _mm_movemask_epi8(a); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(b); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(c); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(d); - debug_assert!(mask != 0); - return at + mask.trailing_zeros() as usize; - } - ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); - } - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(chunk); - if mask != 0 { - return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; - } - ptr = ptr.add(VECTOR_SIZE); - } - first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) - } -} - -#[inline(always)] -unsafe fn first_non_ascii_byte_slow( - start_ptr: *const u8, - end_ptr: *const u8, - mut ptr: *const u8, -) -> usize { - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr); - - while ptr < end_ptr { - if *ptr > 0x7F { - return sub(ptr, start_ptr); - } - ptr = ptr.offset(1); - } - sub(end_ptr, start_ptr) -} - -/// Compute the position of the first ASCII byte in the given mask. 
-/// -/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is -/// 8 contiguous bytes of the slice being checked where *at least* one of those -/// bytes is not an ASCII byte. -/// -/// The position returned is always in the inclusive range [0, 7]. -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -fn first_non_ascii_byte_mask(mask: usize) -> usize { - #[cfg(target_endian = "little")] - { - mask.trailing_zeros() as usize / 8 - } - #[cfg(target_endian = "big")] - { - mask.leading_zeros() as usize / 8 - } -} - -/// Increment the given pointer by the given amount. -unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { - debug_assert!(amt < ::core::isize::MAX as usize); - ptr.add(amt) -} - -/// Decrement the given pointer by the given amount. -unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { - debug_assert!(amt < ::core::isize::MAX as usize); - ptr.offset((amt as isize).wrapping_neg()) -} - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { - use core::ptr; - - let mut n: usize = 0; - ptr::copy_nonoverlapping(ptr, std::ptr::addr_of_mut!(n) as *mut u8, USIZE_BYTES); - n -} - -/// Subtract `b` from `a` and return the difference. `a` should be greater than -/// or equal to `b`. -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(a >= b); - (a as usize) - (b as usize) -} - -#[cfg(test)] -mod tests { - use super::*; - - // Our testing approach here is to try and exhaustively test every case. - // This includes the position at which a non-ASCII byte occurs in addition - // to the alignment of the slice that we're searching. - - #[test] - fn positive_fallback_forward() { - for i in 0..517 { - let s = "a".repeat(i); - assert_eq!( - i, - first_non_ascii_byte_fallback(s.as_bytes()), - "i: {:?}, len: {:?}, s: {:?}", - i, - s.len(), - s - ); - } - } - - #[test] - #[cfg(target_arch = "x86_64")] - #[cfg(not(miri))] - fn positive_sse2_forward() { - for i in 0..517 { - let b = "a".repeat(i).into_bytes(); - assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); - } - } - - #[test] - #[cfg(not(miri))] - fn negative_fallback_forward() { - for i in 0..517 { - for align in 0..65 { - let mut s = "a".repeat(i); - s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); - let s = s.get(align..).unwrap_or(""); - assert_eq!( - i.saturating_sub(align), - first_non_ascii_byte_fallback(s.as_bytes()), - "i: {:?}, align: {:?}, len: {:?}, s: {:?}", - i, - align, - s.len(), - s - ); - } - } - } - - #[test] - #[cfg(target_arch = "x86_64")] - #[cfg(not(miri))] - fn negative_sse2_forward() { - for i in 0..517 { - for align in 0..65 { - let mut s = "a".repeat(i); - s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); - let s = s.get(align..).unwrap_or(""); - assert_eq!( - i.saturating_sub(align), - first_non_ascii_byte_sse2(s.as_bytes()), - "i: {:?}, align: {:?}, len: {:?}, s: {:?}", - i, - align, - s.len(), - s - ); - } - } - } -} diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 074746ec01ab8..7c9c5402fb442 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -119,11 +119,10 @@ pub use token::{StringKind, Tok, TokenKind}; use crate::lexer::LexResult; -mod function; -// Skip flattening lexer to distinguish from full ruff_python_parser -mod ascii; mod context; +mod function; mod invalid; +// Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; mod parser; mod soft_keywords; diff --git 
a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 124ba688a3a7a..807b645aef29a 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,9 +1,10 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. +use bstr::ByteSlice; + use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; -use crate::ascii::first_non_ascii_byte; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; @@ -218,9 +219,9 @@ impl StringParser { let mut value = String::with_capacity(self.source.len()); loop { - // Add the characters before the escape sequence to the string. - let before_with_slash = self.skip_bytes(index + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; + // Add the characters before the escape sequence (or curly brace) to the string. + let before_with_slash_or_brace = self.skip_bytes(index + 1); + let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1]; value.push_str(before); // Add the escaped character to the string. @@ -284,14 +285,13 @@ impl StringParser { } Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { - value, + value: value.into_boxed_str(), range: self.range, })) } fn parse_bytes(mut self) -> Result { - let index = first_non_ascii_byte(self.source.as_bytes()); - if index < self.source.len() { + if let Some(index) = self.source.as_bytes().find_non_ascii_byte() { return Err(LexicalError::new( LexicalErrorType::OtherError( "bytes can only contain ASCII literal characters" @@ -305,7 +305,7 @@ impl StringParser { if self.kind.is_raw() { // For raw strings, no escaping is necessary. return Ok(StringType::Bytes(ast::BytesLiteral { - value: self.source.into_bytes(), + value: self.source.into_boxed_bytes(), range: self.range, })); } @@ -313,7 +313,7 @@ impl StringParser { let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. 
return Ok(StringType::Bytes(ast::BytesLiteral { - value: self.source.into_bytes(), + value: self.source.into_boxed_bytes(), range: self.range, })); }; @@ -349,7 +349,7 @@ impl StringParser { } Ok(StringType::Bytes(ast::BytesLiteral { - value, + value: value.into_boxed_slice(), range: self.range, })) } From 5db7e35e8a9e421dd9d39e1cb42a8445be39d596 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 18:05:51 -0500 Subject: [PATCH 3/5] Use shared finder --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/string.rs | 12 ++++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97511968ff8c4..ba459f3f1cf23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2343,6 +2343,7 @@ dependencies = [ "lalrpop", "lalrpop-util", "memchr", + "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 886bb07fec0b6..e1291ab6a2e6c 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -28,6 +28,7 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } +once_cell = "1.19.0" [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 807b645aef29a..0451604040882 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,6 +1,8 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use bstr::ByteSlice; +use memchr::memmem; +use once_cell::sync::Lazy; use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -8,6 +10,8 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; +const BACKSLASH_FINDER: Lazy = Lazy::new(|| memmem::Finder::new(b"\\")); + pub(crate) enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), @@ -310,7 +314,7 @@ impl StringParser { })); } - let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Bytes(ast::BytesLiteral { value: self.source.into_boxed_bytes(), @@ -336,7 +340,7 @@ impl StringParser { } } - let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. let rest = &self.source[self.cursor..]; @@ -364,7 +368,7 @@ impl StringParser { })); } - let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Str(ast::StringLiteral { value: self.source, @@ -392,7 +396,7 @@ impl StringParser { } } - let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. 
let rest = &self.source[self.cursor..]; From 09be9b2c26d7f68714606128d22bcfdf0a26d2d3 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 23:03:11 -0500 Subject: [PATCH 4/5] Add benches --- Cargo.lock | 4 + crates/ruff_python_parser/Cargo.toml | 13 + crates/ruff_python_parser/benches/string.rs | 93 +++ crates/ruff_python_parser/src/lib.rs | 3 +- crates/ruff_python_parser/src/old_string.rs | 820 ++++++++++++++++++++ crates/ruff_python_parser/src/string.rs | 21 +- 6 files changed, 940 insertions(+), 14 deletions(-) create mode 100644 crates/ruff_python_parser/benches/string.rs create mode 100644 crates/ruff_python_parser/src/old_string.rs diff --git a/Cargo.lock b/Cargo.lock index ba459f3f1cf23..ae0b8f6a15259 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2337,17 +2337,21 @@ dependencies = [ "anyhow", "bitflags 2.4.1", "bstr", + "codspeed-criterion-compat", + "criterion", "insta", "is-macro", "itertools 0.12.1", "lalrpop", "lalrpop-util", "memchr", + "mimalloc", "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", "static_assertions", + "tikv-jemallocator", "tiny-keccak", "unicode-ident", "unicode_names2", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index e1291ab6a2e6c..075cc4ef8a00a 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -12,6 +12,11 @@ license = { workspace = true } build = "build.rs" [lib] +bench = false + +[[bench]] +name = "string" +harness = false [dependencies] ruff_python_ast = { path = "../ruff_python_ast" } @@ -29,6 +34,14 @@ static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } once_cell = "1.19.0" +criterion = { workspace = true, default-features = false } +codspeed-criterion-compat = { workspace = true, default-features = false, optional = true} + +[target.'cfg(target_os = "windows")'.dev-dependencies] +mimalloc = { workspace = true } + +[target.'cfg(all(not(target_os = "windows"), not(target_os = "openbsd"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "powerpc64")))'.dev-dependencies] +tikv-jemallocator = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/benches/string.rs b/crates/ruff_python_parser/benches/string.rs new file mode 100644 index 0000000000000..8e1a79c312af8 --- /dev/null +++ b/crates/ruff_python_parser/benches/string.rs @@ -0,0 +1,93 @@ +use criterion::{ + black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, Criterion, +}; +use ruff_python_parser::StringKind; +use ruff_text_size::TextRange; + +#[cfg(target_os = "windows")] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + +#[cfg(all( + not(target_os = "windows"), + not(target_os = "openbsd"), + any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "powerpc64" + ) +))] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +fn benchmark_parser(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("parse"); + + let s = "\"\"\"Validate length based{ on BIN for major brands: + https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_(IIN)\"\"\""; + + // group.bench_with_input("new_string", &s, |b, &s| { + // b.iter_batched( + // || s.to_string().into_boxed_str(), + // |data| { + // ruff_python_parser::string::parse_string_literal( + // black_box(data), + // StringKind::String, + // true, + // 
TextRange::default(), + // ) + // }, + // BatchSize::SmallInput, + // ); + // }); + // + // group.bench_function("old_string", |b| { + // b.iter_batched( + // || s.to_string(), + // |data| { + // ruff_python_parser::old_string::parse_string_literal( + // black_box(&data), + // StringKind::String, + // true, + // TextRange::default(), + // ) + // }, + // BatchSize::SmallInput, + // ); + // }); + + let s = "Item {i+1}"; + + group.bench_with_input("new_fstring", &s, |b, &s| { + b.iter_batched( + || s.to_string().into_boxed_str(), + |data| { + ruff_python_parser::string::parse_fstring_literal_element( + black_box(data), + true, + TextRange::default(), + ) + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("old_fstring", |b| { + b.iter_batched( + || s.to_string(), + |data| { + ruff_python_parser::old_string::parse_fstring_literal_element( + black_box(&data), + true, + TextRange::default(), + ) + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +criterion_group!(parser, benchmark_parser); +criterion_main!(parser); diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 7c9c5402fb442..8e855e5d92dc6 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -124,9 +124,10 @@ mod function; mod invalid; // Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; +pub mod old_string; mod parser; mod soft_keywords; -mod string; +pub mod string; mod token; mod token_source; pub mod typing; diff --git a/crates/ruff_python_parser/src/old_string.rs b/crates/ruff_python_parser/src/old_string.rs new file mode 100644 index 0000000000000..54f2dece59198 --- /dev/null +++ b/crates/ruff_python_parser/src/old_string.rs @@ -0,0 +1,820 @@ +//! Parsing of string literals, bytes literals, and implicit string concatenation. + +use ruff_python_ast::{self as ast, Expr}; +use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; + +use crate::lexer::{LexicalError, LexicalErrorType}; +use crate::string::FStringError; +use crate::token::{StringKind, Tok}; + +pub enum StringType { + Str(ast::StringLiteral), + Bytes(ast::BytesLiteral), + FString(ast::FString), +} + +impl Ranged for StringType { + fn range(&self) -> TextRange { + match self { + Self::Str(node) => node.range(), + Self::Bytes(node) => node.range(), + Self::FString(node) => node.range(), + } + } +} + +impl From for Expr { + fn from(string: StringType) -> Self { + match string { + StringType::Str(node) => Expr::from(node), + StringType::Bytes(node) => Expr::from(node), + StringType::FString(node) => Expr::from(node), + } + } +} + +struct StringParser<'a> { + rest: &'a str, + kind: StringKind, + location: TextSize, + range: TextRange, +} + +impl<'a> StringParser<'a> { + fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { + Self { + rest: source, + kind, + location: start, + range, + } + } + + #[inline] + fn skip_bytes(&mut self, bytes: usize) -> &'a str { + let skipped_str = &self.rest[..bytes]; + self.rest = &self.rest[bytes..]; + self.location += skipped_str.text_len(); + skipped_str + } + + #[inline] + fn get_pos(&self) -> TextSize { + self.location + } + + /// Returns the next byte in the string, if there is one. + /// + /// # Panics + /// + /// When the next byte is a part of a multi-byte character. 
+ #[inline] + fn next_byte(&mut self) -> Option { + self.rest.as_bytes().first().map(|&byte| { + self.rest = &self.rest[1..]; + self.location += TextSize::new(1); + byte + }) + } + + #[inline] + fn next_char(&mut self) -> Option { + self.rest.chars().next().map(|c| { + self.rest = &self.rest[c.len_utf8()..]; + self.location += c.text_len(); + c + }) + } + + #[inline] + fn peek_byte(&self) -> Option { + self.rest.as_bytes().first().copied() + } + + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { + let mut p: u32 = 0u32; + let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); + for i in 1..=literal_number { + match self.next_char() { + Some(c) => match c.to_digit(16) { + Some(d) => p += d << ((literal_number - i) * 4), + None => return Err(unicode_error), + }, + None => return Err(unicode_error), + } + } + match p { + 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), + _ => std::char::from_u32(p).ok_or(unicode_error), + } + } + + fn parse_octet(&mut self, o: u8) -> char { + let mut radix_bytes = [o, 0, 0]; + let mut len = 1; + + while len < 3 { + let Some(b'0'..=b'7') = self.peek_byte() else { + break; + }; + + radix_bytes[len] = self.next_byte().unwrap(); + len += 1; + } + + // OK because radix_bytes is always going to be in the ASCII range. + let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes"); + let value = u32::from_str_radix(radix_str, 8).unwrap(); + char::from_u32(value).unwrap() + } + + fn parse_unicode_name(&mut self) -> Result { + let start_pos = self.get_pos(); + + let Some('{') = self.next_char() else { + return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); + }; + + let start_pos = self.get_pos(); + let Some(close_idx) = self.rest.find('}') else { + return Err(LexicalError::new( + LexicalErrorType::StringError, + self.get_pos(), + )); + }; + + let name_and_ending = self.skip_bytes(close_idx + 1); + let name = &name_and_ending[..name_and_ending.len() - 1]; + + unicode_names2::character(name) + .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) + } + + fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + let Some(first_char) = self.next_char() else { + return Err(LexicalError::new( + LexicalErrorType::StringError, + self.get_pos(), + )); + }; + + let new_char = match first_char { + '\\' => '\\', + '\'' => '\'', + '\"' => '"', + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + o @ '0'..='7' => self.parse_octet(o as u8), + 'x' => self.parse_unicode_literal(2)?, + 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, + 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, + 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, + // Special cases where the escape sequence is not a single character + '\n' => return Ok(()), + '\r' => { + if self.peek_byte() == Some(b'\n') { + self.next_byte(); + } + + return Ok(()); + } + _ => { + if self.kind.is_any_bytes() && !first_char.is_ascii() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.get_pos(), + )); + } + + string.push('\\'); + + first_char + } + }; + + string.push(new_char); + + Ok(()) + } + + fn parse_fstring_middle(&mut self) -> Result { + let mut value = String::with_capacity(self.rest.len()); + while let Some(ch) = self.next_char() { + match ch { + // We 
can encounter a `\` as the last character in a `FStringMiddle` + // token which is valid in this context. For example, + // + // ```python + // f"\{foo} \{bar:\}" + // # ^ ^^ ^ + // ``` + // + // Here, the `FStringMiddle` token content will be "\" and " \" + // which is invalid if we look at the content in isolation: + // + // ```python + // "\" + // ``` + // + // However, the content is syntactically valid in the context of + // the f-string because it's a substring of the entire f-string. + // This is still an invalid escape sequence, but we don't want to + // raise a syntax error as is done by the CPython parser. It might + // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas + '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + self.parse_escaped_char(&mut value)?; + } + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need increase the location by 2 instead of 1. + ch @ ('{' | '}') => { + self.location += ch.text_len(); + value.push(ch); + } + ch => value.push(ch), + } + } + Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { + value: value.into_boxed_str(), + range: self.range, + })) + } + + fn parse_bytes(&mut self) -> Result { + let mut content = String::with_capacity(self.rest.len()); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + self.parse_escaped_char(&mut content)?; + } + ch => { + if !ch.is_ascii() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.get_pos(), + )); + } + content.push(ch); + } + } + } + Ok(StringType::Bytes(ast::BytesLiteral { + value: content + .chars() + .map(|c| c as u8) + .collect::>() + .into_boxed_slice(), + range: self.range, + })) + } + + fn parse_string(&mut self) -> Result { + let mut value = String::with_capacity(self.rest.len()); + if self.kind.is_raw() { + value.push_str(self.skip_bytes(self.rest.len())); + } else { + loop { + let Some(escape_idx) = self.rest.find('\\') else { + value.push_str(self.skip_bytes(self.rest.len())); + break; + }; + + let before_with_slash = self.skip_bytes(escape_idx + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + + value.push_str(before); + self.parse_escaped_char(&mut value)?; + } + } + Ok(StringType::Str(ast::StringLiteral { + value: value.into_boxed_str(), + unicode: self.kind.is_unicode(), + range: self.range, + })) + } + + fn parse(&mut self) -> Result { + if self.kind.is_any_bytes() { + self.parse_bytes() + } else { + self.parse_string() + } + } +} + +pub fn parse_string_literal( + source: &str, + kind: StringKind, + triple_quoted: bool, + range: TextRange, +) -> Result { + let start_location = range.start() + + kind.prefix_len() + + if triple_quoted { + TextSize::from(3) + } else { + TextSize::from(1) + }; + StringParser::new(source, kind, start_location, range).parse() +} + +pub fn parse_fstring_literal_element( + source: &str, + is_raw: bool, + range: TextRange, +) -> Result { + let kind = if is_raw { + StringKind::RawString + } else { + StringKind::String + }; + StringParser::new(source, kind, range.start(), range).parse_fstring_middle() +} + +pub(crate) fn concatenated_strings( + strings: Vec, + range: TextRange, +) -> Result { + #[cfg(debug_assertions)] + debug_assert!(strings.len() > 1); + + let mut has_fstring = false; + let mut byte_literal_count = 0; + for string in 
&strings { + match string { + StringType::FString(_) => has_fstring = true, + StringType::Bytes(_) => byte_literal_count += 1, + StringType::Str(_) => {} + } + } + let has_bytes = byte_literal_count > 0; + + if has_bytes && byte_literal_count < strings.len() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "cannot mix bytes and nonbytes literals" + .to_string() + .into_boxed_str(), + ), + range.start(), + )); + } + + if has_bytes { + let mut values = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::Bytes(value) => values.push(value), + _ => unreachable!("Unexpected non-bytes literal."), + } + } + return Ok(Expr::from(ast::ExprBytesLiteral { + value: ast::BytesLiteralValue::concatenated(values), + range, + })); + } + + if !has_fstring { + let mut values = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::Str(value) => values.push(value), + _ => unreachable!("Unexpected non-string literal."), + } + } + return Ok(Expr::from(ast::ExprStringLiteral { + value: ast::StringLiteralValue::concatenated(values), + range, + })); + } + + let mut parts = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)), + StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)), + StringType::Bytes(_) => unreachable!("Unexpected bytes literal."), + } + } + + Ok(ast::ExprFString { + value: ast::FStringValue::concatenated(parts), + range, + } + .into()) +} + +/// Represents the different types of errors that can occur during parsing of an f-string. +#[derive(Copy, Debug, Clone, PartialEq)] +pub enum FStringErrorType { + /// Expected a right brace after an opened left brace. + UnclosedLbrace, + /// An invalid conversion flag was encountered. + InvalidConversionFlag, + /// A single right brace was encountered. + SingleRbrace, + /// Unterminated string. + UnterminatedString, + /// Unterminated triple-quoted string. + UnterminatedTripleQuotedString, + // TODO(dhruvmanila): The parser can't catch all cases of this error, but + // wherever it can, we'll display the correct error message. + /// A lambda expression without parentheses was encountered. 
+ LambdaWithoutParentheses, +} + +impl std::fmt::Display for FStringErrorType { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use FStringErrorType::{ + InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace, + UnterminatedString, UnterminatedTripleQuotedString, + }; + match self { + UnclosedLbrace => write!(f, "expecting '}}'"), + InvalidConversionFlag => write!(f, "invalid conversion character"), + SingleRbrace => write!(f, "single '}}' is not allowed"), + UnterminatedString => write!(f, "unterminated string"), + UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"), + LambdaWithoutParentheses => { + write!(f, "lambda expressions are not allowed without parentheses") + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::lexer::LexicalErrorType; + use crate::parser::parse_suite; + use crate::{ParseErrorType, Suite}; + + use super::*; + + const WINDOWS_EOL: &str = "\r\n"; + const MAC_EOL: &str = "\r"; + const UNIX_EOL: &str = "\n"; + + fn string_parser_escaped_eol(eol: &str) -> Suite { + let source = format!(r"'text \{eol}more text'"); + parse_suite(&source).unwrap() + } + + #[test] + fn test_string_parser_escaped_unix_eol() { + let parse_ast = string_parser_escaped_eol(UNIX_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_string_parser_escaped_mac_eol() { + let parse_ast = string_parser_escaped_eol(MAC_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_string_parser_escaped_windows_eol() { + let parse_ast = string_parser_escaped_eol(WINDOWS_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring() { + let source = r#"f"{a}{ b }{{foo}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_spec() { + let source = r#"f"{foo:{spec}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_not_nested_spec() { + let source = r#"f"{foo:spec}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_empty_fstring() { + insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap()); + } + + #[test] + fn test_fstring_parse_self_documenting_base() { + let source = r#"f"{user=}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_self_documenting_base_more() { + let source = r#"f"mix {user=} with text and {second=}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_self_documenting_format() { + let source = r#"f"{user=:>10}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + fn parse_fstring_error(source: &str) -> FStringErrorType { + parse_suite(source) + .map_err(|e| match e.error { + ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e, + e => unreachable!("Expected FStringError: {:?}", e), + }) + .expect_err("Expected error") + } + + #[test] + fn test_parse_invalid_fstring() { + use FStringErrorType::{InvalidConversionFlag, LambdaWithoutParentheses}; + + assert_eq!(parse_fstring_error(r#"f"{5!x}""#), InvalidConversionFlag); + assert_eq!( + parse_fstring_error("f'{lambda x:{x}}'"), + LambdaWithoutParentheses + ); + assert_eq!( + parse_fstring_error("f'{lambda x: 
{x}}'"), + LambdaWithoutParentheses + ); + assert!(parse_suite(r#"f"{class}""#,).is_err()); + } + + #[test] + fn test_parse_fstring_not_equals() { + let source = r#"f"{1 != 2}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_equals() { + let source = r#"f"{42 == 42}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_self_doc_prec_space() { + let source = r#"f"{x =}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_self_doc_trailing_space() { + let source = r#"f"{x= }""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_yield_expr() { + let source = r#"f"{yield}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_string_concat() { + let source = "'Hello ' 'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_string_concat_1() { + let source = "'Hello ' u'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_string_concat_2() { + let source = "u'Hello ' 'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_1() { + let source = "'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_2() { + let source = "'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_3() { + let source = "'Hello ' f'world{\"!\"}'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_4() { + let source = "'Hello ' f'world{\"!\"}' 'again!'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_f_string_concat_1() { + let source = "u'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_f_string_concat_2() { + let source = "u'Hello ' f'world' '!'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_string_triple_quotes_with_kind() { + let source = "u'''Hello, world!'''"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_single_quoted_byte() { + // single quote + let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f 
!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_double_quoted_byte() { + // double quote + let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_char_in_byte_literal() { + // backslash does not escape + let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_1() { + let source = r"rb'\x1z'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_2() { + let source = r"rb'\\'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_octet() { + let source = r"b'\43a\4\1234'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_newline() { + let source = r#"f"\n{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_constant_range() { + let source = r#"f"aaa{bbb}ccc{ddd}eee""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_unescaped_newline() { + let source = r#"f""" +{x}""""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_character() { + let source = r#"f"\\{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_fstring() { + let source = r#"rf"{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_triple_quoted_raw_fstring() { + let source = r#"rf"""{x}""""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_line_continuation() { + let source = r#"rf"\ +{x}""#; + let 
parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_string_spec() { + let source = r#"f"{foo:{''}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_concatenation_string_spec() { + let source = r#"f"{foo:{'' ''}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + /// + #[test] + fn test_dont_panic_on_8_in_octal_escape() { + let source = r"bold = '\038[1m'"; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + macro_rules! test_aliases_parse { + ($($name:ident: $alias:expr,)*) => { + $( + #[test] + fn $name() { + let source = format!(r#""\N{{{0}}}""#, $alias); + let parse_ast = parse_suite(&source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + )* + } + } + + test_aliases_parse! { + test_backspace_alias: "BACKSPACE", + test_bell_alias: "BEL", + test_carriage_return_alias: "CARRIAGE RETURN", + test_delete_alias: "DELETE", + test_escape_alias: "ESCAPE", + test_form_feed_alias: "FORM FEED", + test_hts_alias: "HTS", + test_character_tabulation_with_justification_alias: "CHARACTER TABULATION WITH JUSTIFICATION", + } +} diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 0451604040882..79d423bfaffb7 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,8 +1,6 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use bstr::ByteSlice; -use memchr::memmem; -use once_cell::sync::Lazy; use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -10,9 +8,7 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; -const BACKSLASH_FINDER: Lazy = Lazy::new(|| memmem::Finder::new(b"\\")); - -pub(crate) enum StringType { +pub enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), FString(ast::FString), @@ -314,7 +310,7 @@ impl StringParser { })); } - let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Bytes(ast::BytesLiteral { value: self.source.into_boxed_bytes(), @@ -340,7 +336,7 @@ impl StringParser { } } - let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. let rest = &self.source[self.cursor..]; @@ -368,7 +364,7 @@ impl StringParser { })); } - let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Str(ast::StringLiteral { value: self.source, @@ -396,8 +392,7 @@ impl StringParser { } } - let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) - else { + let Some(next_escape) = self.source[self.cursor..].find('\\') else { // Add the rest of the string to the value. 
let rest = &self.source[self.cursor..]; value.push_str(rest); @@ -424,7 +419,7 @@ impl StringParser { } } -pub(crate) fn parse_string_literal( +pub fn parse_string_literal( source: Box, kind: StringKind, triple_quoted: bool, @@ -440,7 +435,7 @@ pub(crate) fn parse_string_literal( StringParser::new(source, kind, start_location, range).parse() } -pub(crate) fn parse_fstring_literal_element( +pub fn parse_fstring_literal_element( source: Box, is_raw: bool, range: TextRange, @@ -529,7 +524,7 @@ pub(crate) fn concatenated_strings( // TODO: consolidate these with ParseError /// An error that occurred during parsing of an f-string. #[derive(Debug, Clone, PartialEq)] -struct FStringError { +pub(crate) struct FStringError { /// The type of error that occurred. pub(crate) error: FStringErrorType, /// The location of the error. From 58178b30c0eae58977715f09697bca6257507519 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 9 Feb 2024 15:42:10 -0500 Subject: [PATCH 5/5] Revert benchmarking code --- Cargo.lock | 5 - crates/ruff_python_parser/Cargo.toml | 14 - crates/ruff_python_parser/benches/string.rs | 93 --- crates/ruff_python_parser/src/lib.rs | 3 +- crates/ruff_python_parser/src/old_string.rs | 820 -------------------- crates/ruff_python_parser/src/string.rs | 8 +- 6 files changed, 5 insertions(+), 938 deletions(-) delete mode 100644 crates/ruff_python_parser/benches/string.rs delete mode 100644 crates/ruff_python_parser/src/old_string.rs diff --git a/Cargo.lock b/Cargo.lock index ae0b8f6a15259..97511968ff8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2337,21 +2337,16 @@ dependencies = [ "anyhow", "bitflags 2.4.1", "bstr", - "codspeed-criterion-compat", - "criterion", "insta", "is-macro", "itertools 0.12.1", "lalrpop", "lalrpop-util", "memchr", - "mimalloc", - "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", "static_assertions", - "tikv-jemallocator", "tiny-keccak", "unicode-ident", "unicode_names2", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 075cc4ef8a00a..886bb07fec0b6 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -12,11 +12,6 @@ license = { workspace = true } build = "build.rs" [lib] -bench = false - -[[bench]] -name = "string" -harness = false [dependencies] ruff_python_ast = { path = "../ruff_python_ast" } @@ -33,15 +28,6 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } -once_cell = "1.19.0" -criterion = { workspace = true, default-features = false } -codspeed-criterion-compat = { workspace = true, default-features = false, optional = true} - -[target.'cfg(target_os = "windows")'.dev-dependencies] -mimalloc = { workspace = true } - -[target.'cfg(all(not(target_os = "windows"), not(target_os = "openbsd"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "powerpc64")))'.dev-dependencies] -tikv-jemallocator = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/benches/string.rs b/crates/ruff_python_parser/benches/string.rs deleted file mode 100644 index 8e1a79c312af8..0000000000000 --- a/crates/ruff_python_parser/benches/string.rs +++ /dev/null @@ -1,93 +0,0 @@ -use criterion::{ - black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, Criterion, -}; -use ruff_python_parser::StringKind; -use ruff_text_size::TextRange; - -#[cfg(target_os = "windows")] -#[global_allocator] 
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -#[cfg(all( - not(target_os = "windows"), - not(target_os = "openbsd"), - any( - target_arch = "x86_64", - target_arch = "aarch64", - target_arch = "powerpc64" - ) -))] -#[global_allocator] -static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; - -fn benchmark_parser(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("parse"); - - let s = "\"\"\"Validate length based{ on BIN for major brands: - https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_(IIN)\"\"\""; - - // group.bench_with_input("new_string", &s, |b, &s| { - // b.iter_batched( - // || s.to_string().into_boxed_str(), - // |data| { - // ruff_python_parser::string::parse_string_literal( - // black_box(data), - // StringKind::String, - // true, - // TextRange::default(), - // ) - // }, - // BatchSize::SmallInput, - // ); - // }); - // - // group.bench_function("old_string", |b| { - // b.iter_batched( - // || s.to_string(), - // |data| { - // ruff_python_parser::old_string::parse_string_literal( - // black_box(&data), - // StringKind::String, - // true, - // TextRange::default(), - // ) - // }, - // BatchSize::SmallInput, - // ); - // }); - - let s = "Item {i+1}"; - - group.bench_with_input("new_fstring", &s, |b, &s| { - b.iter_batched( - || s.to_string().into_boxed_str(), - |data| { - ruff_python_parser::string::parse_fstring_literal_element( - black_box(data), - true, - TextRange::default(), - ) - }, - BatchSize::SmallInput, - ); - }); - - group.bench_function("old_fstring", |b| { - b.iter_batched( - || s.to_string(), - |data| { - ruff_python_parser::old_string::parse_fstring_literal_element( - black_box(&data), - true, - TextRange::default(), - ) - }, - BatchSize::SmallInput, - ); - }); - - group.finish(); -} - -criterion_group!(parser, benchmark_parser); -criterion_main!(parser); diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 8e855e5d92dc6..7c9c5402fb442 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -124,10 +124,9 @@ mod function; mod invalid; // Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; -pub mod old_string; mod parser; mod soft_keywords; -pub mod string; +mod string; mod token; mod token_source; pub mod typing; diff --git a/crates/ruff_python_parser/src/old_string.rs b/crates/ruff_python_parser/src/old_string.rs deleted file mode 100644 index 54f2dece59198..0000000000000 --- a/crates/ruff_python_parser/src/old_string.rs +++ /dev/null @@ -1,820 +0,0 @@ -//! Parsing of string literals, bytes literals, and implicit string concatenation. 
- -use ruff_python_ast::{self as ast, Expr}; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; - -use crate::lexer::{LexicalError, LexicalErrorType}; -use crate::string::FStringError; -use crate::token::{StringKind, Tok}; - -pub enum StringType { - Str(ast::StringLiteral), - Bytes(ast::BytesLiteral), - FString(ast::FString), -} - -impl Ranged for StringType { - fn range(&self) -> TextRange { - match self { - Self::Str(node) => node.range(), - Self::Bytes(node) => node.range(), - Self::FString(node) => node.range(), - } - } -} - -impl From for Expr { - fn from(string: StringType) -> Self { - match string { - StringType::Str(node) => Expr::from(node), - StringType::Bytes(node) => Expr::from(node), - StringType::FString(node) => Expr::from(node), - } - } -} - -struct StringParser<'a> { - rest: &'a str, - kind: StringKind, - location: TextSize, - range: TextRange, -} - -impl<'a> StringParser<'a> { - fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { - Self { - rest: source, - kind, - location: start, - range, - } - } - - #[inline] - fn skip_bytes(&mut self, bytes: usize) -> &'a str { - let skipped_str = &self.rest[..bytes]; - self.rest = &self.rest[bytes..]; - self.location += skipped_str.text_len(); - skipped_str - } - - #[inline] - fn get_pos(&self) -> TextSize { - self.location - } - - /// Returns the next byte in the string, if there is one. - /// - /// # Panics - /// - /// When the next byte is a part of a multi-byte character. - #[inline] - fn next_byte(&mut self) -> Option { - self.rest.as_bytes().first().map(|&byte| { - self.rest = &self.rest[1..]; - self.location += TextSize::new(1); - byte - }) - } - - #[inline] - fn next_char(&mut self) -> Option { - self.rest.chars().next().map(|c| { - self.rest = &self.rest[c.len_utf8()..]; - self.location += c.text_len(); - c - }) - } - - #[inline] - fn peek_byte(&self) -> Option { - self.rest.as_bytes().first().copied() - } - - fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { - let mut p: u32 = 0u32; - let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); - for i in 1..=literal_number { - match self.next_char() { - Some(c) => match c.to_digit(16) { - Some(d) => p += d << ((literal_number - i) * 4), - None => return Err(unicode_error), - }, - None => return Err(unicode_error), - } - } - match p { - 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), - _ => std::char::from_u32(p).ok_or(unicode_error), - } - } - - fn parse_octet(&mut self, o: u8) -> char { - let mut radix_bytes = [o, 0, 0]; - let mut len = 1; - - while len < 3 { - let Some(b'0'..=b'7') = self.peek_byte() else { - break; - }; - - radix_bytes[len] = self.next_byte().unwrap(); - len += 1; - } - - // OK because radix_bytes is always going to be in the ASCII range. 
- let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes"); - let value = u32::from_str_radix(radix_str, 8).unwrap(); - char::from_u32(value).unwrap() - } - - fn parse_unicode_name(&mut self) -> Result { - let start_pos = self.get_pos(); - - let Some('{') = self.next_char() else { - return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); - }; - - let start_pos = self.get_pos(); - let Some(close_idx) = self.rest.find('}') else { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )); - }; - - let name_and_ending = self.skip_bytes(close_idx + 1); - let name = &name_and_ending[..name_and_ending.len() - 1]; - - unicode_names2::character(name) - .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) - } - - fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { - let Some(first_char) = self.next_char() else { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )); - }; - - let new_char = match first_char { - '\\' => '\\', - '\'' => '\'', - '\"' => '"', - 'a' => '\x07', - 'b' => '\x08', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', - o @ '0'..='7' => self.parse_octet(o as u8), - 'x' => self.parse_unicode_literal(2)?, - 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, - 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, - 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, - // Special cases where the escape sequence is not a single character - '\n' => return Ok(()), - '\r' => { - if self.peek_byte() == Some(b'\n') { - self.next_byte(); - } - - return Ok(()); - } - _ => { - if self.kind.is_any_bytes() && !first_char.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - - string.push('\\'); - - first_char - } - }; - - string.push(new_char); - - Ok(()) - } - - fn parse_fstring_middle(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - // We can encounter a `\` as the last character in a `FStringMiddle` - // token which is valid in this context. For example, - // - // ```python - // f"\{foo} \{bar:\}" - // # ^ ^^ ^ - // ``` - // - // Here, the `FStringMiddle` token content will be "\" and " \" - // which is invalid if we look at the content in isolation: - // - // ```python - // "\" - // ``` - // - // However, the content is syntactically valid in the context of - // the f-string because it's a substring of the entire f-string. - // This is still an invalid escape sequence, but we don't want to - // raise a syntax error as is done by the CPython parser. It might - // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { - self.parse_escaped_char(&mut value)?; - } - // If there are any curly braces inside a `FStringMiddle` token, - // then they were escaped (i.e. `{{` or `}}`). This means that - // we need increase the location by 2 instead of 1. 
- ch @ ('{' | '}') => { - self.location += ch.text_len(); - value.push(ch); - } - ch => value.push(ch), - } - } - Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { - value: value.into_boxed_str(), - range: self.range, - })) - } - - fn parse_bytes(&mut self) -> Result { - let mut content = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - self.parse_escaped_char(&mut content)?; - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - content.push(ch); - } - } - } - Ok(StringType::Bytes(ast::BytesLiteral { - value: content - .chars() - .map(|c| c as u8) - .collect::>() - .into_boxed_slice(), - range: self.range, - })) - } - - fn parse_string(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - if self.kind.is_raw() { - value.push_str(self.skip_bytes(self.rest.len())); - } else { - loop { - let Some(escape_idx) = self.rest.find('\\') else { - value.push_str(self.skip_bytes(self.rest.len())); - break; - }; - - let before_with_slash = self.skip_bytes(escape_idx + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; - - value.push_str(before); - self.parse_escaped_char(&mut value)?; - } - } - Ok(StringType::Str(ast::StringLiteral { - value: value.into_boxed_str(), - unicode: self.kind.is_unicode(), - range: self.range, - })) - } - - fn parse(&mut self) -> Result { - if self.kind.is_any_bytes() { - self.parse_bytes() - } else { - self.parse_string() - } - } -} - -pub fn parse_string_literal( - source: &str, - kind: StringKind, - triple_quoted: bool, - range: TextRange, -) -> Result { - let start_location = range.start() - + kind.prefix_len() - + if triple_quoted { - TextSize::from(3) - } else { - TextSize::from(1) - }; - StringParser::new(source, kind, start_location, range).parse() -} - -pub fn parse_fstring_literal_element( - source: &str, - is_raw: bool, - range: TextRange, -) -> Result { - let kind = if is_raw { - StringKind::RawString - } else { - StringKind::String - }; - StringParser::new(source, kind, range.start(), range).parse_fstring_middle() -} - -pub(crate) fn concatenated_strings( - strings: Vec, - range: TextRange, -) -> Result { - #[cfg(debug_assertions)] - debug_assert!(strings.len() > 1); - - let mut has_fstring = false; - let mut byte_literal_count = 0; - for string in &strings { - match string { - StringType::FString(_) => has_fstring = true, - StringType::Bytes(_) => byte_literal_count += 1, - StringType::Str(_) => {} - } - } - let has_bytes = byte_literal_count > 0; - - if has_bytes && byte_literal_count < strings.len() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "cannot mix bytes and nonbytes literals" - .to_string() - .into_boxed_str(), - ), - range.start(), - )); - } - - if has_bytes { - let mut values = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::Bytes(value) => values.push(value), - _ => unreachable!("Unexpected non-bytes literal."), - } - } - return Ok(Expr::from(ast::ExprBytesLiteral { - value: ast::BytesLiteralValue::concatenated(values), - range, - })); - } - - if !has_fstring { - let mut values = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::Str(value) => values.push(value), - _ => unreachable!("Unexpected non-string literal."), - } - } - return 
Ok(Expr::from(ast::ExprStringLiteral { - value: ast::StringLiteralValue::concatenated(values), - range, - })); - } - - let mut parts = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)), - StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)), - StringType::Bytes(_) => unreachable!("Unexpected bytes literal."), - } - } - - Ok(ast::ExprFString { - value: ast::FStringValue::concatenated(parts), - range, - } - .into()) -} - -/// Represents the different types of errors that can occur during parsing of an f-string. -#[derive(Copy, Debug, Clone, PartialEq)] -pub enum FStringErrorType { - /// Expected a right brace after an opened left brace. - UnclosedLbrace, - /// An invalid conversion flag was encountered. - InvalidConversionFlag, - /// A single right brace was encountered. - SingleRbrace, - /// Unterminated string. - UnterminatedString, - /// Unterminated triple-quoted string. - UnterminatedTripleQuotedString, - // TODO(dhruvmanila): The parser can't catch all cases of this error, but - // wherever it can, we'll display the correct error message. - /// A lambda expression without parentheses was encountered. - LambdaWithoutParentheses, -} - -impl std::fmt::Display for FStringErrorType { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - use FStringErrorType::{ - InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace, - UnterminatedString, UnterminatedTripleQuotedString, - }; - match self { - UnclosedLbrace => write!(f, "expecting '}}'"), - InvalidConversionFlag => write!(f, "invalid conversion character"), - SingleRbrace => write!(f, "single '}}' is not allowed"), - UnterminatedString => write!(f, "unterminated string"), - UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"), - LambdaWithoutParentheses => { - write!(f, "lambda expressions are not allowed without parentheses") - } - } - } -} - -#[cfg(test)] -mod tests { - use crate::lexer::LexicalErrorType; - use crate::parser::parse_suite; - use crate::{ParseErrorType, Suite}; - - use super::*; - - const WINDOWS_EOL: &str = "\r\n"; - const MAC_EOL: &str = "\r"; - const UNIX_EOL: &str = "\n"; - - fn string_parser_escaped_eol(eol: &str) -> Suite { - let source = format!(r"'text \{eol}more text'"); - parse_suite(&source).unwrap() - } - - #[test] - fn test_string_parser_escaped_unix_eol() { - let parse_ast = string_parser_escaped_eol(UNIX_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_string_parser_escaped_mac_eol() { - let parse_ast = string_parser_escaped_eol(MAC_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_string_parser_escaped_windows_eol() { - let parse_ast = string_parser_escaped_eol(WINDOWS_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring() { - let source = r#"f"{a}{ b }{{foo}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_spec() { - let source = r#"f"{foo:{spec}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_not_nested_spec() { - let source = r#"f"{foo:spec}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_empty_fstring() { - 
insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap()); - } - - #[test] - fn test_fstring_parse_self_documenting_base() { - let source = r#"f"{user=}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_self_documenting_base_more() { - let source = r#"f"mix {user=} with text and {second=}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_self_documenting_format() { - let source = r#"f"{user=:>10}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - fn parse_fstring_error(source: &str) -> FStringErrorType { - parse_suite(source) - .map_err(|e| match e.error { - ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e, - e => unreachable!("Expected FStringError: {:?}", e), - }) - .expect_err("Expected error") - } - - #[test] - fn test_parse_invalid_fstring() { - use FStringErrorType::{InvalidConversionFlag, LambdaWithoutParentheses}; - - assert_eq!(parse_fstring_error(r#"f"{5!x}""#), InvalidConversionFlag); - assert_eq!( - parse_fstring_error("f'{lambda x:{x}}'"), - LambdaWithoutParentheses - ); - assert_eq!( - parse_fstring_error("f'{lambda x: {x}}'"), - LambdaWithoutParentheses - ); - assert!(parse_suite(r#"f"{class}""#,).is_err()); - } - - #[test] - fn test_parse_fstring_not_equals() { - let source = r#"f"{1 != 2}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_equals() { - let source = r#"f"{42 == 42}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_self_doc_prec_space() { - let source = r#"f"{x =}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_self_doc_trailing_space() { - let source = r#"f"{x= }""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_yield_expr() { - let source = r#"f"{yield}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_string_concat() { - let source = "'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_string_concat_1() { - let source = "'Hello ' u'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_string_concat_2() { - let source = "u'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_1() { - let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_2() { - let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_3() { - let source = "'Hello ' f'world{\"!\"}'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_4() { - let source = "'Hello ' f'world{\"!\"}' 'again!'"; - let parse_ast = parse_suite(source).unwrap(); - 
insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_f_string_concat_1() { - let source = "u'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_f_string_concat_2() { - let source = "u'Hello ' f'world' '!'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_string_triple_quotes_with_kind() { - let source = "u'''Hello, world!'''"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_single_quoted_byte() { - // single quote - let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_double_quoted_byte() { - // double quote - let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_escape_char_in_byte_literal() { - // backslash does not escape - let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_byte_literal_1() { - let source = r"rb'\x1z'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_byte_literal_2() { - let source = r"rb'\\'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_escape_octet() { - let source = r"b'\43a\4\1234'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_escaped_newline() { - let source = r#"f"\n{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_constant_range() { - let source = r#"f"aaa{bbb}ccc{ddd}eee""#; - let parse_ast = parse_suite(source).unwrap(); - 
insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_unescaped_newline() { - let source = r#"f""" -{x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_escaped_character() { - let source = r#"f"\\{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_fstring() { - let source = r#"rf"{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_triple_quoted_raw_fstring() { - let source = r#"rf"""{x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_line_continuation() { - let source = r#"rf"\ -{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_string_spec() { - let source = r#"f"{foo:{''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_concatenation_string_spec() { - let source = r#"f"{foo:{'' ''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - /// - #[test] - fn test_dont_panic_on_8_in_octal_escape() { - let source = r"bold = '\038[1m'"; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - macro_rules! test_aliases_parse { - ($($name:ident: $alias:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!(r#""\N{{{0}}}""#, $alias); - let parse_ast = parse_suite(&source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - )* - } - } - - test_aliases_parse! { - test_backspace_alias: "BACKSPACE", - test_bell_alias: "BEL", - test_carriage_return_alias: "CARRIAGE RETURN", - test_delete_alias: "DELETE", - test_escape_alias: "ESCAPE", - test_form_feed_alias: "FORM FEED", - test_hts_alias: "HTS", - test_character_tabulation_with_justification_alias: "CHARACTER TABULATION WITH JUSTIFICATION", - } -} diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 79d423bfaffb7..fb536537216a0 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -8,7 +8,7 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; -pub enum StringType { +pub(crate) enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), FString(ast::FString), @@ -419,7 +419,7 @@ impl StringParser { } } -pub fn parse_string_literal( +pub(crate) fn parse_string_literal( source: Box, kind: StringKind, triple_quoted: bool, @@ -435,7 +435,7 @@ pub fn parse_string_literal( StringParser::new(source, kind, start_location, range).parse() } -pub fn parse_fstring_literal_element( +pub(crate) fn parse_fstring_literal_element( source: Box, is_raw: bool, range: TextRange, @@ -524,7 +524,7 @@ pub(crate) fn concatenated_strings( // TODO: consolidate these with ParseError /// An error that occurred during parsing of an f-string. #[derive(Debug, Clone, PartialEq)] -pub(crate) struct FStringError { +struct FStringError { /// The type of error that occurred. pub(crate) error: FStringErrorType, /// The location of the error.
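
Note on the escape-scanning change above: the hunks replace the lazily-initialized memmem::Finder with a direct memchr::memchr call and copy whole unescaped spans with push_str instead of walking the literal character by character. The following is a minimal, dependency-free sketch of that scanning pattern, not the parser's actual code: it uses str::find where the parser uses memchr::memchr on the byte slice, and the hypothetical unescape/match arm table stands in for parse_escaped_char with only a few escapes handled.

    fn unescape(source: &str) -> String {
        let mut out = String::with_capacity(source.len());
        let mut rest = source;
        loop {
            // Find the next escape sequence; if there is none, copy the rest in bulk.
            let Some(idx) = rest.find('\\') else {
                out.push_str(rest);
                break;
            };
            // Bulk-copy the unescaped prefix in a single push_str call.
            out.push_str(&rest[..idx]);
            let mut chars = rest[idx + 1..].chars();
            // Decode the character after the backslash (deliberately simplified).
            match chars.next() {
                Some('n') => out.push('\n'),
                Some('t') => out.push('\t'),
                Some('\\') => out.push('\\'),
                Some(other) => {
                    // Unknown escape: keep the backslash, mirroring the parser's fallback.
                    out.push('\\');
                    out.push(other);
                }
                None => break,
            }
            rest = chars.as_str();
        }
        out
    }

    fn main() {
        assert_eq!(unescape(r"a\tb\nc"), "a\tb\nc");
        assert_eq!(unescape("no escapes"), "no escapes");
    }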
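
The other source of avoided cloning is the owned fast path visible in the string.rs hunks: when a literal contains no backslash, the new parser returns the Box<str> it was handed instead of copying it into a fresh String. A hedged, simplified illustration of that idea follows; parse_plain_literal is a hypothetical stand-in, and the real StringParser additionally handles raw strings, prefixes, and byte literals.

    fn parse_plain_literal(source: Box<str>) -> Box<str> {
        // Fast path: no escape sequences, so reuse the caller's allocation as-is.
        if !source.contains('\\') {
            return source;
        }
        // Slow path: build a new buffer. A real implementation would decode
        // escape sequences here; the decoding itself is elided in this sketch.
        let mut value = String::with_capacity(source.len());
        for ch in source.chars() {
            value.push(ch);
        }
        value.into_boxed_str()
    }

Because the common case (no escapes) never allocates or copies, callers that already own the token text pay nothing extra, which is the effect the commit subject ("Remove unnecessary string cloning from the parser") is after.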