From 0871805c7d2d5ecdc7e2af4b82ee33809bd371d5 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Wed, 7 Feb 2024 17:04:34 -0500 Subject: [PATCH 1/5] Remove unnecessary string cloning from the parser --- crates/ruff_python_parser/src/ascii.rs | 345 +++++++++++++++++++ crates/ruff_python_parser/src/lib.rs | 1 + crates/ruff_python_parser/src/python.lalrpop | 4 +- crates/ruff_python_parser/src/python.rs | 6 +- crates/ruff_python_parser/src/string.rs | 270 ++++++++++----- 5 files changed, 541 insertions(+), 85 deletions(-) create mode 100644 crates/ruff_python_parser/src/ascii.rs diff --git a/crates/ruff_python_parser/src/ascii.rs b/crates/ruff_python_parser/src/ascii.rs new file mode 100644 index 0000000000000..87614dc98a0e8 --- /dev/null +++ b/crates/ruff_python_parser/src/ascii.rs @@ -0,0 +1,345 @@ +#![allow( + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_ptr_alignment, + clippy::inline_always, + clippy::ptr_as_ptr, + unsafe_code +)] + +//! Source: + +// The following ~400 lines of code exists for exactly one purpose, which is +// to optimize this code: +// +// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len()) +// +// Yes... Overengineered is a word that comes to mind, but this is effectively +// a very similar problem to memchr, and virtually nobody has been able to +// resist optimizing the crap out of that (except for perhaps the BSD and MUSL +// folks). In particular, this routine makes a very common case (ASCII) very +// fast, which seems worth it. We do stop short of adding AVX variants of the +// code below in order to retain our sanity and also to avoid needing to deal +// with runtime target feature detection. RESIST! +// +// In order to understand the SIMD version below, it would be good to read this +// comment describing how my memchr routine works: +// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106 +// +// The primary difference with memchr is that for ASCII, we can do a bit less +// work. In particular, we don't need to detect the presence of a specific +// byte, but rather, whether any byte has its most significant bit set. That +// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to +// _mm_movemask_epi8. + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const USIZE_BYTES: usize = core::mem::size_of::(); +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; + +// This is a mask where the most significant bit of each byte in the usize +// is set. We test this bit to determine whether a character is ASCII or not. +// Namely, a single byte is regarded as an ASCII codepoint if and only if it's +// most significant bit is not set. +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const ASCII_MASK_U64: u64 = 0x8080_8080_8080_8080; +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +const ASCII_MASK: usize = ASCII_MASK_U64 as usize; + +/// Returns the index of the first non ASCII byte in the given slice. +/// +/// If slice only contains ASCII bytes, then the length of the slice is +/// returned. 
+pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize { + #[cfg(any(miri, not(target_arch = "x86_64")))] + { + first_non_ascii_byte_fallback(slice) + } + + #[cfg(all(not(miri), target_arch = "x86_64"))] + { + first_non_ascii_byte_sse2(slice) + } +} + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { + let align = USIZE_BYTES - 1; + let start_ptr = slice.as_ptr(); + let end_ptr = slice[slice.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if slice.len() < USIZE_BYTES { + return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); + } + + let chunk = read_unaligned_usize(ptr); + let mask = chunk & ASCII_MASK; + if mask != 0 { + return first_non_ascii_byte_mask(mask); + } + + ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); + if slice.len() >= FALLBACK_LOOP_SIZE { + while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *ptr.cast::(); + let b = *ptr_add(ptr, USIZE_BYTES).cast::(); + if (a | b) & ASCII_MASK != 0 { + // What a kludge. We wrap the position finding code into + // a non-inlineable function, which makes the codegen in + // the tight loop above a bit better by avoiding a + // couple extra movs. We pay for it by two additional + // stores, but only in the case of finding a non-ASCII + // byte. + #[inline(never)] + unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize { + let a = *ptr.cast::(); + let b = *ptr_add(ptr, USIZE_BYTES).cast::(); + + let mut at = sub(ptr, start_ptr); + let maska = a & ASCII_MASK; + if maska != 0 { + return at + first_non_ascii_byte_mask(maska); + } + + at += USIZE_BYTES; + let maskb = b & ASCII_MASK; + debug_assert!(maskb != 0); + at + first_non_ascii_byte_mask(maskb) + } + return findpos(start_ptr, ptr); + } + ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); + } + } + first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) + } +} + +#[cfg(all(not(miri), target_arch = "x86_64"))] +fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { + use core::arch::x86_64::{ + __m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, + }; + + const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); + const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; + + let start_ptr = slice.as_ptr(); + let end_ptr = slice[slice.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if slice.len() < VECTOR_SIZE { + return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); + } + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(chunk); + if mask != 0 { + return mask.trailing_zeros() as usize; + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); + if slice.len() >= VECTOR_LOOP_SIZE { + while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + + let or1 = _mm_or_si128(a, b); + let or2 = _mm_or_si128(c, d); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask = 
_mm_movemask_epi8(a); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(b); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(c); + if mask != 0 { + return at + mask.trailing_zeros() as usize; + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(d); + debug_assert!(mask != 0); + return at + mask.trailing_zeros() as usize; + } + ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); + } + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(chunk); + if mask != 0 { + return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; + } + ptr = ptr.add(VECTOR_SIZE); + } + first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) + } +} + +#[inline(always)] +unsafe fn first_non_ascii_byte_slow( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, +) -> usize { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr < end_ptr { + if *ptr > 0x7F { + return sub(ptr, start_ptr); + } + ptr = ptr.offset(1); + } + sub(end_ptr, start_ptr) +} + +/// Compute the position of the first ASCII byte in the given mask. +/// +/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is +/// 8 contiguous bytes of the slice being checked where *at least* one of those +/// bytes is not an ASCII byte. +/// +/// The position returned is always in the inclusive range [0, 7]. +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +fn first_non_ascii_byte_mask(mask: usize) -> usize { + #[cfg(target_endian = "little")] + { + mask.trailing_zeros() as usize / 8 + } + #[cfg(target_endian = "big")] + { + mask.leading_zeros() as usize / 8 + } +} + +/// Increment the given pointer by the given amount. +unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { + debug_assert!(amt < ::core::isize::MAX as usize); + ptr.add(amt) +} + +/// Decrement the given pointer by the given amount. +unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { + debug_assert!(amt < ::core::isize::MAX as usize); + ptr.offset((amt as isize).wrapping_neg()) +} + +#[cfg(any(test, miri, not(target_arch = "x86_64")))] +unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { + use core::ptr; + + let mut n: usize = 0; + ptr::copy_nonoverlapping(ptr, std::ptr::addr_of_mut!(n) as *mut u8, USIZE_BYTES); + n +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} + +#[cfg(test)] +mod tests { + use super::*; + + // Our testing approach here is to try and exhaustively test every case. + // This includes the position at which a non-ASCII byte occurs in addition + // to the alignment of the slice that we're searching. 
+ + #[test] + fn positive_fallback_forward() { + for i in 0..517 { + let s = "a".repeat(i); + assert_eq!( + i, + first_non_ascii_byte_fallback(s.as_bytes()), + "i: {:?}, len: {:?}, s: {:?}", + i, + s.len(), + s + ); + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] + fn positive_sse2_forward() { + for i in 0..517 { + let b = "a".repeat(i).into_bytes(); + assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); + } + } + + #[test] + #[cfg(not(miri))] + fn negative_fallback_forward() { + for i in 0..517 { + for align in 0..65 { + let mut s = "a".repeat(i); + s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); + let s = s.get(align..).unwrap_or(""); + assert_eq!( + i.saturating_sub(align), + first_non_ascii_byte_fallback(s.as_bytes()), + "i: {:?}, align: {:?}, len: {:?}, s: {:?}", + i, + align, + s.len(), + s + ); + } + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] + fn negative_sse2_forward() { + for i in 0..517 { + for align in 0..65 { + let mut s = "a".repeat(i); + s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); + let s = s.get(align..).unwrap_or(""); + assert_eq!( + i.saturating_sub(align), + first_non_ascii_byte_sse2(s.as_bytes()), + "i: {:?}, align: {:?}, len: {:?}, s: {:?}", + i, + align, + s.len(), + s + ); + } + } + } +} diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 2f95c684e87d9..074746ec01ab8 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -121,6 +121,7 @@ use crate::lexer::LexResult; mod function; // Skip flattening lexer to distinguish from full ruff_python_parser +mod ascii; mod context; mod invalid; pub mod lexer; diff --git a/crates/ruff_python_parser/src/python.lalrpop b/crates/ruff_python_parser/src/python.lalrpop index 2d628ae74a805..f61ae2c2b4eff 100644 --- a/crates/ruff_python_parser/src/python.lalrpop +++ b/crates/ruff_python_parser/src/python.lalrpop @@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = { StringLiteral: StringType = { =>? { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) } }; @@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = { FStringReplacementField, =>? { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } }; diff --git a/crates/ruff_python_parser/src/python.rs b/crates/ruff_python_parser/src/python.rs index 1372b6e4fb260..95de336aa7614 100644 --- a/crates/ruff_python_parser/src/python.rs +++ b/crates/ruff_python_parser/src/python.rs @@ -1,5 +1,5 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c +// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use ruff_python_ast::{self as ast, Int, IpyEscapeKind}; use crate::{ @@ -36369,7 +36369,7 @@ fn __action217< { { let (source, kind, triple_quoted) = string; - Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?) + Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?) 
} } @@ -36419,7 +36419,7 @@ fn __action220< { { let (source, is_raw, _) = fstring_middle; - Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?) + Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?) } } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 5b15474cf2dd6..124ba688a3a7a 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,8 +1,9 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use ruff_python_ast::{self as ast, Expr}; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; +use ruff_text_size::{Ranged, TextRange, TextSize}; +use crate::ascii::first_non_ascii_byte; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; @@ -32,34 +33,40 @@ impl From for Expr { } } -struct StringParser<'a> { - rest: &'a str, +enum EscapedChar { + Literal(char), + Escape(char), +} + +struct StringParser { + source: Box, + cursor: usize, kind: StringKind, - location: TextSize, + offset: TextSize, range: TextRange, } -impl<'a> StringParser<'a> { - fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { +impl StringParser { + fn new(source: Box, kind: StringKind, offset: TextSize, range: TextRange) -> Self { Self { - rest: source, + source, + cursor: 0, kind, - location: start, + offset, range, } } #[inline] - fn skip_bytes(&mut self, bytes: usize) -> &'a str { - let skipped_str = &self.rest[..bytes]; - self.rest = &self.rest[bytes..]; - self.location += skipped_str.text_len(); + fn skip_bytes(&mut self, bytes: usize) -> &str { + let skipped_str = &self.source[self.cursor..self.cursor + bytes]; + self.cursor += bytes; skipped_str } #[inline] fn get_pos(&self) -> TextSize { - self.location + self.offset + TextSize::try_from(self.cursor).unwrap() } /// Returns the next byte in the string, if there is one. @@ -69,25 +76,23 @@ impl<'a> StringParser<'a> { /// When the next byte is a part of a multi-byte character. #[inline] fn next_byte(&mut self) -> Option { - self.rest.as_bytes().first().map(|&byte| { - self.rest = &self.rest[1..]; - self.location += TextSize::new(1); + self.source[self.cursor..].as_bytes().first().map(|&byte| { + self.cursor += 1; byte }) } #[inline] fn next_char(&mut self) -> Option { - self.rest.chars().next().map(|c| { - self.rest = &self.rest[c.len_utf8()..]; - self.location += c.text_len(); + self.source[self.cursor..].chars().next().map(|c| { + self.cursor += c.len_utf8(); c }) } #[inline] fn peek_byte(&self) -> Option { - self.rest.as_bytes().first().copied() + self.source[self.cursor..].as_bytes().first().copied() } fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { @@ -135,7 +140,7 @@ impl<'a> StringParser<'a> { }; let start_pos = self.get_pos(); - let Some(close_idx) = self.rest.find('}') else { + let Some(close_idx) = self.source[self.cursor..].find('}') else { return Err(LexicalError::new( LexicalErrorType::StringError, self.get_pos(), @@ -149,7 +154,8 @@ impl<'a> StringParser<'a> { .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) } - fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + /// Parse an escaped character, returning the new character. 
+ fn parse_escaped_char(&mut self) -> Result, LexicalError> { let Some(first_char) = self.next_char() else { return Err(LexicalError::new( LexicalErrorType::StringError, @@ -174,13 +180,13 @@ impl<'a> StringParser<'a> { 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, // Special cases where the escape sequence is not a single character - '\n' => return Ok(()), + '\n' => return Ok(None), '\r' => { if self.peek_byte() == Some(b'\n') { self.next_byte(); } - return Ok(()); + return Ok(None); } _ => { if self.kind.is_any_bytes() && !first_char.is_ascii() { @@ -194,21 +200,42 @@ impl<'a> StringParser<'a> { )); } - string.push('\\'); - - first_char + return Ok(Some(EscapedChar::Escape(first_char))); } }; - string.push(new_char); - - Ok(()) + Ok(Some(EscapedChar::Literal(new_char))) } - fn parse_fstring_middle(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { + fn parse_fstring_middle(mut self) -> Result { + // Fast-path: if the f-string doesn't contain any escape sequences, return the literal. + let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else { + return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { + value: self.source, + range: self.range, + })); + }; + + let mut value = String::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(index + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match &self.source.as_bytes()[self.cursor - 1] { + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need increase the location by 2 instead of 1. + b'{' => { + self.offset += TextSize::from(1); + value.push('{'); + } + b'}' => { + self.offset += TextSize::from(1); + value.push('}'); + } // We can encounter a `\` as the last character in a `FStringMiddle` // token which is valid in this context. For example, // @@ -229,71 +256,154 @@ impl<'a> StringParser<'a> { // This is still an invalid escape sequence, but we don't want to // raise a syntax error as is done by the CPython parser. It might // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { - self.parse_escaped_char(&mut value)?; + b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } + } } - // If there are any curly braces inside a `FStringMiddle` token, - // then they were escaped (i.e. `{{` or `}}`). This means that - // we need increase the location by 2 instead of 1. - ch @ ('{' | '}') => { - self.location += ch.text_len(); - value.push(ch); + ch => { + value.push(char::from(*ch)); } - ch => value.push(ch), } + + let Some(next_index) = + memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. 
+ let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + index = next_index; } + Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { value, range: self.range, })) } - fn parse_bytes(&mut self) -> Result { - let mut content = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - self.parse_escaped_char(&mut content)?; - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - content.push(ch); + fn parse_bytes(mut self) -> Result { + let index = first_non_ascii_byte(self.source.as_bytes()); + if index < self.source.len() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.offset + TextSize::try_from(index).unwrap(), + )); + } + + if self.kind.is_raw() { + // For raw strings, no escaping is necessary. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_bytes(), + range: self.range, + })); + } + + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. + return Ok(StringType::Bytes(ast::BytesLiteral { + value: self.source.into_bytes(), + range: self.range, + })); + }; + + // If the string contains escape sequences, we need to parse them. + let mut value = Vec::with_capacity(self.source.len()); + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.extend_from_slice(before.as_bytes()); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c as u8), + Some(EscapedChar::Escape(c)) => { + value.push(b'\\'); + value.push(c as u8); } } + + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.extend_from_slice(rest.as_bytes()); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Bytes(ast::BytesLiteral { - value: content.chars().map(|c| c as u8).collect::>(), + value, range: self.range, })) } - fn parse_string(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); + fn parse_string(mut self) -> Result { if self.kind.is_raw() { - value.push_str(self.skip_bytes(self.rest.len())); - } else { - loop { - let Some(escape_idx) = self.rest.find('\\') else { - value.push_str(self.skip_bytes(self.rest.len())); - break; - }; + // For raw strings, no escaping is necessary. + return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + } - let before_with_slash = self.skip_bytes(escape_idx + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + // If the string doesn't contain any escape sequences, return the owned string. 
+ return Ok(StringType::Str(ast::StringLiteral { + value: self.source, + unicode: self.kind.is_unicode(), + range: self.range, + })); + }; - value.push_str(before); - self.parse_escaped_char(&mut value)?; + // If the string contains escape sequences, we need to parse them. + let mut value = String::with_capacity(self.source.len()); + + loop { + // Add the characters before the escape sequence to the string. + let before_with_slash = self.skip_bytes(escape + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + value.push_str(before); + + // Add the escaped character to the string. + match self.parse_escaped_char()? { + None => {} + Some(EscapedChar::Literal(c)) => value.push(c), + Some(EscapedChar::Escape(c)) => { + value.push('\\'); + value.push(c); + } } + + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + else { + // Add the rest of the string to the value. + let rest = &self.source[self.cursor..]; + value.push_str(rest); + break; + }; + + // Update the position of the next escape sequence. + escape = next_escape; } + Ok(StringType::Str(ast::StringLiteral { value: value.into_boxed_str(), unicode: self.kind.is_unicode(), @@ -301,7 +411,7 @@ impl<'a> StringParser<'a> { })) } - fn parse(&mut self) -> Result { + fn parse(self) -> Result { if self.kind.is_any_bytes() { self.parse_bytes() } else { @@ -311,7 +421,7 @@ impl<'a> StringParser<'a> { } pub(crate) fn parse_string_literal( - source: &str, + source: Box, kind: StringKind, triple_quoted: bool, range: TextRange, @@ -327,7 +437,7 @@ pub(crate) fn parse_string_literal( } pub(crate) fn parse_fstring_literal_element( - source: &str, + source: Box, is_raw: bool, range: TextRange, ) -> Result { @@ -360,7 +470,7 @@ pub(crate) fn concatenated_strings( if has_bytes && byte_literal_count < strings.len() { return Err(LexicalError::new( LexicalErrorType::OtherError( - "cannot mix bytes and nonbytes literals" + "cannot mix bytes and non-bytes literals" .to_string() .into_boxed_str(), ), From 46800aeadb9da78dd58a10ee44675c0510b2c115 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 16:25:48 -0500 Subject: [PATCH 2/5] Box other strings --- Cargo.lock | 13 +- Cargo.toml | 1 + .../rules/hardcoded_bind_all_interfaces.rs | 4 +- crates/ruff_linter/src/rules/flynt/helpers.rs | 4 +- crates/ruff_python_ast/src/comparable.rs | 2 +- crates/ruff_python_ast/src/nodes.rs | 8 +- crates/ruff_python_parser/Cargo.toml | 5 +- crates/ruff_python_parser/src/ascii.rs | 345 ------------------ crates/ruff_python_parser/src/lib.rs | 5 +- crates/ruff_python_parser/src/string.rs | 20 +- 10 files changed, 30 insertions(+), 377 deletions(-) delete mode 100644 crates/ruff_python_parser/src/ascii.rs diff --git a/Cargo.lock b/Cargo.lock index 5703ae61ca9f5..97511968ff8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "bstr" -version = "1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.3.9", + "regex-automata 0.4.3", "serde", ] @@ -1921,12 +1921,6 @@ dependencies = [ "regex-syntax 0.6.29", ] -[[package]] -name = "regex-automata" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" - [[package]] name = "regex-automata" version = "0.4.3" @@ -2342,6 +2336,7 @@ version = "0.0.0" dependencies = [ "anyhow", "bitflags 2.4.1", + "bstr", "insta", "is-macro", "itertools 0.12.1", diff --git a/Cargo.toml b/Cargo.toml index a783bbebef3e2..c4f4492c18e80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ argfile = { version = "0.1.6" } assert_cmd = { version = "2.0.13" } bincode = { version = "1.3.3" } bitflags = { version = "2.4.1" } +bstr = { version = "1.9.0" } cachedir = { version = "0.3.1" } chrono = { version = "0.4.33", default-features = false, features = ["clock"] } clap = { version = "4.4.18", features = ["derive"] } diff --git a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs index 38295b71316a2..0e4301ee44c07 100644 --- a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs +++ b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs @@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces { pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) { let is_bind_all_interface = match string { StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0", - StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0", + StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => { + &**value == "0.0.0.0" + } StringLike::BytesLiteral(_) => return, }; diff --git a/crates/ruff_linter/src/rules/flynt/helpers.rs b/crates/ruff_linter/src/rules/flynt/helpers.rs index 7a6af204d13f9..640f922d6faa2 100644 --- a/crates/ruff_linter/src/rules/flynt/helpers.rs +++ b/crates/ruff_linter/src/rules/flynt/helpers.rs @@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement { /// Convert a string to a [`ast::FStringElement::Literal`]. 
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement { ast::FStringElement::Literal(ast::FStringLiteralElement { - value: s.to_owned(), + value: s.to_string().into_boxed_str(), range: TextRange::default(), }) } @@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option { match expr { Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => { Some(ast::FStringElement::Literal(ast::FStringLiteralElement { - value: value.to_string(), + value: value.to_string().into_boxed_str(), range: *range, })) } diff --git a/crates/ruff_python_ast/src/comparable.rs b/crates/ruff_python_ast/src/comparable.rs index bc6327f01dca0..344bb615ce95e 100644 --- a/crates/ruff_python_ast/src/comparable.rs +++ b/crates/ruff_python_ast/src/comparable.rs @@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> { impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> { fn from(bytes_literal: &'a ast::BytesLiteral) -> Self { Self { - value: bytes_literal.value.as_slice(), + value: &bytes_literal.value, } } } diff --git a/crates/ruff_python_ast/src/nodes.rs b/crates/ruff_python_ast/src/nodes.rs index cfb8355c69f05..b6581eef40524 100644 --- a/crates/ruff_python_ast/src/nodes.rs +++ b/crates/ruff_python_ast/src/nodes.rs @@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement { #[derive(Clone, Debug, PartialEq)] pub struct FStringLiteralElement { pub range: TextRange, - pub value: String, + pub value: Box, } impl Ranged for FStringLiteralElement { @@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement { type Target = str; fn deref(&self) -> &Self::Target { - self.value.as_str() + &self.value } } @@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner { #[derive(Clone, Debug, Default, PartialEq)] pub struct BytesLiteral { pub range: TextRange, - pub value: Vec, + pub value: Box<[u8]>, } impl Ranged for BytesLiteral { @@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral { type Target = [u8]; fn deref(&self) -> &Self::Target { - self.value.as_slice() + &self.value } } diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 6bcdf6c902172..886bb07fec0b6 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } bitflags = { workspace = true } +bstr = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { workspace = true, default-features = false } memchr = { workspace = true } -unicode-ident = { workspace = true } -unicode_names2 = { workspace = true } rustc-hash = { workspace = true } static_assertions = { workspace = true } +unicode-ident = { workspace = true } +unicode_names2 = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/ascii.rs b/crates/ruff_python_parser/src/ascii.rs deleted file mode 100644 index 87614dc98a0e8..0000000000000 --- a/crates/ruff_python_parser/src/ascii.rs +++ /dev/null @@ -1,345 +0,0 @@ -#![allow( - clippy::cast_possible_truncation, - clippy::cast_possible_wrap, - clippy::cast_ptr_alignment, - clippy::inline_always, - clippy::ptr_as_ptr, - unsafe_code -)] - -//! Source: - -// The following ~400 lines of code exists for exactly one purpose, which is -// to optimize this code: -// -// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len()) -// -// Yes... 
Overengineered is a word that comes to mind, but this is effectively -// a very similar problem to memchr, and virtually nobody has been able to -// resist optimizing the crap out of that (except for perhaps the BSD and MUSL -// folks). In particular, this routine makes a very common case (ASCII) very -// fast, which seems worth it. We do stop short of adding AVX variants of the -// code below in order to retain our sanity and also to avoid needing to deal -// with runtime target feature detection. RESIST! -// -// In order to understand the SIMD version below, it would be good to read this -// comment describing how my memchr routine works: -// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106 -// -// The primary difference with memchr is that for ASCII, we can do a bit less -// work. In particular, we don't need to detect the presence of a specific -// byte, but rather, whether any byte has its most significant bit set. That -// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to -// _mm_movemask_epi8. - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const USIZE_BYTES: usize = core::mem::size_of::(); -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; - -// This is a mask where the most significant bit of each byte in the usize -// is set. We test this bit to determine whether a character is ASCII or not. -// Namely, a single byte is regarded as an ASCII codepoint if and only if it's -// most significant bit is not set. -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const ASCII_MASK_U64: u64 = 0x8080_8080_8080_8080; -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -const ASCII_MASK: usize = ASCII_MASK_U64 as usize; - -/// Returns the index of the first non ASCII byte in the given slice. -/// -/// If slice only contains ASCII bytes, then the length of the slice is -/// returned. -pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize { - #[cfg(any(miri, not(target_arch = "x86_64")))] - { - first_non_ascii_byte_fallback(slice) - } - - #[cfg(all(not(miri), target_arch = "x86_64"))] - { - first_non_ascii_byte_sse2(slice) - } -} - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { - let align = USIZE_BYTES - 1; - let start_ptr = slice.as_ptr(); - let end_ptr = slice[slice.len()..].as_ptr(); - let mut ptr = start_ptr; - - unsafe { - if slice.len() < USIZE_BYTES { - return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); - } - - let chunk = read_unaligned_usize(ptr); - let mask = chunk & ASCII_MASK; - if mask != 0 { - return first_non_ascii_byte_mask(mask); - } - - ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); - debug_assert!(ptr > start_ptr); - debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); - if slice.len() >= FALLBACK_LOOP_SIZE { - while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let a = *ptr.cast::(); - let b = *ptr_add(ptr, USIZE_BYTES).cast::(); - if (a | b) & ASCII_MASK != 0 { - // What a kludge. We wrap the position finding code into - // a non-inlineable function, which makes the codegen in - // the tight loop above a bit better by avoiding a - // couple extra movs. We pay for it by two additional - // stores, but only in the case of finding a non-ASCII - // byte. 
- #[inline(never)] - unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize { - let a = *ptr.cast::(); - let b = *ptr_add(ptr, USIZE_BYTES).cast::(); - - let mut at = sub(ptr, start_ptr); - let maska = a & ASCII_MASK; - if maska != 0 { - return at + first_non_ascii_byte_mask(maska); - } - - at += USIZE_BYTES; - let maskb = b & ASCII_MASK; - debug_assert!(maskb != 0); - at + first_non_ascii_byte_mask(maskb) - } - return findpos(start_ptr, ptr); - } - ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE); - } - } - first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) - } -} - -#[cfg(all(not(miri), target_arch = "x86_64"))] -fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { - use core::arch::x86_64::{ - __m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, - }; - - const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>(); - const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; - const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE; - - let start_ptr = slice.as_ptr(); - let end_ptr = slice[slice.len()..].as_ptr(); - let mut ptr = start_ptr; - - unsafe { - if slice.len() < VECTOR_SIZE { - return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr); - } - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(chunk); - if mask != 0 { - return mask.trailing_zeros() as usize; - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr); - debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr); - if slice.len() >= VECTOR_LOOP_SIZE { - while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - - let or1 = _mm_or_si128(a, b); - let or2 = _mm_or_si128(c, d); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr, start_ptr); - let mask = _mm_movemask_epi8(a); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(b); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(c); - if mask != 0 { - return at + mask.trailing_zeros() as usize; - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(d); - debug_assert!(mask != 0); - return at + mask.trailing_zeros() as usize; - } - ptr = ptr_add(ptr, VECTOR_LOOP_SIZE); - } - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(chunk); - if mask != 0 { - return sub(ptr, start_ptr) + mask.trailing_zeros() as usize; - } - ptr = ptr.add(VECTOR_SIZE); - } - first_non_ascii_byte_slow(start_ptr, end_ptr, ptr) - } -} - -#[inline(always)] -unsafe fn first_non_ascii_byte_slow( - start_ptr: *const u8, - end_ptr: *const u8, - mut ptr: *const u8, -) -> usize { - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr); - - while ptr < end_ptr { - if *ptr > 0x7F { - return sub(ptr, start_ptr); - } - ptr = ptr.offset(1); - } - sub(end_ptr, start_ptr) -} - -/// Compute the position of the first ASCII byte in the given mask. 
-/// -/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is -/// 8 contiguous bytes of the slice being checked where *at least* one of those -/// bytes is not an ASCII byte. -/// -/// The position returned is always in the inclusive range [0, 7]. -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -fn first_non_ascii_byte_mask(mask: usize) -> usize { - #[cfg(target_endian = "little")] - { - mask.trailing_zeros() as usize / 8 - } - #[cfg(target_endian = "big")] - { - mask.leading_zeros() as usize / 8 - } -} - -/// Increment the given pointer by the given amount. -unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { - debug_assert!(amt < ::core::isize::MAX as usize); - ptr.add(amt) -} - -/// Decrement the given pointer by the given amount. -unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { - debug_assert!(amt < ::core::isize::MAX as usize); - ptr.offset((amt as isize).wrapping_neg()) -} - -#[cfg(any(test, miri, not(target_arch = "x86_64")))] -unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { - use core::ptr; - - let mut n: usize = 0; - ptr::copy_nonoverlapping(ptr, std::ptr::addr_of_mut!(n) as *mut u8, USIZE_BYTES); - n -} - -/// Subtract `b` from `a` and return the difference. `a` should be greater than -/// or equal to `b`. -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(a >= b); - (a as usize) - (b as usize) -} - -#[cfg(test)] -mod tests { - use super::*; - - // Our testing approach here is to try and exhaustively test every case. - // This includes the position at which a non-ASCII byte occurs in addition - // to the alignment of the slice that we're searching. - - #[test] - fn positive_fallback_forward() { - for i in 0..517 { - let s = "a".repeat(i); - assert_eq!( - i, - first_non_ascii_byte_fallback(s.as_bytes()), - "i: {:?}, len: {:?}, s: {:?}", - i, - s.len(), - s - ); - } - } - - #[test] - #[cfg(target_arch = "x86_64")] - #[cfg(not(miri))] - fn positive_sse2_forward() { - for i in 0..517 { - let b = "a".repeat(i).into_bytes(); - assert_eq!(b.len(), first_non_ascii_byte_sse2(&b)); - } - } - - #[test] - #[cfg(not(miri))] - fn negative_fallback_forward() { - for i in 0..517 { - for align in 0..65 { - let mut s = "a".repeat(i); - s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); - let s = s.get(align..).unwrap_or(""); - assert_eq!( - i.saturating_sub(align), - first_non_ascii_byte_fallback(s.as_bytes()), - "i: {:?}, align: {:?}, len: {:?}, s: {:?}", - i, - align, - s.len(), - s - ); - } - } - } - - #[test] - #[cfg(target_arch = "x86_64")] - #[cfg(not(miri))] - fn negative_sse2_forward() { - for i in 0..517 { - for align in 0..65 { - let mut s = "a".repeat(i); - s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃"); - let s = s.get(align..).unwrap_or(""); - assert_eq!( - i.saturating_sub(align), - first_non_ascii_byte_sse2(s.as_bytes()), - "i: {:?}, align: {:?}, len: {:?}, s: {:?}", - i, - align, - s.len(), - s - ); - } - } - } -} diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 074746ec01ab8..7c9c5402fb442 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -119,11 +119,10 @@ pub use token::{StringKind, Tok, TokenKind}; use crate::lexer::LexResult; -mod function; -// Skip flattening lexer to distinguish from full ruff_python_parser -mod ascii; mod context; +mod function; mod invalid; +// Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; mod parser; mod soft_keywords; diff --git 
a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 124ba688a3a7a..807b645aef29a 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,9 +1,10 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. +use bstr::ByteSlice; + use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; -use crate::ascii::first_non_ascii_byte; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; @@ -218,9 +219,9 @@ impl StringParser { let mut value = String::with_capacity(self.source.len()); loop { - // Add the characters before the escape sequence to the string. - let before_with_slash = self.skip_bytes(index + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; + // Add the characters before the escape sequence (or curly brace) to the string. + let before_with_slash_or_brace = self.skip_bytes(index + 1); + let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1]; value.push_str(before); // Add the escaped character to the string. @@ -284,14 +285,13 @@ impl StringParser { } Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { - value, + value: value.into_boxed_str(), range: self.range, })) } fn parse_bytes(mut self) -> Result { - let index = first_non_ascii_byte(self.source.as_bytes()); - if index < self.source.len() { + if let Some(index) = self.source.as_bytes().find_non_ascii_byte() { return Err(LexicalError::new( LexicalErrorType::OtherError( "bytes can only contain ASCII literal characters" @@ -305,7 +305,7 @@ impl StringParser { if self.kind.is_raw() { // For raw strings, no escaping is necessary. return Ok(StringType::Bytes(ast::BytesLiteral { - value: self.source.into_bytes(), + value: self.source.into_boxed_bytes(), range: self.range, })); } @@ -313,7 +313,7 @@ impl StringParser { let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. 
return Ok(StringType::Bytes(ast::BytesLiteral { - value: self.source.into_bytes(), + value: self.source.into_boxed_bytes(), range: self.range, })); }; @@ -349,7 +349,7 @@ impl StringParser { } Ok(StringType::Bytes(ast::BytesLiteral { - value, + value: value.into_boxed_slice(), range: self.range, })) } From 5db7e35e8a9e421dd9d39e1cb42a8445be39d596 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 18:05:51 -0500 Subject: [PATCH 3/5] Use shared finder --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/string.rs | 12 ++++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97511968ff8c4..ba459f3f1cf23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2343,6 +2343,7 @@ dependencies = [ "lalrpop", "lalrpop-util", "memchr", + "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 886bb07fec0b6..e1291ab6a2e6c 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -28,6 +28,7 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } +once_cell = "1.19.0" [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 807b645aef29a..0451604040882 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,6 +1,8 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use bstr::ByteSlice; +use memchr::memmem; +use once_cell::sync::Lazy; use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -8,6 +10,8 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; +const BACKSLASH_FINDER: Lazy = Lazy::new(|| memmem::Finder::new(b"\\")); + pub(crate) enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), @@ -310,7 +314,7 @@ impl StringParser { })); } - let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Bytes(ast::BytesLiteral { value: self.source.into_boxed_bytes(), @@ -336,7 +340,7 @@ impl StringParser { } } - let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. let rest = &self.source[self.cursor..]; @@ -364,7 +368,7 @@ impl StringParser { })); } - let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { + let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Str(ast::StringLiteral { value: self.source, @@ -392,7 +396,7 @@ impl StringParser { } } - let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) + let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. 
let rest = &self.source[self.cursor..]; From 09be9b2c26d7f68714606128d22bcfdf0a26d2d3 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 23:03:11 -0500 Subject: [PATCH 4/5] Add benches --- Cargo.lock | 4 + crates/ruff_python_parser/Cargo.toml | 13 + crates/ruff_python_parser/benches/string.rs | 93 +++ crates/ruff_python_parser/src/lib.rs | 3 +- crates/ruff_python_parser/src/old_string.rs | 820 ++++++++++++++++++++ crates/ruff_python_parser/src/string.rs | 21 +- 6 files changed, 940 insertions(+), 14 deletions(-) create mode 100644 crates/ruff_python_parser/benches/string.rs create mode 100644 crates/ruff_python_parser/src/old_string.rs diff --git a/Cargo.lock b/Cargo.lock index ba459f3f1cf23..ae0b8f6a15259 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2337,17 +2337,21 @@ dependencies = [ "anyhow", "bitflags 2.4.1", "bstr", + "codspeed-criterion-compat", + "criterion", "insta", "is-macro", "itertools 0.12.1", "lalrpop", "lalrpop-util", "memchr", + "mimalloc", "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", "static_assertions", + "tikv-jemallocator", "tiny-keccak", "unicode-ident", "unicode_names2", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index e1291ab6a2e6c..075cc4ef8a00a 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -12,6 +12,11 @@ license = { workspace = true } build = "build.rs" [lib] +bench = false + +[[bench]] +name = "string" +harness = false [dependencies] ruff_python_ast = { path = "../ruff_python_ast" } @@ -29,6 +34,14 @@ static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } once_cell = "1.19.0" +criterion = { workspace = true, default-features = false } +codspeed-criterion-compat = { workspace = true, default-features = false, optional = true} + +[target.'cfg(target_os = "windows")'.dev-dependencies] +mimalloc = { workspace = true } + +[target.'cfg(all(not(target_os = "windows"), not(target_os = "openbsd"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "powerpc64")))'.dev-dependencies] +tikv-jemallocator = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/benches/string.rs b/crates/ruff_python_parser/benches/string.rs new file mode 100644 index 0000000000000..8e1a79c312af8 --- /dev/null +++ b/crates/ruff_python_parser/benches/string.rs @@ -0,0 +1,93 @@ +use criterion::{ + black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, Criterion, +}; +use ruff_python_parser::StringKind; +use ruff_text_size::TextRange; + +#[cfg(target_os = "windows")] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + +#[cfg(all( + not(target_os = "windows"), + not(target_os = "openbsd"), + any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "powerpc64" + ) +))] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +fn benchmark_parser(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("parse"); + + let s = "\"\"\"Validate length based{ on BIN for major brands: + https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_(IIN)\"\"\""; + + // group.bench_with_input("new_string", &s, |b, &s| { + // b.iter_batched( + // || s.to_string().into_boxed_str(), + // |data| { + // ruff_python_parser::string::parse_string_literal( + // black_box(data), + // StringKind::String, + // true, + // 
TextRange::default(), + // ) + // }, + // BatchSize::SmallInput, + // ); + // }); + // + // group.bench_function("old_string", |b| { + // b.iter_batched( + // || s.to_string(), + // |data| { + // ruff_python_parser::old_string::parse_string_literal( + // black_box(&data), + // StringKind::String, + // true, + // TextRange::default(), + // ) + // }, + // BatchSize::SmallInput, + // ); + // }); + + let s = "Item {i+1}"; + + group.bench_with_input("new_fstring", &s, |b, &s| { + b.iter_batched( + || s.to_string().into_boxed_str(), + |data| { + ruff_python_parser::string::parse_fstring_literal_element( + black_box(data), + true, + TextRange::default(), + ) + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("old_fstring", |b| { + b.iter_batched( + || s.to_string(), + |data| { + ruff_python_parser::old_string::parse_fstring_literal_element( + black_box(&data), + true, + TextRange::default(), + ) + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +criterion_group!(parser, benchmark_parser); +criterion_main!(parser); diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 7c9c5402fb442..8e855e5d92dc6 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -124,9 +124,10 @@ mod function; mod invalid; // Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; +pub mod old_string; mod parser; mod soft_keywords; -mod string; +pub mod string; mod token; mod token_source; pub mod typing; diff --git a/crates/ruff_python_parser/src/old_string.rs b/crates/ruff_python_parser/src/old_string.rs new file mode 100644 index 0000000000000..54f2dece59198 --- /dev/null +++ b/crates/ruff_python_parser/src/old_string.rs @@ -0,0 +1,820 @@ +//! Parsing of string literals, bytes literals, and implicit string concatenation. + +use ruff_python_ast::{self as ast, Expr}; +use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; + +use crate::lexer::{LexicalError, LexicalErrorType}; +use crate::string::FStringError; +use crate::token::{StringKind, Tok}; + +pub enum StringType { + Str(ast::StringLiteral), + Bytes(ast::BytesLiteral), + FString(ast::FString), +} + +impl Ranged for StringType { + fn range(&self) -> TextRange { + match self { + Self::Str(node) => node.range(), + Self::Bytes(node) => node.range(), + Self::FString(node) => node.range(), + } + } +} + +impl From for Expr { + fn from(string: StringType) -> Self { + match string { + StringType::Str(node) => Expr::from(node), + StringType::Bytes(node) => Expr::from(node), + StringType::FString(node) => Expr::from(node), + } + } +} + +struct StringParser<'a> { + rest: &'a str, + kind: StringKind, + location: TextSize, + range: TextRange, +} + +impl<'a> StringParser<'a> { + fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { + Self { + rest: source, + kind, + location: start, + range, + } + } + + #[inline] + fn skip_bytes(&mut self, bytes: usize) -> &'a str { + let skipped_str = &self.rest[..bytes]; + self.rest = &self.rest[bytes..]; + self.location += skipped_str.text_len(); + skipped_str + } + + #[inline] + fn get_pos(&self) -> TextSize { + self.location + } + + /// Returns the next byte in the string, if there is one. + /// + /// # Panics + /// + /// When the next byte is a part of a multi-byte character. 
+ #[inline] + fn next_byte(&mut self) -> Option { + self.rest.as_bytes().first().map(|&byte| { + self.rest = &self.rest[1..]; + self.location += TextSize::new(1); + byte + }) + } + + #[inline] + fn next_char(&mut self) -> Option { + self.rest.chars().next().map(|c| { + self.rest = &self.rest[c.len_utf8()..]; + self.location += c.text_len(); + c + }) + } + + #[inline] + fn peek_byte(&self) -> Option { + self.rest.as_bytes().first().copied() + } + + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { + let mut p: u32 = 0u32; + let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); + for i in 1..=literal_number { + match self.next_char() { + Some(c) => match c.to_digit(16) { + Some(d) => p += d << ((literal_number - i) * 4), + None => return Err(unicode_error), + }, + None => return Err(unicode_error), + } + } + match p { + 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), + _ => std::char::from_u32(p).ok_or(unicode_error), + } + } + + fn parse_octet(&mut self, o: u8) -> char { + let mut radix_bytes = [o, 0, 0]; + let mut len = 1; + + while len < 3 { + let Some(b'0'..=b'7') = self.peek_byte() else { + break; + }; + + radix_bytes[len] = self.next_byte().unwrap(); + len += 1; + } + + // OK because radix_bytes is always going to be in the ASCII range. + let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes"); + let value = u32::from_str_radix(radix_str, 8).unwrap(); + char::from_u32(value).unwrap() + } + + fn parse_unicode_name(&mut self) -> Result { + let start_pos = self.get_pos(); + + let Some('{') = self.next_char() else { + return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); + }; + + let start_pos = self.get_pos(); + let Some(close_idx) = self.rest.find('}') else { + return Err(LexicalError::new( + LexicalErrorType::StringError, + self.get_pos(), + )); + }; + + let name_and_ending = self.skip_bytes(close_idx + 1); + let name = &name_and_ending[..name_and_ending.len() - 1]; + + unicode_names2::character(name) + .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) + } + + fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + let Some(first_char) = self.next_char() else { + return Err(LexicalError::new( + LexicalErrorType::StringError, + self.get_pos(), + )); + }; + + let new_char = match first_char { + '\\' => '\\', + '\'' => '\'', + '\"' => '"', + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + o @ '0'..='7' => self.parse_octet(o as u8), + 'x' => self.parse_unicode_literal(2)?, + 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, + 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, + 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, + // Special cases where the escape sequence is not a single character + '\n' => return Ok(()), + '\r' => { + if self.peek_byte() == Some(b'\n') { + self.next_byte(); + } + + return Ok(()); + } + _ => { + if self.kind.is_any_bytes() && !first_char.is_ascii() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.get_pos(), + )); + } + + string.push('\\'); + + first_char + } + }; + + string.push(new_char); + + Ok(()) + } + + fn parse_fstring_middle(&mut self) -> Result { + let mut value = String::with_capacity(self.rest.len()); + while let Some(ch) = self.next_char() { + match ch { + // We 
can encounter a `\` as the last character in a `FStringMiddle` + // token which is valid in this context. For example, + // + // ```python + // f"\{foo} \{bar:\}" + // # ^ ^^ ^ + // ``` + // + // Here, the `FStringMiddle` token content will be "\" and " \" + // which is invalid if we look at the content in isolation: + // + // ```python + // "\" + // ``` + // + // However, the content is syntactically valid in the context of + // the f-string because it's a substring of the entire f-string. + // This is still an invalid escape sequence, but we don't want to + // raise a syntax error as is done by the CPython parser. It might + // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas + '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + self.parse_escaped_char(&mut value)?; + } + // If there are any curly braces inside a `FStringMiddle` token, + // then they were escaped (i.e. `{{` or `}}`). This means that + // we need increase the location by 2 instead of 1. + ch @ ('{' | '}') => { + self.location += ch.text_len(); + value.push(ch); + } + ch => value.push(ch), + } + } + Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { + value: value.into_boxed_str(), + range: self.range, + })) + } + + fn parse_bytes(&mut self) -> Result { + let mut content = String::with_capacity(self.rest.len()); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + self.parse_escaped_char(&mut content)?; + } + ch => { + if !ch.is_ascii() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters" + .to_string() + .into_boxed_str(), + ), + self.get_pos(), + )); + } + content.push(ch); + } + } + } + Ok(StringType::Bytes(ast::BytesLiteral { + value: content + .chars() + .map(|c| c as u8) + .collect::>() + .into_boxed_slice(), + range: self.range, + })) + } + + fn parse_string(&mut self) -> Result { + let mut value = String::with_capacity(self.rest.len()); + if self.kind.is_raw() { + value.push_str(self.skip_bytes(self.rest.len())); + } else { + loop { + let Some(escape_idx) = self.rest.find('\\') else { + value.push_str(self.skip_bytes(self.rest.len())); + break; + }; + + let before_with_slash = self.skip_bytes(escape_idx + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + + value.push_str(before); + self.parse_escaped_char(&mut value)?; + } + } + Ok(StringType::Str(ast::StringLiteral { + value: value.into_boxed_str(), + unicode: self.kind.is_unicode(), + range: self.range, + })) + } + + fn parse(&mut self) -> Result { + if self.kind.is_any_bytes() { + self.parse_bytes() + } else { + self.parse_string() + } + } +} + +pub fn parse_string_literal( + source: &str, + kind: StringKind, + triple_quoted: bool, + range: TextRange, +) -> Result { + let start_location = range.start() + + kind.prefix_len() + + if triple_quoted { + TextSize::from(3) + } else { + TextSize::from(1) + }; + StringParser::new(source, kind, start_location, range).parse() +} + +pub fn parse_fstring_literal_element( + source: &str, + is_raw: bool, + range: TextRange, +) -> Result { + let kind = if is_raw { + StringKind::RawString + } else { + StringKind::String + }; + StringParser::new(source, kind, range.start(), range).parse_fstring_middle() +} + +pub(crate) fn concatenated_strings( + strings: Vec, + range: TextRange, +) -> Result { + #[cfg(debug_assertions)] + debug_assert!(strings.len() > 1); + + let mut has_fstring = false; + let mut byte_literal_count = 0; + for string in 
&strings { + match string { + StringType::FString(_) => has_fstring = true, + StringType::Bytes(_) => byte_literal_count += 1, + StringType::Str(_) => {} + } + } + let has_bytes = byte_literal_count > 0; + + if has_bytes && byte_literal_count < strings.len() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "cannot mix bytes and nonbytes literals" + .to_string() + .into_boxed_str(), + ), + range.start(), + )); + } + + if has_bytes { + let mut values = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::Bytes(value) => values.push(value), + _ => unreachable!("Unexpected non-bytes literal."), + } + } + return Ok(Expr::from(ast::ExprBytesLiteral { + value: ast::BytesLiteralValue::concatenated(values), + range, + })); + } + + if !has_fstring { + let mut values = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::Str(value) => values.push(value), + _ => unreachable!("Unexpected non-string literal."), + } + } + return Ok(Expr::from(ast::ExprStringLiteral { + value: ast::StringLiteralValue::concatenated(values), + range, + })); + } + + let mut parts = Vec::with_capacity(strings.len()); + for string in strings { + match string { + StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)), + StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)), + StringType::Bytes(_) => unreachable!("Unexpected bytes literal."), + } + } + + Ok(ast::ExprFString { + value: ast::FStringValue::concatenated(parts), + range, + } + .into()) +} + +/// Represents the different types of errors that can occur during parsing of an f-string. +#[derive(Copy, Debug, Clone, PartialEq)] +pub enum FStringErrorType { + /// Expected a right brace after an opened left brace. + UnclosedLbrace, + /// An invalid conversion flag was encountered. + InvalidConversionFlag, + /// A single right brace was encountered. + SingleRbrace, + /// Unterminated string. + UnterminatedString, + /// Unterminated triple-quoted string. + UnterminatedTripleQuotedString, + // TODO(dhruvmanila): The parser can't catch all cases of this error, but + // wherever it can, we'll display the correct error message. + /// A lambda expression without parentheses was encountered. 
+ LambdaWithoutParentheses, +} + +impl std::fmt::Display for FStringErrorType { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use FStringErrorType::{ + InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace, + UnterminatedString, UnterminatedTripleQuotedString, + }; + match self { + UnclosedLbrace => write!(f, "expecting '}}'"), + InvalidConversionFlag => write!(f, "invalid conversion character"), + SingleRbrace => write!(f, "single '}}' is not allowed"), + UnterminatedString => write!(f, "unterminated string"), + UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"), + LambdaWithoutParentheses => { + write!(f, "lambda expressions are not allowed without parentheses") + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::lexer::LexicalErrorType; + use crate::parser::parse_suite; + use crate::{ParseErrorType, Suite}; + + use super::*; + + const WINDOWS_EOL: &str = "\r\n"; + const MAC_EOL: &str = "\r"; + const UNIX_EOL: &str = "\n"; + + fn string_parser_escaped_eol(eol: &str) -> Suite { + let source = format!(r"'text \{eol}more text'"); + parse_suite(&source).unwrap() + } + + #[test] + fn test_string_parser_escaped_unix_eol() { + let parse_ast = string_parser_escaped_eol(UNIX_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_string_parser_escaped_mac_eol() { + let parse_ast = string_parser_escaped_eol(MAC_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_string_parser_escaped_windows_eol() { + let parse_ast = string_parser_escaped_eol(WINDOWS_EOL); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring() { + let source = r#"f"{a}{ b }{{foo}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_spec() { + let source = r#"f"{foo:{spec}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_not_nested_spec() { + let source = r#"f"{foo:spec}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_empty_fstring() { + insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap()); + } + + #[test] + fn test_fstring_parse_self_documenting_base() { + let source = r#"f"{user=}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_self_documenting_base_more() { + let source = r#"f"mix {user=} with text and {second=}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_self_documenting_format() { + let source = r#"f"{user=:>10}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + fn parse_fstring_error(source: &str) -> FStringErrorType { + parse_suite(source) + .map_err(|e| match e.error { + ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e, + e => unreachable!("Expected FStringError: {:?}", e), + }) + .expect_err("Expected error") + } + + #[test] + fn test_parse_invalid_fstring() { + use FStringErrorType::{InvalidConversionFlag, LambdaWithoutParentheses}; + + assert_eq!(parse_fstring_error(r#"f"{5!x}""#), InvalidConversionFlag); + assert_eq!( + parse_fstring_error("f'{lambda x:{x}}'"), + LambdaWithoutParentheses + ); + assert_eq!( + parse_fstring_error("f'{lambda x: 
{x}}'"), + LambdaWithoutParentheses + ); + assert!(parse_suite(r#"f"{class}""#,).is_err()); + } + + #[test] + fn test_parse_fstring_not_equals() { + let source = r#"f"{1 != 2}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_equals() { + let source = r#"f"{42 == 42}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_self_doc_prec_space() { + let source = r#"f"{x =}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_self_doc_trailing_space() { + let source = r#"f"{x= }""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_yield_expr() { + let source = r#"f"{yield}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_string_concat() { + let source = "'Hello ' 'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_string_concat_1() { + let source = "'Hello ' u'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_string_concat_2() { + let source = "u'Hello ' 'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_1() { + let source = "'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_2() { + let source = "'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_3() { + let source = "'Hello ' f'world{\"!\"}'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_f_string_concat_4() { + let source = "'Hello ' f'world{\"!\"}' 'again!'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_f_string_concat_1() { + let source = "u'Hello ' f'world'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_u_f_string_concat_2() { + let source = "u'Hello ' f'world' '!'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_string_triple_quotes_with_kind() { + let source = "u'''Hello, world!'''"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_single_quoted_byte() { + // single quote + let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f 
!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_double_quoted_byte() { + // double quote + let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_char_in_byte_literal() { + // backslash does not escape + let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_1() { + let source = r"rb'\x1z'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_2() { + let source = r"rb'\\'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_octet() { + let source = r"b'\43a\4\1234'"; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_newline() { + let source = r#"f"\n{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_constant_range() { + let source = r#"f"aaa{bbb}ccc{ddd}eee""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_unescaped_newline() { + let source = r#"f""" +{x}""""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_character() { + let source = r#"f"\\{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_fstring() { + let source = r#"rf"{x}""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_triple_quoted_raw_fstring() { + let source = r#"rf"""{x}""""#; + let parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_line_continuation() { + let source = r#"rf"\ +{x}""#; + let 
parse_ast = parse_suite(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_string_spec() { + let source = r#"f"{foo:{''}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_concatenation_string_spec() { + let source = r#"f"{foo:{'' ''}}""#; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + /// + #[test] + fn test_dont_panic_on_8_in_octal_escape() { + let source = r"bold = '\038[1m'"; + let parse_ast = parse_suite(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + macro_rules! test_aliases_parse { + ($($name:ident: $alias:expr,)*) => { + $( + #[test] + fn $name() { + let source = format!(r#""\N{{{0}}}""#, $alias); + let parse_ast = parse_suite(&source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + )* + } + } + + test_aliases_parse! { + test_backspace_alias: "BACKSPACE", + test_bell_alias: "BEL", + test_carriage_return_alias: "CARRIAGE RETURN", + test_delete_alias: "DELETE", + test_escape_alias: "ESCAPE", + test_form_feed_alias: "FORM FEED", + test_hts_alias: "HTS", + test_character_tabulation_with_justification_alias: "CHARACTER TABULATION WITH JUSTIFICATION", + } +} diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 0451604040882..79d423bfaffb7 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -1,8 +1,6 @@ //! Parsing of string literals, bytes literals, and implicit string concatenation. use bstr::ByteSlice; -use memchr::memmem; -use once_cell::sync::Lazy; use ruff_python_ast::{self as ast, Expr}; use ruff_text_size::{Ranged, TextRange, TextSize}; @@ -10,9 +8,7 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; -const BACKSLASH_FINDER: Lazy = Lazy::new(|| memmem::Finder::new(b"\\")); - -pub(crate) enum StringType { +pub enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), FString(ast::FString), @@ -314,7 +310,7 @@ impl StringParser { })); } - let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Bytes(ast::BytesLiteral { value: self.source.into_boxed_bytes(), @@ -340,7 +336,7 @@ impl StringParser { } } - let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) + let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes()) else { // Add the rest of the string to the value. let rest = &self.source[self.cursor..]; @@ -368,7 +364,7 @@ impl StringParser { })); } - let Some(mut escape) = BACKSLASH_FINDER.find(self.source.as_bytes()) else { + let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else { // If the string doesn't contain any escape sequences, return the owned string. return Ok(StringType::Str(ast::StringLiteral { value: self.source, @@ -396,8 +392,7 @@ impl StringParser { } } - let Some(next_escape) = BACKSLASH_FINDER.find(self.source[self.cursor..].as_bytes()) - else { + let Some(next_escape) = self.source[self.cursor..].find('\\') else { // Add the rest of the string to the value. 
let rest = &self.source[self.cursor..]; value.push_str(rest); @@ -424,7 +419,7 @@ impl StringParser { } } -pub(crate) fn parse_string_literal( +pub fn parse_string_literal( source: Box, kind: StringKind, triple_quoted: bool, @@ -440,7 +435,7 @@ pub(crate) fn parse_string_literal( StringParser::new(source, kind, start_location, range).parse() } -pub(crate) fn parse_fstring_literal_element( +pub fn parse_fstring_literal_element( source: Box, is_raw: bool, range: TextRange, @@ -529,7 +524,7 @@ pub(crate) fn concatenated_strings( // TODO: consolidate these with ParseError /// An error that occurred during parsing of an f-string. #[derive(Debug, Clone, PartialEq)] -struct FStringError { +pub(crate) struct FStringError { /// The type of error that occurred. pub(crate) error: FStringErrorType, /// The location of the error. From 58178b30c0eae58977715f09697bca6257507519 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 9 Feb 2024 15:42:10 -0500 Subject: [PATCH 5/5] Revert benchmarking code --- Cargo.lock | 5 - crates/ruff_python_parser/Cargo.toml | 14 - crates/ruff_python_parser/benches/string.rs | 93 --- crates/ruff_python_parser/src/lib.rs | 3 +- crates/ruff_python_parser/src/old_string.rs | 820 -------------------- crates/ruff_python_parser/src/string.rs | 8 +- 6 files changed, 5 insertions(+), 938 deletions(-) delete mode 100644 crates/ruff_python_parser/benches/string.rs delete mode 100644 crates/ruff_python_parser/src/old_string.rs diff --git a/Cargo.lock b/Cargo.lock index ae0b8f6a15259..97511968ff8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2337,21 +2337,16 @@ dependencies = [ "anyhow", "bitflags 2.4.1", "bstr", - "codspeed-criterion-compat", - "criterion", "insta", "is-macro", "itertools 0.12.1", "lalrpop", "lalrpop-util", "memchr", - "mimalloc", - "once_cell", "ruff_python_ast", "ruff_text_size", "rustc-hash", "static_assertions", - "tikv-jemallocator", "tiny-keccak", "unicode-ident", "unicode_names2", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 075cc4ef8a00a..886bb07fec0b6 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -12,11 +12,6 @@ license = { workspace = true } build = "build.rs" [lib] -bench = false - -[[bench]] -name = "string" -harness = false [dependencies] ruff_python_ast = { path = "../ruff_python_ast" } @@ -33,15 +28,6 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } -once_cell = "1.19.0" -criterion = { workspace = true, default-features = false } -codspeed-criterion-compat = { workspace = true, default-features = false, optional = true} - -[target.'cfg(target_os = "windows")'.dev-dependencies] -mimalloc = { workspace = true } - -[target.'cfg(all(not(target_os = "windows"), not(target_os = "openbsd"), any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "powerpc64")))'.dev-dependencies] -tikv-jemallocator = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/benches/string.rs b/crates/ruff_python_parser/benches/string.rs deleted file mode 100644 index 8e1a79c312af8..0000000000000 --- a/crates/ruff_python_parser/benches/string.rs +++ /dev/null @@ -1,93 +0,0 @@ -use criterion::{ - black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, Criterion, -}; -use ruff_python_parser::StringKind; -use ruff_text_size::TextRange; - -#[cfg(target_os = "windows")] -#[global_allocator] 
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; - -#[cfg(all( - not(target_os = "windows"), - not(target_os = "openbsd"), - any( - target_arch = "x86_64", - target_arch = "aarch64", - target_arch = "powerpc64" - ) -))] -#[global_allocator] -static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; - -fn benchmark_parser(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("parse"); - - let s = "\"\"\"Validate length based{ on BIN for major brands: - https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_(IIN)\"\"\""; - - // group.bench_with_input("new_string", &s, |b, &s| { - // b.iter_batched( - // || s.to_string().into_boxed_str(), - // |data| { - // ruff_python_parser::string::parse_string_literal( - // black_box(data), - // StringKind::String, - // true, - // TextRange::default(), - // ) - // }, - // BatchSize::SmallInput, - // ); - // }); - // - // group.bench_function("old_string", |b| { - // b.iter_batched( - // || s.to_string(), - // |data| { - // ruff_python_parser::old_string::parse_string_literal( - // black_box(&data), - // StringKind::String, - // true, - // TextRange::default(), - // ) - // }, - // BatchSize::SmallInput, - // ); - // }); - - let s = "Item {i+1}"; - - group.bench_with_input("new_fstring", &s, |b, &s| { - b.iter_batched( - || s.to_string().into_boxed_str(), - |data| { - ruff_python_parser::string::parse_fstring_literal_element( - black_box(data), - true, - TextRange::default(), - ) - }, - BatchSize::SmallInput, - ); - }); - - group.bench_function("old_fstring", |b| { - b.iter_batched( - || s.to_string(), - |data| { - ruff_python_parser::old_string::parse_fstring_literal_element( - black_box(&data), - true, - TextRange::default(), - ) - }, - BatchSize::SmallInput, - ); - }); - - group.finish(); -} - -criterion_group!(parser, benchmark_parser); -criterion_main!(parser); diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs index 8e855e5d92dc6..7c9c5402fb442 100644 --- a/crates/ruff_python_parser/src/lib.rs +++ b/crates/ruff_python_parser/src/lib.rs @@ -124,10 +124,9 @@ mod function; mod invalid; // Skip flattening lexer to distinguish from full ruff_python_parser pub mod lexer; -pub mod old_string; mod parser; mod soft_keywords; -pub mod string; +mod string; mod token; mod token_source; pub mod typing; diff --git a/crates/ruff_python_parser/src/old_string.rs b/crates/ruff_python_parser/src/old_string.rs deleted file mode 100644 index 54f2dece59198..0000000000000 --- a/crates/ruff_python_parser/src/old_string.rs +++ /dev/null @@ -1,820 +0,0 @@ -//! Parsing of string literals, bytes literals, and implicit string concatenation. 
- -use ruff_python_ast::{self as ast, Expr}; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; - -use crate::lexer::{LexicalError, LexicalErrorType}; -use crate::string::FStringError; -use crate::token::{StringKind, Tok}; - -pub enum StringType { - Str(ast::StringLiteral), - Bytes(ast::BytesLiteral), - FString(ast::FString), -} - -impl Ranged for StringType { - fn range(&self) -> TextRange { - match self { - Self::Str(node) => node.range(), - Self::Bytes(node) => node.range(), - Self::FString(node) => node.range(), - } - } -} - -impl From for Expr { - fn from(string: StringType) -> Self { - match string { - StringType::Str(node) => Expr::from(node), - StringType::Bytes(node) => Expr::from(node), - StringType::FString(node) => Expr::from(node), - } - } -} - -struct StringParser<'a> { - rest: &'a str, - kind: StringKind, - location: TextSize, - range: TextRange, -} - -impl<'a> StringParser<'a> { - fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self { - Self { - rest: source, - kind, - location: start, - range, - } - } - - #[inline] - fn skip_bytes(&mut self, bytes: usize) -> &'a str { - let skipped_str = &self.rest[..bytes]; - self.rest = &self.rest[bytes..]; - self.location += skipped_str.text_len(); - skipped_str - } - - #[inline] - fn get_pos(&self) -> TextSize { - self.location - } - - /// Returns the next byte in the string, if there is one. - /// - /// # Panics - /// - /// When the next byte is a part of a multi-byte character. - #[inline] - fn next_byte(&mut self) -> Option { - self.rest.as_bytes().first().map(|&byte| { - self.rest = &self.rest[1..]; - self.location += TextSize::new(1); - byte - }) - } - - #[inline] - fn next_char(&mut self) -> Option { - self.rest.chars().next().map(|c| { - self.rest = &self.rest[c.len_utf8()..]; - self.location += c.text_len(); - c - }) - } - - #[inline] - fn peek_byte(&self) -> Option { - self.rest.as_bytes().first().copied() - } - - fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { - let mut p: u32 = 0u32; - let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); - for i in 1..=literal_number { - match self.next_char() { - Some(c) => match c.to_digit(16) { - Some(d) => p += d << ((literal_number - i) * 4), - None => return Err(unicode_error), - }, - None => return Err(unicode_error), - } - } - match p { - 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), - _ => std::char::from_u32(p).ok_or(unicode_error), - } - } - - fn parse_octet(&mut self, o: u8) -> char { - let mut radix_bytes = [o, 0, 0]; - let mut len = 1; - - while len < 3 { - let Some(b'0'..=b'7') = self.peek_byte() else { - break; - }; - - radix_bytes[len] = self.next_byte().unwrap(); - len += 1; - } - - // OK because radix_bytes is always going to be in the ASCII range. 
- let radix_str = std::str::from_utf8(&radix_bytes[..len]).expect("ASCII bytes"); - let value = u32::from_str_radix(radix_str, 8).unwrap(); - char::from_u32(value).unwrap() - } - - fn parse_unicode_name(&mut self) -> Result { - let start_pos = self.get_pos(); - - let Some('{') = self.next_char() else { - return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); - }; - - let start_pos = self.get_pos(); - let Some(close_idx) = self.rest.find('}') else { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )); - }; - - let name_and_ending = self.skip_bytes(close_idx + 1); - let name = &name_and_ending[..name_and_ending.len() - 1]; - - unicode_names2::character(name) - .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) - } - - fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { - let Some(first_char) = self.next_char() else { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )); - }; - - let new_char = match first_char { - '\\' => '\\', - '\'' => '\'', - '\"' => '"', - 'a' => '\x07', - 'b' => '\x08', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', - o @ '0'..='7' => self.parse_octet(o as u8), - 'x' => self.parse_unicode_literal(2)?, - 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, - 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, - 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, - // Special cases where the escape sequence is not a single character - '\n' => return Ok(()), - '\r' => { - if self.peek_byte() == Some(b'\n') { - self.next_byte(); - } - - return Ok(()); - } - _ => { - if self.kind.is_any_bytes() && !first_char.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - - string.push('\\'); - - first_char - } - }; - - string.push(new_char); - - Ok(()) - } - - fn parse_fstring_middle(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - // We can encounter a `\` as the last character in a `FStringMiddle` - // token which is valid in this context. For example, - // - // ```python - // f"\{foo} \{bar:\}" - // # ^ ^^ ^ - // ``` - // - // Here, the `FStringMiddle` token content will be "\" and " \" - // which is invalid if we look at the content in isolation: - // - // ```python - // "\" - // ``` - // - // However, the content is syntactically valid in the context of - // the f-string because it's a substring of the entire f-string. - // This is still an invalid escape sequence, but we don't want to - // raise a syntax error as is done by the CPython parser. It might - // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { - self.parse_escaped_char(&mut value)?; - } - // If there are any curly braces inside a `FStringMiddle` token, - // then they were escaped (i.e. `{{` or `}}`). This means that - // we need increase the location by 2 instead of 1. 
- ch @ ('{' | '}') => { - self.location += ch.text_len(); - value.push(ch); - } - ch => value.push(ch), - } - } - Ok(ast::FStringElement::Literal(ast::FStringLiteralElement { - value: value.into_boxed_str(), - range: self.range, - })) - } - - fn parse_bytes(&mut self) -> Result { - let mut content = String::with_capacity(self.rest.len()); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - self.parse_escaped_char(&mut content)?; - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters" - .to_string() - .into_boxed_str(), - ), - self.get_pos(), - )); - } - content.push(ch); - } - } - } - Ok(StringType::Bytes(ast::BytesLiteral { - value: content - .chars() - .map(|c| c as u8) - .collect::>() - .into_boxed_slice(), - range: self.range, - })) - } - - fn parse_string(&mut self) -> Result { - let mut value = String::with_capacity(self.rest.len()); - if self.kind.is_raw() { - value.push_str(self.skip_bytes(self.rest.len())); - } else { - loop { - let Some(escape_idx) = self.rest.find('\\') else { - value.push_str(self.skip_bytes(self.rest.len())); - break; - }; - - let before_with_slash = self.skip_bytes(escape_idx + 1); - let before = &before_with_slash[..before_with_slash.len() - 1]; - - value.push_str(before); - self.parse_escaped_char(&mut value)?; - } - } - Ok(StringType::Str(ast::StringLiteral { - value: value.into_boxed_str(), - unicode: self.kind.is_unicode(), - range: self.range, - })) - } - - fn parse(&mut self) -> Result { - if self.kind.is_any_bytes() { - self.parse_bytes() - } else { - self.parse_string() - } - } -} - -pub fn parse_string_literal( - source: &str, - kind: StringKind, - triple_quoted: bool, - range: TextRange, -) -> Result { - let start_location = range.start() - + kind.prefix_len() - + if triple_quoted { - TextSize::from(3) - } else { - TextSize::from(1) - }; - StringParser::new(source, kind, start_location, range).parse() -} - -pub fn parse_fstring_literal_element( - source: &str, - is_raw: bool, - range: TextRange, -) -> Result { - let kind = if is_raw { - StringKind::RawString - } else { - StringKind::String - }; - StringParser::new(source, kind, range.start(), range).parse_fstring_middle() -} - -pub(crate) fn concatenated_strings( - strings: Vec, - range: TextRange, -) -> Result { - #[cfg(debug_assertions)] - debug_assert!(strings.len() > 1); - - let mut has_fstring = false; - let mut byte_literal_count = 0; - for string in &strings { - match string { - StringType::FString(_) => has_fstring = true, - StringType::Bytes(_) => byte_literal_count += 1, - StringType::Str(_) => {} - } - } - let has_bytes = byte_literal_count > 0; - - if has_bytes && byte_literal_count < strings.len() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "cannot mix bytes and nonbytes literals" - .to_string() - .into_boxed_str(), - ), - range.start(), - )); - } - - if has_bytes { - let mut values = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::Bytes(value) => values.push(value), - _ => unreachable!("Unexpected non-bytes literal."), - } - } - return Ok(Expr::from(ast::ExprBytesLiteral { - value: ast::BytesLiteralValue::concatenated(values), - range, - })); - } - - if !has_fstring { - let mut values = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::Str(value) => values.push(value), - _ => unreachable!("Unexpected non-string literal."), - } - } - return 
Ok(Expr::from(ast::ExprStringLiteral { - value: ast::StringLiteralValue::concatenated(values), - range, - })); - } - - let mut parts = Vec::with_capacity(strings.len()); - for string in strings { - match string { - StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)), - StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)), - StringType::Bytes(_) => unreachable!("Unexpected bytes literal."), - } - } - - Ok(ast::ExprFString { - value: ast::FStringValue::concatenated(parts), - range, - } - .into()) -} - -/// Represents the different types of errors that can occur during parsing of an f-string. -#[derive(Copy, Debug, Clone, PartialEq)] -pub enum FStringErrorType { - /// Expected a right brace after an opened left brace. - UnclosedLbrace, - /// An invalid conversion flag was encountered. - InvalidConversionFlag, - /// A single right brace was encountered. - SingleRbrace, - /// Unterminated string. - UnterminatedString, - /// Unterminated triple-quoted string. - UnterminatedTripleQuotedString, - // TODO(dhruvmanila): The parser can't catch all cases of this error, but - // wherever it can, we'll display the correct error message. - /// A lambda expression without parentheses was encountered. - LambdaWithoutParentheses, -} - -impl std::fmt::Display for FStringErrorType { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - use FStringErrorType::{ - InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace, - UnterminatedString, UnterminatedTripleQuotedString, - }; - match self { - UnclosedLbrace => write!(f, "expecting '}}'"), - InvalidConversionFlag => write!(f, "invalid conversion character"), - SingleRbrace => write!(f, "single '}}' is not allowed"), - UnterminatedString => write!(f, "unterminated string"), - UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"), - LambdaWithoutParentheses => { - write!(f, "lambda expressions are not allowed without parentheses") - } - } - } -} - -#[cfg(test)] -mod tests { - use crate::lexer::LexicalErrorType; - use crate::parser::parse_suite; - use crate::{ParseErrorType, Suite}; - - use super::*; - - const WINDOWS_EOL: &str = "\r\n"; - const MAC_EOL: &str = "\r"; - const UNIX_EOL: &str = "\n"; - - fn string_parser_escaped_eol(eol: &str) -> Suite { - let source = format!(r"'text \{eol}more text'"); - parse_suite(&source).unwrap() - } - - #[test] - fn test_string_parser_escaped_unix_eol() { - let parse_ast = string_parser_escaped_eol(UNIX_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_string_parser_escaped_mac_eol() { - let parse_ast = string_parser_escaped_eol(MAC_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_string_parser_escaped_windows_eol() { - let parse_ast = string_parser_escaped_eol(WINDOWS_EOL); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring() { - let source = r#"f"{a}{ b }{{foo}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_spec() { - let source = r#"f"{foo:{spec}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_not_nested_spec() { - let source = r#"f"{foo:spec}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_empty_fstring() { - 
insta::assert_debug_snapshot!(parse_suite(r#"f"""#,).unwrap()); - } - - #[test] - fn test_fstring_parse_self_documenting_base() { - let source = r#"f"{user=}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_self_documenting_base_more() { - let source = r#"f"mix {user=} with text and {second=}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_self_documenting_format() { - let source = r#"f"{user=:>10}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - fn parse_fstring_error(source: &str) -> FStringErrorType { - parse_suite(source) - .map_err(|e| match e.error { - ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e, - e => unreachable!("Expected FStringError: {:?}", e), - }) - .expect_err("Expected error") - } - - #[test] - fn test_parse_invalid_fstring() { - use FStringErrorType::{InvalidConversionFlag, LambdaWithoutParentheses}; - - assert_eq!(parse_fstring_error(r#"f"{5!x}""#), InvalidConversionFlag); - assert_eq!( - parse_fstring_error("f'{lambda x:{x}}'"), - LambdaWithoutParentheses - ); - assert_eq!( - parse_fstring_error("f'{lambda x: {x}}'"), - LambdaWithoutParentheses - ); - assert!(parse_suite(r#"f"{class}""#,).is_err()); - } - - #[test] - fn test_parse_fstring_not_equals() { - let source = r#"f"{1 != 2}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_equals() { - let source = r#"f"{42 == 42}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_self_doc_prec_space() { - let source = r#"f"{x =}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_self_doc_trailing_space() { - let source = r#"f"{x= }""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_yield_expr() { - let source = r#"f"{yield}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_string_concat() { - let source = "'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_string_concat_1() { - let source = "'Hello ' u'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_string_concat_2() { - let source = "u'Hello ' 'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_1() { - let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_2() { - let source = "'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_3() { - let source = "'Hello ' f'world{\"!\"}'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_f_string_concat_4() { - let source = "'Hello ' f'world{\"!\"}' 'again!'"; - let parse_ast = parse_suite(source).unwrap(); - 
insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_f_string_concat_1() { - let source = "u'Hello ' f'world'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_u_f_string_concat_2() { - let source = "u'Hello ' f'world' '!'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_string_triple_quotes_with_kind() { - let source = "u'''Hello, world!'''"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_single_quoted_byte() { - // single quote - let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_double_quoted_byte() { - // double quote - let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_escape_char_in_byte_literal() { - // backslash does not escape - let source = r#"b"omkmok\Xaa""#; // spell-checker:ignore omkmok - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_byte_literal_1() { - let source = r"rb'\x1z'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_byte_literal_2() { - let source = r"rb'\\'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_escape_octet() { - let source = r"b'\43a\4\1234'"; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_escaped_newline() { - let source = r#"f"\n{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_constant_range() { - let source = r#"f"aaa{bbb}ccc{ddd}eee""#; - let parse_ast = parse_suite(source).unwrap(); - 
insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_unescaped_newline() { - let source = r#"f""" -{x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_escaped_character() { - let source = r#"f"\\{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_raw_fstring() { - let source = r#"rf"{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_triple_quoted_raw_fstring() { - let source = r#"rf"""{x}""""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_line_continuation() { - let source = r#"rf"\ -{x}""#; - let parse_ast = parse_suite(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_string_spec() { - let source = r#"f"{foo:{''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_concatenation_string_spec() { - let source = r#"f"{foo:{'' ''}}""#; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - /// - #[test] - fn test_dont_panic_on_8_in_octal_escape() { - let source = r"bold = '\038[1m'"; - let parse_ast = parse_suite(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - macro_rules! test_aliases_parse { - ($($name:ident: $alias:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!(r#""\N{{{0}}}""#, $alias); - let parse_ast = parse_suite(&source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - )* - } - } - - test_aliases_parse! { - test_backspace_alias: "BACKSPACE", - test_bell_alias: "BEL", - test_carriage_return_alias: "CARRIAGE RETURN", - test_delete_alias: "DELETE", - test_escape_alias: "ESCAPE", - test_form_feed_alias: "FORM FEED", - test_hts_alias: "HTS", - test_character_tabulation_with_justification_alias: "CHARACTER TABULATION WITH JUSTIFICATION", - } -} diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 79d423bfaffb7..fb536537216a0 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -8,7 +8,7 @@ use ruff_text_size::{Ranged, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; -pub enum StringType { +pub(crate) enum StringType { Str(ast::StringLiteral), Bytes(ast::BytesLiteral), FString(ast::FString), @@ -419,7 +419,7 @@ impl StringParser { } } -pub fn parse_string_literal( +pub(crate) fn parse_string_literal( source: Box, kind: StringKind, triple_quoted: bool, @@ -435,7 +435,7 @@ pub fn parse_string_literal( StringParser::new(source, kind, start_location, range).parse() } -pub fn parse_fstring_literal_element( +pub(crate) fn parse_fstring_literal_element( source: Box, is_raw: bool, range: TextRange, @@ -524,7 +524,7 @@ pub(crate) fn concatenated_strings( // TODO: consolidate these with ParseError /// An error that occurred during parsing of an f-string. #[derive(Debug, Clone, PartialEq)] -pub(crate) struct FStringError { +struct FStringError { /// The type of error that occurred. pub(crate) error: FStringErrorType, /// The location of the error.
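
Note on the escape-scanning change above: the hunks replace the lazily-initialized memmem::Finder with a direct memchr::memchr call and copy whole unescaped spans with push_str instead of walking the literal character by character. The following is a minimal, dependency-free sketch of that scanning pattern, not the parser's actual code: it uses str::find where the parser uses memchr::memchr on the byte slice, and the hypothetical unescape/match arm table stands in for parse_escaped_char with only a few escapes handled.

    fn unescape(source: &str) -> String {
        let mut out = String::with_capacity(source.len());
        let mut rest = source;
        loop {
            // Find the next escape sequence; if there is none, copy the rest in bulk.
            let Some(idx) = rest.find('\\') else {
                out.push_str(rest);
                break;
            };
            // Bulk-copy the unescaped prefix in a single push_str call.
            out.push_str(&rest[..idx]);
            let mut chars = rest[idx + 1..].chars();
            // Decode the character after the backslash (deliberately simplified).
            match chars.next() {
                Some('n') => out.push('\n'),
                Some('t') => out.push('\t'),
                Some('\\') => out.push('\\'),
                Some(other) => {
                    // Unknown escape: keep the backslash, mirroring the parser's fallback.
                    out.push('\\');
                    out.push(other);
                }
                None => break,
            }
            rest = chars.as_str();
        }
        out
    }

    fn main() {
        assert_eq!(unescape(r"a\tb\nc"), "a\tb\nc");
        assert_eq!(unescape("no escapes"), "no escapes");
    }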
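
The other source of avoided cloning is the owned fast path visible in the string.rs hunks: when a literal contains no backslash, the new parser returns the Box<str> it was handed instead of copying it into a fresh String. A hedged, simplified illustration of that idea follows; parse_plain_literal is a hypothetical stand-in, and the real StringParser additionally handles raw strings, prefixes, and byte literals.

    fn parse_plain_literal(source: Box<str>) -> Box<str> {
        // Fast path: no escape sequences, so reuse the caller's allocation as-is.
        if !source.contains('\\') {
            return source;
        }
        // Slow path: build a new buffer. A real implementation would decode
        // escape sequences here; the decoding itself is elided in this sketch.
        let mut value = String::with_capacity(source.len());
        for ch in source.chars() {
            value.push(ch);
        }
        value.into_boxed_str()
    }

Because the common case (no escapes) never allocates or copies, callers that already own the token text pay nothing extra, which is the effect the commit subject ("Remove unnecessary string cloning from the parser") is after.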