From fe79798c12b4771cee0b0c59964ad7bd751c3779 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Wed, 14 Feb 2024 18:54:55 +0100 Subject: [PATCH] split string module (#9987) --- .../ruff_python_formatter/src/string/any.rs | 212 +++++ .../src/string/docstring.rs | 14 +- .../ruff_python_formatter/src/string/mod.rs | 812 +----------------- .../src/string/normalize.rs | 622 ++++++++++++++ 4 files changed, 847 insertions(+), 813 deletions(-) create mode 100644 crates/ruff_python_formatter/src/string/any.rs create mode 100644 crates/ruff_python_formatter/src/string/normalize.rs diff --git a/crates/ruff_python_formatter/src/string/any.rs b/crates/ruff_python_formatter/src/string/any.rs new file mode 100644 index 0000000000000..5c1acf938597a --- /dev/null +++ b/crates/ruff_python_formatter/src/string/any.rs @@ -0,0 +1,212 @@ +use std::iter::FusedIterator; + +use memchr::memchr2; + +use ruff_python_ast::{ + self as ast, AnyNodeRef, Expr, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef, + StringLiteral, +}; +use ruff_source_file::Locator; +use ruff_text_size::{Ranged, TextLen, TextRange}; + +use crate::expression::expr_f_string::f_string_quoting; +use crate::other::f_string::FormatFString; +use crate::other::string_literal::{FormatStringLiteral, StringLiteralKind}; +use crate::prelude::*; +use crate::string::{Quoting, StringPrefix, StringQuotes}; + +/// Represents any kind of string expression. This could be either a string, +/// bytes or f-string. +#[derive(Copy, Clone, Debug)] +pub(crate) enum AnyString<'a> { + String(&'a ExprStringLiteral), + Bytes(&'a ExprBytesLiteral), + FString(&'a ExprFString), +} + +impl<'a> AnyString<'a> { + /// Creates a new [`AnyString`] from the given [`Expr`]. + /// + /// Returns `None` if the expression is not either a string, bytes or f-string. + pub(crate) fn from_expression(expression: &'a Expr) -> Option> { + match expression { + Expr::StringLiteral(string) => Some(AnyString::String(string)), + Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)), + Expr::FString(fstring) => Some(AnyString::FString(fstring)), + _ => None, + } + } + + /// Returns `true` if the string is implicitly concatenated. + pub(crate) fn is_implicit_concatenated(self) -> bool { + match self { + Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(), + Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(), + Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(), + } + } + + /// Returns the quoting to be used for this string. + pub(super) fn quoting(self, locator: &Locator<'_>) -> Quoting { + match self { + Self::String(_) | Self::Bytes(_) => Quoting::CanChange, + Self::FString(f_string) => f_string_quoting(f_string, locator), + } + } + + /// Returns a vector of all the [`AnyStringPart`] of this string. + pub(super) fn parts(self, quoting: Quoting) -> AnyStringPartsIter<'a> { + match self { + Self::String(ExprStringLiteral { value, .. }) => { + AnyStringPartsIter::String(value.iter()) + } + Self::Bytes(ExprBytesLiteral { value, .. }) => AnyStringPartsIter::Bytes(value.iter()), + Self::FString(ExprFString { value, .. }) => { + AnyStringPartsIter::FString(value.iter(), quoting) + } + } + } + + pub(crate) fn is_multiline(self, source: &str) -> bool { + match self { + AnyString::String(_) | AnyString::Bytes(_) => { + let contents = &source[self.range()]; + let prefix = StringPrefix::parse(contents); + let quotes = StringQuotes::parse( + &contents[TextRange::new(prefix.text_len(), contents.text_len())], + ); + + quotes.is_some_and(StringQuotes::is_triple) + && memchr2(b'\n', b'\r', contents.as_bytes()).is_some() + } + AnyString::FString(fstring) => { + memchr2(b'\n', b'\r', source[fstring.range].as_bytes()).is_some() + } + } + } +} + +impl Ranged for AnyString<'_> { + fn range(&self) -> TextRange { + match self { + Self::String(expr) => expr.range(), + Self::Bytes(expr) => expr.range(), + Self::FString(expr) => expr.range(), + } + } +} + +impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> { + fn from(value: &AnyString<'a>) -> Self { + match value { + AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr), + AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr), + AnyString::FString(expr) => AnyNodeRef::ExprFString(expr), + } + } +} + +impl<'a> From> for AnyNodeRef<'a> { + fn from(value: AnyString<'a>) -> Self { + AnyNodeRef::from(&value) + } +} + +impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> { + fn from(value: &AnyString<'a>) -> Self { + match value { + AnyString::String(expr) => ExpressionRef::StringLiteral(expr), + AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr), + AnyString::FString(expr) => ExpressionRef::FString(expr), + } + } +} + +pub(super) enum AnyStringPartsIter<'a> { + String(std::slice::Iter<'a, StringLiteral>), + Bytes(std::slice::Iter<'a, ast::BytesLiteral>), + FString(std::slice::Iter<'a, ast::FStringPart>, Quoting), +} + +impl<'a> Iterator for AnyStringPartsIter<'a> { + type Item = AnyStringPart<'a>; + + fn next(&mut self) -> Option { + let part = match self { + Self::String(inner) => { + let part = inner.next()?; + AnyStringPart::String { + part, + layout: StringLiteralKind::String, + } + } + Self::Bytes(inner) => AnyStringPart::Bytes(inner.next()?), + Self::FString(inner, quoting) => { + let part = inner.next()?; + match part { + ast::FStringPart::Literal(string_literal) => AnyStringPart::String { + part: string_literal, + layout: StringLiteralKind::InImplicitlyConcatenatedFString(*quoting), + }, + ast::FStringPart::FString(f_string) => AnyStringPart::FString { + part: f_string, + quoting: *quoting, + }, + } + } + }; + + Some(part) + } +} + +impl FusedIterator for AnyStringPartsIter<'_> {} + +/// Represents any kind of string which is part of an implicitly concatenated +/// string. This could be either a string, bytes or f-string. +/// +/// This is constructed from the [`AnyString::parts`] method on [`AnyString`]. +#[derive(Clone, Debug)] +pub(super) enum AnyStringPart<'a> { + String { + part: &'a ast::StringLiteral, + layout: StringLiteralKind, + }, + Bytes(&'a ast::BytesLiteral), + FString { + part: &'a ast::FString, + quoting: Quoting, + }, +} + +impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> { + fn from(value: &AnyStringPart<'a>) -> Self { + match value { + AnyStringPart::String { part, .. } => AnyNodeRef::StringLiteral(part), + AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part), + AnyStringPart::FString { part, .. } => AnyNodeRef::FString(part), + } + } +} + +impl Ranged for AnyStringPart<'_> { + fn range(&self) -> TextRange { + match self { + Self::String { part, .. } => part.range(), + Self::Bytes(part) => part.range(), + Self::FString { part, .. } => part.range(), + } + } +} + +impl Format> for AnyStringPart<'_> { + fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + match self { + AnyStringPart::String { part, layout } => { + FormatStringLiteral::new(part, *layout).fmt(f) + } + AnyStringPart::Bytes(bytes_literal) => bytes_literal.format().fmt(f), + AnyStringPart::FString { part, quoting } => FormatFString::new(part, *quoting).fmt(f), + } + } +} diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index b06ba04b5a57b..a6b4539024ed2 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -109,7 +109,7 @@ use super::{NormalizedString, QuoteChar}; /// `indent-width * spaces` to tabs because doing so could break ASCII art and other docstrings /// that use spaces for alignment. pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> FormatResult<()> { - let docstring = &normalized.text; + let docstring = &normalized.text(); // Black doesn't change the indentation of docstrings that contain an escaped newline if contains_unescaped_newline(docstring) { @@ -125,7 +125,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form let mut lines = docstring.split('\n').peekable(); // Start the string - write!(f, [normalized.prefix, normalized.quotes])?; + write!(f, [normalized.prefix(), normalized.quotes()])?; // We track where in the source docstring we are (in source code byte offsets) let mut offset = normalized.start(); @@ -141,7 +141,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form // Edge case: The first line is `""" "content`, so we need to insert chaperone space that keep // inner quotes and closing quotes from getting to close to avoid `""""content` - if trim_both.starts_with(normalized.quotes.quote_char.as_char()) { + if trim_both.starts_with(normalized.quotes().quote_char.as_char()) { space().fmt(f)?; } @@ -168,7 +168,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form { space().fmt(f)?; } - normalized.quotes.fmt(f)?; + normalized.quotes().fmt(f)?; return Ok(()); } @@ -194,7 +194,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form offset, stripped_indentation, already_normalized, - quote_char: normalized.quotes.quote_char, + quote_char: normalized.quotes().quote_char, code_example: CodeExample::default(), } .add_iter(lines)?; @@ -207,7 +207,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form space().fmt(f)?; } - write!(f, [normalized.quotes]) + write!(f, [normalized.quotes()]) } fn contains_unescaped_newline(haystack: &str) -> bool { @@ -1569,7 +1569,7 @@ fn docstring_format_source( /// that avoids `content""""` and `content\"""`. This does only applies to un-escaped backslashes, /// so `content\\ """` doesn't need a space while `content\\\ """` does. fn needs_chaperone_space(normalized: &NormalizedString, trim_end: &str) -> bool { - trim_end.ends_with(normalized.quotes.quote_char.as_char()) + trim_end.ends_with(normalized.quotes().quote_char.as_char()) || trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1 } diff --git a/crates/ruff_python_formatter/src/string/mod.rs b/crates/ruff_python_formatter/src/string/mod.rs index 047ae7cd36306..df307200e6904 100644 --- a/crates/ruff_python_formatter/src/string/mod.rs +++ b/crates/ruff_python_formatter/src/string/mod.rs @@ -1,27 +1,19 @@ -use std::borrow::Cow; -use std::iter::FusedIterator; - use bitflags::bitflags; -use memchr::memchr2; -use ruff_formatter::{format_args, write}; -use ruff_python_ast::{ - self as ast, Expr, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef, -}; -use ruff_python_ast::{AnyNodeRef, StringLiteral}; +pub(crate) use any::AnyString; +pub(crate) use normalize::{NormalizedString, StringNormalizer}; +use ruff_formatter::format_args; use ruff_source_file::Locator; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; +use ruff_text_size::{TextLen, TextRange, TextSize}; use crate::comments::{leading_comments, trailing_comments}; -use crate::expression::expr_f_string::f_string_quoting; use crate::expression::parentheses::in_parentheses_only_soft_line_break_or_space; -use crate::other::f_string::FormatFString; -use crate::other::string_literal::{FormatStringLiteral, StringLiteralKind}; use crate::prelude::*; -use crate::preview::is_hex_codes_in_unicode_sequences_enabled; use crate::QuoteStyle; +mod any; pub(crate) mod docstring; +mod normalize; #[derive(Copy, Clone, Debug, Default)] pub(crate) enum Quoting { @@ -30,202 +22,6 @@ pub(crate) enum Quoting { Preserve, } -/// Represents any kind of string expression. This could be either a string, -/// bytes or f-string. -#[derive(Copy, Clone, Debug)] -pub(crate) enum AnyString<'a> { - String(&'a ExprStringLiteral), - Bytes(&'a ExprBytesLiteral), - FString(&'a ExprFString), -} - -impl<'a> AnyString<'a> { - /// Creates a new [`AnyString`] from the given [`Expr`]. - /// - /// Returns `None` if the expression is not either a string, bytes or f-string. - pub(crate) fn from_expression(expression: &'a Expr) -> Option> { - match expression { - Expr::StringLiteral(string) => Some(AnyString::String(string)), - Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)), - Expr::FString(fstring) => Some(AnyString::FString(fstring)), - _ => None, - } - } - - /// Returns `true` if the string is implicitly concatenated. - pub(crate) fn is_implicit_concatenated(self) -> bool { - match self { - Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(), - Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(), - Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(), - } - } - - /// Returns the quoting to be used for this string. - fn quoting(self, locator: &Locator<'_>) -> Quoting { - match self { - Self::String(_) | Self::Bytes(_) => Quoting::CanChange, - Self::FString(f_string) => f_string_quoting(f_string, locator), - } - } - - /// Returns a vector of all the [`AnyStringPart`] of this string. - fn parts(self, quoting: Quoting) -> AnyStringPartsIter<'a> { - match self { - Self::String(ExprStringLiteral { value, .. }) => { - AnyStringPartsIter::String(value.iter()) - } - Self::Bytes(ExprBytesLiteral { value, .. }) => AnyStringPartsIter::Bytes(value.iter()), - Self::FString(ExprFString { value, .. }) => { - AnyStringPartsIter::FString(value.iter(), quoting) - } - } - } - - pub(crate) fn is_multiline(self, source: &str) -> bool { - match self { - AnyString::String(_) | AnyString::Bytes(_) => { - let contents = &source[self.range()]; - let prefix = StringPrefix::parse(contents); - let quotes = StringQuotes::parse( - &contents[TextRange::new(prefix.text_len(), contents.text_len())], - ); - - quotes.is_some_and(StringQuotes::is_triple) - && memchr2(b'\n', b'\r', contents.as_bytes()).is_some() - } - AnyString::FString(fstring) => { - memchr2(b'\n', b'\r', source[fstring.range].as_bytes()).is_some() - } - } - } -} - -impl Ranged for AnyString<'_> { - fn range(&self) -> TextRange { - match self { - Self::String(expr) => expr.range(), - Self::Bytes(expr) => expr.range(), - Self::FString(expr) => expr.range(), - } - } -} - -impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> { - fn from(value: &AnyString<'a>) -> Self { - match value { - AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr), - AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr), - AnyString::FString(expr) => AnyNodeRef::ExprFString(expr), - } - } -} - -impl<'a> From> for AnyNodeRef<'a> { - fn from(value: AnyString<'a>) -> Self { - AnyNodeRef::from(&value) - } -} - -impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> { - fn from(value: &AnyString<'a>) -> Self { - match value { - AnyString::String(expr) => ExpressionRef::StringLiteral(expr), - AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr), - AnyString::FString(expr) => ExpressionRef::FString(expr), - } - } -} - -enum AnyStringPartsIter<'a> { - String(std::slice::Iter<'a, StringLiteral>), - Bytes(std::slice::Iter<'a, ast::BytesLiteral>), - FString(std::slice::Iter<'a, ast::FStringPart>, Quoting), -} - -impl<'a> Iterator for AnyStringPartsIter<'a> { - type Item = AnyStringPart<'a>; - - fn next(&mut self) -> Option { - let part = match self { - Self::String(inner) => { - let part = inner.next()?; - AnyStringPart::String { - part, - layout: StringLiteralKind::String, - } - } - Self::Bytes(inner) => AnyStringPart::Bytes(inner.next()?), - Self::FString(inner, quoting) => { - let part = inner.next()?; - match part { - ast::FStringPart::Literal(string_literal) => AnyStringPart::String { - part: string_literal, - layout: StringLiteralKind::InImplicitlyConcatenatedFString(*quoting), - }, - ast::FStringPart::FString(f_string) => AnyStringPart::FString { - part: f_string, - quoting: *quoting, - }, - } - } - }; - - Some(part) - } -} - -impl FusedIterator for AnyStringPartsIter<'_> {} - -/// Represents any kind of string which is part of an implicitly concatenated -/// string. This could be either a string, bytes or f-string. -/// -/// This is constructed from the [`AnyString::parts`] method on [`AnyString`]. -#[derive(Clone, Debug)] -enum AnyStringPart<'a> { - String { - part: &'a ast::StringLiteral, - layout: StringLiteralKind, - }, - Bytes(&'a ast::BytesLiteral), - FString { - part: &'a ast::FString, - quoting: Quoting, - }, -} - -impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> { - fn from(value: &AnyStringPart<'a>) -> Self { - match value { - AnyStringPart::String { part, .. } => AnyNodeRef::StringLiteral(part), - AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part), - AnyStringPart::FString { part, .. } => AnyNodeRef::FString(part), - } - } -} - -impl Ranged for AnyStringPart<'_> { - fn range(&self) -> TextRange { - match self { - Self::String { part, .. } => part.range(), - Self::Bytes(part) => part.range(), - Self::FString { part, .. } => part.range(), - } - } -} - -impl Format> for AnyStringPart<'_> { - fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { - match self { - AnyStringPart::String { part, layout } => { - FormatStringLiteral::new(part, *layout).fmt(f) - } - AnyStringPart::Bytes(bytes_literal) => bytes_literal.format().fmt(f), - AnyStringPart::FString { part, quoting } => FormatFString::new(part, *quoting).fmt(f), - } - } -} - /// Formats any implicitly concatenated string. This could be any valid combination /// of string, bytes or f-string literals. pub(crate) struct FormatStringContinuation<'a> { @@ -308,167 +104,6 @@ impl StringPart { } } -pub(crate) struct StringNormalizer { - quoting: Quoting, - preferred_quote_style: QuoteStyle, - parent_docstring_quote_char: Option, - normalize_hex: bool, -} - -impl StringNormalizer { - pub(crate) fn from_context(context: &PyFormatContext<'_>) -> Self { - Self { - quoting: Quoting::default(), - preferred_quote_style: QuoteStyle::default(), - parent_docstring_quote_char: context.docstring(), - normalize_hex: is_hex_codes_in_unicode_sequences_enabled(context), - } - } - - pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self { - self.preferred_quote_style = quote_style; - self - } - - pub(crate) fn with_quoting(mut self, quoting: Quoting) -> Self { - self.quoting = quoting; - self - } - - /// Computes the strings preferred quotes. - pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes { - // Per PEP 8, always prefer double quotes for triple-quoted strings. - // Except when using quote-style-preserve. - let preferred_style = if string.quotes().triple { - // ... unless we're formatting a code snippet inside a docstring, - // then we specifically want to invert our quote style to avoid - // writing out invalid Python. - // - // It's worth pointing out that we can actually wind up being - // somewhat out of sync with PEP8 in this case. Consider this - // example: - // - // def foo(): - // ''' - // Something. - // - // >>> """tricksy""" - // ''' - // pass - // - // Ideally, this would be reformatted as: - // - // def foo(): - // """ - // Something. - // - // >>> '''tricksy''' - // """ - // pass - // - // But the logic here results in the original quoting being - // preserved. This is because the quoting style of the outer - // docstring is determined, in part, by looking at its contents. In - // this case, it notices that it contains a `"""` and thus infers - // that using `'''` would overall read better because it avoids - // the need to escape the interior `"""`. Except... in this case, - // the `"""` is actually part of a code snippet that could get - // reformatted to using a different quoting style itself. - // - // Fixing this would, I believe, require some fairly seismic - // changes to how formatting strings works. Namely, we would need - // to look for code snippets before normalizing the docstring, and - // then figure out the quoting style more holistically by looking - // at the various kinds of quotes used in the code snippets and - // what reformatting them might look like. - // - // Overall this is a bit of a corner case and just inverting the - // style from what the parent ultimately decided upon works, even - // if it doesn't have perfect alignment with PEP8. - if let Some(quote) = self.parent_docstring_quote_char { - QuoteStyle::from(quote.invert()) - } else if self.preferred_quote_style.is_preserve() { - QuoteStyle::Preserve - } else { - QuoteStyle::Double - } - } else { - self.preferred_quote_style - }; - - match self.quoting { - Quoting::Preserve => string.quotes(), - Quoting::CanChange => { - if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) { - let raw_content = locator.slice(string.content_range()); - if string.prefix().is_raw_string() { - choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote) - } else { - choose_quotes_impl(raw_content, string.quotes(), preferred_quote) - } - } else { - string.quotes() - } - } - } - } - - /// Computes the strings preferred quotes and normalizes its content. - pub(crate) fn normalize<'a>( - &self, - string: &StringPart, - locator: &'a Locator, - ) -> NormalizedString<'a> { - let raw_content = locator.slice(string.content_range()); - - let quotes = self.choose_quotes(string, locator); - - let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex); - - NormalizedString { - prefix: string.prefix(), - content_range: string.content_range(), - text: normalized, - quotes, - } - } -} - -#[derive(Debug)] -pub(crate) struct NormalizedString<'a> { - prefix: StringPrefix, - - /// The quotes of the normalized string (preferred quotes) - quotes: StringQuotes, - - /// The range of the string's content in the source (minus prefix and quotes). - content_range: TextRange, - - /// The normalized text - text: Cow<'a, str>, -} - -impl Ranged for NormalizedString<'_> { - fn range(&self) -> TextRange { - self.content_range - } -} - -impl Format> for NormalizedString<'_> { - fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { - write!(f, [self.prefix, self.quotes])?; - match &self.text { - Cow::Borrowed(_) => { - source_text_slice(self.range()).fmt(f)?; - } - Cow::Owned(normalized) => { - text(normalized).fmt(f)?; - } - } - self.quotes.fmt(f) - } -} - bitflags! { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub(crate) struct StringPrefix: u8 { @@ -549,175 +184,6 @@ impl Format> for StringPrefix { } } -/// Choose the appropriate quote style for a raw string. -/// -/// The preferred quote style is chosen unless the string contains unescaped quotes of the -/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote -/// style is double quotes. -fn choose_quotes_for_raw_string( - input: &str, - quotes: StringQuotes, - preferred_quote: QuoteChar, -) -> StringQuotes { - let preferred_quote_char = preferred_quote.as_char(); - let mut chars = input.chars().peekable(); - let contains_unescaped_configured_quotes = loop { - match chars.next() { - Some('\\') => { - // Ignore escaped characters - chars.next(); - } - // `"` or `'` - Some(c) if c == preferred_quote_char => { - if !quotes.triple { - break true; - } - - match chars.peek() { - // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser - // about where the closing triple quotes start - None => break true, - Some(next) if *next == preferred_quote_char => { - // `""` or `''` - chars.next(); - - // We can't turn `r'''""'''` into `r""""""""`, nor can we have - // `"""` or `'''` respectively inside the string - if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) { - break true; - } - } - _ => {} - } - } - Some(_) => continue, - None => break false, - } - }; - - StringQuotes { - triple: quotes.triple, - quote_char: if contains_unescaped_configured_quotes { - quotes.quote_char - } else { - preferred_quote - }, - } -} - -/// Choose the appropriate quote style for a string. -/// -/// For single quoted strings, the preferred quote style is used, unless the alternative quote style -/// would require fewer escapes. -/// -/// For triple quoted strings, the preferred quote style is always used, unless the string contains -/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be -/// used unless the string contains `"""`). -fn choose_quotes_impl( - input: &str, - quotes: StringQuotes, - preferred_quote: QuoteChar, -) -> StringQuotes { - let quote = if quotes.triple { - // True if the string contains a triple quote sequence of the configured quote style. - let mut uses_triple_quotes = false; - let mut chars = input.chars().peekable(); - - while let Some(c) = chars.next() { - let preferred_quote_char = preferred_quote.as_char(); - match c { - '\\' => { - if matches!(chars.peek(), Some('"' | '\\')) { - chars.next(); - } - } - // `"` or `'` - c if c == preferred_quote_char => { - match chars.peek().copied() { - Some(c) if c == preferred_quote_char => { - // `""` or `''` - chars.next(); - - match chars.peek().copied() { - Some(c) if c == preferred_quote_char => { - // `"""` or `'''` - chars.next(); - uses_triple_quotes = true; - break; - } - Some(_) => {} - None => { - // Handle `''' ""'''`. At this point we have consumed both - // double quotes, so on the next iteration the iterator is empty - // and we'd miss the string ending with a preferred quote - uses_triple_quotes = true; - break; - } - } - } - Some(_) => { - // A single quote char, this is ok - } - None => { - // Trailing quote at the end of the comment - uses_triple_quotes = true; - break; - } - } - } - _ => continue, - } - } - - if uses_triple_quotes { - // String contains a triple quote sequence of the configured quote style. - // Keep the existing quote style. - quotes.quote_char - } else { - preferred_quote - } - } else { - let mut single_quotes = 0u32; - let mut double_quotes = 0u32; - - for c in input.chars() { - match c { - '\'' => { - single_quotes += 1; - } - - '"' => { - double_quotes += 1; - } - - _ => continue, - } - } - - match preferred_quote { - QuoteChar::Single => { - if single_quotes > double_quotes { - QuoteChar::Double - } else { - QuoteChar::Single - } - } - QuoteChar::Double => { - if double_quotes > single_quotes { - QuoteChar::Single - } else { - QuoteChar::Double - } - } - } - }; - - StringQuotes { - triple: quotes.triple, - quote_char: quote, - } -} - #[derive(Copy, Clone, Debug)] pub(crate) struct StringQuotes { triple: bool, @@ -821,269 +287,3 @@ impl TryFrom for QuoteChar { } } } - -/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input` -/// with the provided [`StringQuotes`] style. -/// -/// Returns the normalized string and whether it contains new lines. -pub(crate) fn normalize_string( - input: &str, - quotes: StringQuotes, - prefix: StringPrefix, - normalize_hex: bool, -) -> Cow { - // The normalized string if `input` is not yet normalized. - // `output` must remain empty if `input` is already normalized. - let mut output = String::new(); - // Tracks the last index of `input` that has been written to `output`. - // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. - let mut last_index = 0; - - let quote = quotes.quote_char; - let preferred_quote = quote.as_char(); - let opposite_quote = quote.invert().as_char(); - - let mut chars = input.char_indices().peekable(); - - let is_raw = prefix.is_raw_string(); - let is_fstring = prefix.is_fstring(); - let mut formatted_value_nesting = 0u32; - - while let Some((index, c)) = chars.next() { - if is_fstring && matches!(c, '{' | '}') { - if chars.peek().copied().is_some_and(|(_, next)| next == c) { - // Skip over the second character of the double braces - chars.next(); - } else if c == '{' { - formatted_value_nesting += 1; - } else { - // Safe to assume that `c == '}'` here because of the matched pattern above - formatted_value_nesting = formatted_value_nesting.saturating_sub(1); - } - continue; - } - if c == '\r' { - output.push_str(&input[last_index..index]); - - // Skip over the '\r' character, keep the `\n` - if chars.peek().copied().is_some_and(|(_, next)| next == '\n') { - chars.next(); - } - // Replace the `\r` with a `\n` - else { - output.push('\n'); - } - - last_index = index + '\r'.len_utf8(); - } else if !is_raw { - if c == '\\' { - if let Some((_, next)) = chars.clone().next() { - if next == '\\' { - // Skip over escaped backslashes - chars.next(); - } else if normalize_hex { - if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte()) - .and_then(|escape| { - escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..]) - }) - { - // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) - let escape_start_len = '\\'.len_utf8() + next.len_utf8(); - let escape_start_offset = index + escape_start_len; - if let Cow::Owned(normalised) = &normalised { - output.push_str(&input[last_index..escape_start_offset]); - output.push_str(normalised); - last_index = escape_start_offset + normalised.len(); - }; - - // Move the `chars` iterator passed the escape sequence. - // Simply reassigning `chars` doesn't work because the indices` would - // then be off. - for _ in 0..next.len_utf8() + normalised.len() { - chars.next(); - } - } - } - - if !quotes.triple { - #[allow(clippy::if_same_then_else)] - if next == opposite_quote && formatted_value_nesting == 0 { - // Remove the escape by ending before the backslash and starting again with the quote - chars.next(); - output.push_str(&input[last_index..index]); - last_index = index + '\\'.len_utf8(); - } else if next == preferred_quote { - // Quote is already escaped, skip over it. - chars.next(); - } - } - } - } else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 { - // Escape the quote - output.push_str(&input[last_index..index]); - output.push('\\'); - output.push(c); - last_index = index + preferred_quote.len_utf8(); - } - } - } - - let normalized = if last_index == 0 { - Cow::Borrowed(input) - } else { - output.push_str(&input[last_index..]); - Cow::Owned(output) - }; - - normalized -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum UnicodeEscape { - /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters. - Hex(usize), - - /// An escaped unicode name (`\N{name}`) - CharacterName, -} - -impl UnicodeEscape { - fn new(first: char, allow_unicode: bool) -> Option { - Some(match first { - 'x' => UnicodeEscape::Hex(2), - 'u' if allow_unicode => UnicodeEscape::Hex(4), - 'U' if allow_unicode => UnicodeEscape::Hex(8), - 'N' if allow_unicode => UnicodeEscape::CharacterName, - _ => return None, - }) - } - - /// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to: - /// - /// * `\u`, `\U'` and `\x`: To use lower case for the characters `a-f`. - /// * `\N`: To use uppercase letters - fn normalize(self, input: &str) -> Option> { - let mut normalised = String::new(); - - let len = match self { - UnicodeEscape::Hex(len) => { - // It's not a valid escape sequence if the input string has fewer characters - // left than required by the escape sequence. - if input.len() < len { - return None; - } - - for (index, c) in input.char_indices().take(len) { - match c { - '0'..='9' | 'a'..='f' => { - if !normalised.is_empty() { - normalised.push(c); - } - } - 'A'..='F' => { - if normalised.is_empty() { - normalised.reserve(len); - normalised.push_str(&input[..index]); - normalised.push(c.to_ascii_lowercase()); - } else { - normalised.push(c.to_ascii_lowercase()); - } - } - _ => { - // not a valid escape sequence - return None; - } - } - } - - len - } - UnicodeEscape::CharacterName => { - let mut char_indices = input.char_indices(); - - if !matches!(char_indices.next(), Some((_, '{'))) { - return None; - } - - loop { - if let Some((index, c)) = char_indices.next() { - match c { - '}' => { - if !normalised.is_empty() { - normalised.push('}'); - } - - // Name must be at least two characters long. - if index < 3 { - return None; - } - - break index + '}'.len_utf8(); - } - '0'..='9' | 'A'..='Z' | ' ' | '-' => { - if !normalised.is_empty() { - normalised.push(c); - } - } - 'a'..='z' => { - if normalised.is_empty() { - normalised.reserve(c.len_utf8() + '}'.len_utf8()); - normalised.push_str(&input[..index]); - normalised.push(c.to_ascii_uppercase()); - } else { - normalised.push(c.to_ascii_uppercase()); - } - } - _ => { - // Seems like an invalid escape sequence, don't normalise it. - return None; - } - } - } else { - // Unterminated escape sequence, don't normalise it. - return None; - } - } - } - }; - - Some(if normalised.is_empty() { - Cow::Borrowed(&input[..len]) - } else { - Cow::Owned(normalised) - }) - } -} - -#[cfg(test)] -mod tests { - use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape}; - use std::borrow::Cow; - - #[test] - fn normalize_32_escape() { - let escape_sequence = UnicodeEscape::new('U', true).unwrap(); - - assert_eq!( - Some(Cow::Owned("0001f60e".to_string())), - escape_sequence.normalize("0001F60E") - ); - } - - #[test] - fn normalize_hex_in_byte_string() { - let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; - - let normalized = normalize_string( - input, - StringQuotes { - triple: false, - quote_char: QuoteChar::Double, - }, - StringPrefix::BYTE, - true, - ); - - assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized); - } -} diff --git a/crates/ruff_python_formatter/src/string/normalize.rs b/crates/ruff_python_formatter/src/string/normalize.rs new file mode 100644 index 0000000000000..5982781937bc9 --- /dev/null +++ b/crates/ruff_python_formatter/src/string/normalize.rs @@ -0,0 +1,622 @@ +use std::borrow::Cow; + +use ruff_source_file::Locator; +use ruff_text_size::{Ranged, TextRange}; + +use crate::prelude::*; +use crate::preview::is_hex_codes_in_unicode_sequences_enabled; +use crate::string::{QuoteChar, Quoting, StringPart, StringPrefix, StringQuotes}; +use crate::QuoteStyle; + +pub(crate) struct StringNormalizer { + quoting: Quoting, + preferred_quote_style: QuoteStyle, + parent_docstring_quote_char: Option, + normalize_hex: bool, +} + +impl StringNormalizer { + pub(crate) fn from_context(context: &PyFormatContext<'_>) -> Self { + Self { + quoting: Quoting::default(), + preferred_quote_style: QuoteStyle::default(), + parent_docstring_quote_char: context.docstring(), + normalize_hex: is_hex_codes_in_unicode_sequences_enabled(context), + } + } + + pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self { + self.preferred_quote_style = quote_style; + self + } + + pub(crate) fn with_quoting(mut self, quoting: Quoting) -> Self { + self.quoting = quoting; + self + } + + /// Computes the strings preferred quotes. + pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes { + // Per PEP 8, always prefer double quotes for triple-quoted strings. + // Except when using quote-style-preserve. + let preferred_style = if string.quotes().triple { + // ... unless we're formatting a code snippet inside a docstring, + // then we specifically want to invert our quote style to avoid + // writing out invalid Python. + // + // It's worth pointing out that we can actually wind up being + // somewhat out of sync with PEP8 in this case. Consider this + // example: + // + // def foo(): + // ''' + // Something. + // + // >>> """tricksy""" + // ''' + // pass + // + // Ideally, this would be reformatted as: + // + // def foo(): + // """ + // Something. + // + // >>> '''tricksy''' + // """ + // pass + // + // But the logic here results in the original quoting being + // preserved. This is because the quoting style of the outer + // docstring is determined, in part, by looking at its contents. In + // this case, it notices that it contains a `"""` and thus infers + // that using `'''` would overall read better because it avoids + // the need to escape the interior `"""`. Except... in this case, + // the `"""` is actually part of a code snippet that could get + // reformatted to using a different quoting style itself. + // + // Fixing this would, I believe, require some fairly seismic + // changes to how formatting strings works. Namely, we would need + // to look for code snippets before normalizing the docstring, and + // then figure out the quoting style more holistically by looking + // at the various kinds of quotes used in the code snippets and + // what reformatting them might look like. + // + // Overall this is a bit of a corner case and just inverting the + // style from what the parent ultimately decided upon works, even + // if it doesn't have perfect alignment with PEP8. + if let Some(quote) = self.parent_docstring_quote_char { + QuoteStyle::from(quote.invert()) + } else if self.preferred_quote_style.is_preserve() { + QuoteStyle::Preserve + } else { + QuoteStyle::Double + } + } else { + self.preferred_quote_style + }; + + match self.quoting { + Quoting::Preserve => string.quotes(), + Quoting::CanChange => { + if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) { + let raw_content = locator.slice(string.content_range()); + if string.prefix().is_raw_string() { + choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote) + } else { + choose_quotes_impl(raw_content, string.quotes(), preferred_quote) + } + } else { + string.quotes() + } + } + } + } + + /// Computes the strings preferred quotes and normalizes its content. + pub(crate) fn normalize<'a>( + &self, + string: &StringPart, + locator: &'a Locator, + ) -> NormalizedString<'a> { + let raw_content = locator.slice(string.content_range()); + + let quotes = self.choose_quotes(string, locator); + + let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex); + + NormalizedString { + prefix: string.prefix(), + content_range: string.content_range(), + text: normalized, + quotes, + } + } +} + +#[derive(Debug)] +pub(crate) struct NormalizedString<'a> { + prefix: crate::string::StringPrefix, + + /// The quotes of the normalized string (preferred quotes) + quotes: StringQuotes, + + /// The range of the string's content in the source (minus prefix and quotes). + content_range: TextRange, + + /// The normalized text + text: Cow<'a, str>, +} + +impl<'a> NormalizedString<'a> { + pub(crate) fn text(&self) -> &Cow<'a, str> { + &self.text + } + + pub(crate) fn quotes(&self) -> StringQuotes { + self.quotes + } + + pub(crate) fn prefix(&self) -> StringPrefix { + self.prefix + } +} + +impl Ranged for NormalizedString<'_> { + fn range(&self) -> TextRange { + self.content_range + } +} + +impl Format> for NormalizedString<'_> { + fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { + ruff_formatter::write!(f, [self.prefix, self.quotes])?; + match &self.text { + Cow::Borrowed(_) => { + source_text_slice(self.range()).fmt(f)?; + } + Cow::Owned(normalized) => { + text(normalized).fmt(f)?; + } + } + self.quotes.fmt(f) + } +} + +/// Choose the appropriate quote style for a raw string. +/// +/// The preferred quote style is chosen unless the string contains unescaped quotes of the +/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote +/// style is double quotes. +fn choose_quotes_for_raw_string( + input: &str, + quotes: StringQuotes, + preferred_quote: QuoteChar, +) -> StringQuotes { + let preferred_quote_char = preferred_quote.as_char(); + let mut chars = input.chars().peekable(); + let contains_unescaped_configured_quotes = loop { + match chars.next() { + Some('\\') => { + // Ignore escaped characters + chars.next(); + } + // `"` or `'` + Some(c) if c == preferred_quote_char => { + if !quotes.triple { + break true; + } + + match chars.peek() { + // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser + // about where the closing triple quotes start + None => break true, + Some(next) if *next == preferred_quote_char => { + // `""` or `''` + chars.next(); + + // We can't turn `r'''""'''` into `r""""""""`, nor can we have + // `"""` or `'''` respectively inside the string + if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) { + break true; + } + } + _ => {} + } + } + Some(_) => continue, + None => break false, + } + }; + + StringQuotes { + triple: quotes.triple, + quote_char: if contains_unescaped_configured_quotes { + quotes.quote_char + } else { + preferred_quote + }, + } +} + +/// Choose the appropriate quote style for a string. +/// +/// For single quoted strings, the preferred quote style is used, unless the alternative quote style +/// would require fewer escapes. +/// +/// For triple quoted strings, the preferred quote style is always used, unless the string contains +/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be +/// used unless the string contains `"""`). +fn choose_quotes_impl( + input: &str, + quotes: StringQuotes, + preferred_quote: QuoteChar, +) -> StringQuotes { + let quote = if quotes.triple { + // True if the string contains a triple quote sequence of the configured quote style. + let mut uses_triple_quotes = false; + let mut chars = input.chars().peekable(); + + while let Some(c) = chars.next() { + let preferred_quote_char = preferred_quote.as_char(); + match c { + '\\' => { + if matches!(chars.peek(), Some('"' | '\\')) { + chars.next(); + } + } + // `"` or `'` + c if c == preferred_quote_char => { + match chars.peek().copied() { + Some(c) if c == preferred_quote_char => { + // `""` or `''` + chars.next(); + + match chars.peek().copied() { + Some(c) if c == preferred_quote_char => { + // `"""` or `'''` + chars.next(); + uses_triple_quotes = true; + break; + } + Some(_) => {} + None => { + // Handle `''' ""'''`. At this point we have consumed both + // double quotes, so on the next iteration the iterator is empty + // and we'd miss the string ending with a preferred quote + uses_triple_quotes = true; + break; + } + } + } + Some(_) => { + // A single quote char, this is ok + } + None => { + // Trailing quote at the end of the comment + uses_triple_quotes = true; + break; + } + } + } + _ => continue, + } + } + + if uses_triple_quotes { + // String contains a triple quote sequence of the configured quote style. + // Keep the existing quote style. + quotes.quote_char + } else { + preferred_quote + } + } else { + let mut single_quotes = 0u32; + let mut double_quotes = 0u32; + + for c in input.chars() { + match c { + '\'' => { + single_quotes += 1; + } + + '"' => { + double_quotes += 1; + } + + _ => continue, + } + } + + match preferred_quote { + QuoteChar::Single => { + if single_quotes > double_quotes { + QuoteChar::Double + } else { + QuoteChar::Single + } + } + QuoteChar::Double => { + if double_quotes > single_quotes { + QuoteChar::Single + } else { + QuoteChar::Double + } + } + } + }; + + StringQuotes { + triple: quotes.triple, + quote_char: quote, + } +} + +/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input` +/// with the provided [`StringQuotes`] style. +/// +/// Returns the normalized string and whether it contains new lines. +pub(crate) fn normalize_string( + input: &str, + quotes: StringQuotes, + prefix: StringPrefix, + normalize_hex: bool, +) -> Cow { + // The normalized string if `input` is not yet normalized. + // `output` must remain empty if `input` is already normalized. + let mut output = String::new(); + // Tracks the last index of `input` that has been written to `output`. + // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. + let mut last_index = 0; + + let quote = quotes.quote_char; + let preferred_quote = quote.as_char(); + let opposite_quote = quote.invert().as_char(); + + let mut chars = input.char_indices().peekable(); + + let is_raw = prefix.is_raw_string(); + let is_fstring = prefix.is_fstring(); + let mut formatted_value_nesting = 0u32; + + while let Some((index, c)) = chars.next() { + if is_fstring && matches!(c, '{' | '}') { + if chars.peek().copied().is_some_and(|(_, next)| next == c) { + // Skip over the second character of the double braces + chars.next(); + } else if c == '{' { + formatted_value_nesting += 1; + } else { + // Safe to assume that `c == '}'` here because of the matched pattern above + formatted_value_nesting = formatted_value_nesting.saturating_sub(1); + } + continue; + } + if c == '\r' { + output.push_str(&input[last_index..index]); + + // Skip over the '\r' character, keep the `\n` + if chars.peek().copied().is_some_and(|(_, next)| next == '\n') { + chars.next(); + } + // Replace the `\r` with a `\n` + else { + output.push('\n'); + } + + last_index = index + '\r'.len_utf8(); + } else if !is_raw { + if c == '\\' { + if let Some((_, next)) = chars.clone().next() { + if next == '\\' { + // Skip over escaped backslashes + chars.next(); + } else if normalize_hex { + if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte()) + .and_then(|escape| { + escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..]) + }) + { + // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) + let escape_start_len = '\\'.len_utf8() + next.len_utf8(); + let escape_start_offset = index + escape_start_len; + if let Cow::Owned(normalised) = &normalised { + output.push_str(&input[last_index..escape_start_offset]); + output.push_str(normalised); + last_index = escape_start_offset + normalised.len(); + }; + + // Move the `chars` iterator passed the escape sequence. + // Simply reassigning `chars` doesn't work because the indices` would + // then be off. + for _ in 0..next.len_utf8() + normalised.len() { + chars.next(); + } + } + } + + if !quotes.triple { + #[allow(clippy::if_same_then_else)] + if next == opposite_quote && formatted_value_nesting == 0 { + // Remove the escape by ending before the backslash and starting again with the quote + chars.next(); + output.push_str(&input[last_index..index]); + last_index = index + '\\'.len_utf8(); + } else if next == preferred_quote { + // Quote is already escaped, skip over it. + chars.next(); + } + } + } + } else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 { + // Escape the quote + output.push_str(&input[last_index..index]); + output.push('\\'); + output.push(c); + last_index = index + preferred_quote.len_utf8(); + } + } + } + + let normalized = if last_index == 0 { + Cow::Borrowed(input) + } else { + output.push_str(&input[last_index..]); + Cow::Owned(output) + }; + + normalized +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum UnicodeEscape { + /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters. + Hex(usize), + + /// An escaped unicode name (`\N{name}`) + CharacterName, +} + +impl UnicodeEscape { + fn new(first: char, allow_unicode: bool) -> Option { + Some(match first { + 'x' => UnicodeEscape::Hex(2), + 'u' if allow_unicode => UnicodeEscape::Hex(4), + 'U' if allow_unicode => UnicodeEscape::Hex(8), + 'N' if allow_unicode => UnicodeEscape::CharacterName, + _ => return None, + }) + } + + /// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to: + /// + /// * `\u`, `\U'` and `\x`: To use lower case for the characters `a-f`. + /// * `\N`: To use uppercase letters + fn normalize(self, input: &str) -> Option> { + let mut normalised = String::new(); + + let len = match self { + UnicodeEscape::Hex(len) => { + // It's not a valid escape sequence if the input string has fewer characters + // left than required by the escape sequence. + if input.len() < len { + return None; + } + + for (index, c) in input.char_indices().take(len) { + match c { + '0'..='9' | 'a'..='f' => { + if !normalised.is_empty() { + normalised.push(c); + } + } + 'A'..='F' => { + if normalised.is_empty() { + normalised.reserve(len); + normalised.push_str(&input[..index]); + normalised.push(c.to_ascii_lowercase()); + } else { + normalised.push(c.to_ascii_lowercase()); + } + } + _ => { + // not a valid escape sequence + return None; + } + } + } + + len + } + UnicodeEscape::CharacterName => { + let mut char_indices = input.char_indices(); + + if !matches!(char_indices.next(), Some((_, '{'))) { + return None; + } + + loop { + if let Some((index, c)) = char_indices.next() { + match c { + '}' => { + if !normalised.is_empty() { + normalised.push('}'); + } + + // Name must be at least two characters long. + if index < 3 { + return None; + } + + break index + '}'.len_utf8(); + } + '0'..='9' | 'A'..='Z' | ' ' | '-' => { + if !normalised.is_empty() { + normalised.push(c); + } + } + 'a'..='z' => { + if normalised.is_empty() { + normalised.reserve(c.len_utf8() + '}'.len_utf8()); + normalised.push_str(&input[..index]); + normalised.push(c.to_ascii_uppercase()); + } else { + normalised.push(c.to_ascii_uppercase()); + } + } + _ => { + // Seems like an invalid escape sequence, don't normalise it. + return None; + } + } + } else { + // Unterminated escape sequence, don't normalise it. + return None; + } + } + } + }; + + Some(if normalised.is_empty() { + Cow::Borrowed(&input[..len]) + } else { + Cow::Owned(normalised) + }) + } +} + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + + use crate::string::{QuoteChar, StringPrefix, StringQuotes}; + + use super::{normalize_string, UnicodeEscape}; + + #[test] + fn normalize_32_escape() { + let escape_sequence = UnicodeEscape::new('U', true).unwrap(); + + assert_eq!( + Some(Cow::Owned("0001f60e".to_string())), + escape_sequence.normalize("0001F60E") + ); + } + + #[test] + fn normalize_hex_in_byte_string() { + let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"; + + let normalized = normalize_string( + input, + StringQuotes { + triple: false, + quote_char: QuoteChar::Double, + }, + StringPrefix::BYTE, + true, + ); + + assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized); + } +}