From de2bd87f27c98438fae79087e0dc65c6099d5dc4 Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala <dhruvmanila@gmail.com>
Date: Mon, 18 Sep 2023 21:35:31 +0530
Subject: [PATCH] Add f-string ranges builder to support nested f-strings

---
 crates/ruff/src/checkers/ast/mod.rs           |   2 +-
 .../ruff_python_index/src/fstring_ranges.rs   |  61 +++++++++
 crates/ruff_python_index/src/indexer.rs       | 123 ++++++++++--------
 crates/ruff_python_index/src/lib.rs           |   1 +
 4 files changed, 133 insertions(+), 54 deletions(-)
 create mode 100644 crates/ruff_python_index/src/fstring_ranges.rs
diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs
index 27a8a2a7d31b0f..365cb6d16a4f75 100644
--- a/crates/ruff/src/checkers/ast/mod.rs
+++ b/crates/ruff/src/checkers/ast/mod.rs
@@ -183,7 +183,7 @@ impl<'a> Checker<'a> {
 
         // Find the quote character used to start the containing f-string.
         let expr = self.semantic.current_expression()?;
-        let string_range = self.indexer.f_string_range(expr.start())?;
+        let string_range = self.indexer.fstring_ranges().innermost(expr.start())?;
         let trailing_quote = trailing_quote(self.locator.slice(string_range))?;
 
         // Invert the quote character, if it's a single quote.
diff --git a/crates/ruff_python_index/src/fstring_ranges.rs b/crates/ruff_python_index/src/fstring_ranges.rs
new file mode 100644
index 00000000000000..15fc08f7a11b7d
--- /dev/null
+++ b/crates/ruff_python_index/src/fstring_ranges.rs
@@ -0,0 +1,61 @@
+use std::collections::BTreeMap;
+
+use ruff_python_parser::Tok;
+use ruff_text_size::{TextRange, TextSize};
+
+/// Stores the ranges of all f-strings in a file, keyed and sorted by [`TextRange::start`].
+/// Nested f-strings each get their own entry, so the stored ranges can overlap.
+#[derive(Debug)]
+pub struct FStringRanges {
+    raw: BTreeMap<TextSize, TextRange>, // start offset of each f-string -> its full range
+}
+
+impl FStringRanges {
+    /// Return the [`TextRange`] of the innermost f-string containing `offset`, if any.
+    pub fn innermost(&self, offset: TextSize) -> Option<TextRange> {
+        self.raw
+            .range(..=offset) // only f-strings starting at or before `offset` can contain it
+            .rev() // latest start first, so the first hit is the most deeply nested
+            .find(|(_, range)| range.contains(offset))
+            .map(|(_, range)| *range)
+    }
+
+    /// Return the [`TextRange`] of the outermost f-string containing `offset`, if any.
+    pub fn outermost(&self, offset: TextSize) -> Option<TextRange> {
+        self.raw
+            .range(..=offset) // only f-strings starting at or before `offset` can contain it
+            .find(|(_, range)| range.contains(offset)) // earliest start first => outermost
+            .map(|(_, range)| *range)
+    }
+
+    #[cfg(test)]
+    pub(crate) fn ranges(&self) -> impl Iterator<Item = TextRange> + '_ {
+        self.raw.values().copied() // all stored ranges, in increasing start order
+    }
+}
+
+#[derive(Default)]
+pub(crate) struct FStringRangesBuilder {
+    start_locations: Vec<TextSize>, // stack of start offsets of f-strings not yet closed
+    raw: BTreeMap<TextSize, TextRange>, // completed f-string ranges, keyed by start offset
+}
+
+impl FStringRangesBuilder {
+    pub(crate) fn visit_token(&mut self, token: &Tok, range: TextRange) {
+        match token {
+            Tok::FStringStart => {
+                self.start_locations.push(range.start()); // open a (possibly nested) f-string
+            }
+            Tok::FStringEnd => {
+                if let Some(start) = self.start_locations.pop() { // most recent unclosed start
+                    self.raw.insert(start, TextRange::new(start, range.end()));
+                }
+            }
+            _ => {} // other tokens are irrelevant to f-string ranges
+        }
+    }
+
+    pub(crate) fn finish(self) -> FStringRanges {
+        FStringRanges { raw: self.raw } // starts still on the stack (never closed) are dropped
+    }
+}
diff --git a/crates/ruff_python_index/src/indexer.rs b/crates/ruff_python_index/src/indexer.rs
index a503ad20f25dda..3af272c527b4f2 100644
--- a/crates/ruff_python_index/src/indexer.rs
+++ b/crates/ruff_python_index/src/indexer.rs
@@ -1,7 +1,6 @@
 //! Struct used to index source code, to enable efficient lookup of tokens that
 //! are omitted from the AST (e.g., commented lines).
 
-use crate::CommentRangesBuilder;
 use ruff_python_ast::Stmt;
 use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::Tok;
@@ -11,15 +10,17 @@ use ruff_python_trivia::{
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextRange, TextSize};
 
+use crate::fstring_ranges::{FStringRanges, FStringRangesBuilder};
+use crate::CommentRangesBuilder;
+
 pub struct Indexer {
     comment_ranges: CommentRanges,
 
     /// Stores the start offset of continuation lines.
     continuation_lines: Vec<TextSize>,
 
-    /// The range of all f-string in the source document. The ranges are sorted by their
-    /// [`TextRange::start`] position in increasing order. No two ranges are overlapping.
-    f_string_ranges: Vec<TextRange>,
+    /// The ranges of all f-strings in the source document, including nested ones.
+    fstring_ranges: FStringRanges,
 }
 
 impl Indexer {
@@ -27,12 +28,8 @@ impl Indexer {
         assert!(TextSize::try_from(locator.contents().len()).is_ok());
 
         let mut comment_ranges_builder = CommentRangesBuilder::default();
+        let mut fstring_ranges_builder = FStringRangesBuilder::default();
         let mut continuation_lines = Vec::new();
-        let mut f_string_ranges = Vec::new();
-        // Range for the first f-string start token in a f-string that could
-        // potentially contain nested f-strings.
-        let mut first_f_string_start_range = None;
-        let mut f_string_start_count = 0u32;
         // Token, end
         let mut prev_end = TextSize::default();
         let mut prev_token: Option<&Tok> = None;
@@ -63,42 +60,20 @@ impl Indexer {
             }
 
             comment_ranges_builder.visit_token(tok, *range);
+            fstring_ranges_builder.visit_token(tok, *range);
 
-            match tok {
-                Tok::Newline | Tok::NonLogicalNewline => {
-                    line_start = range.end();
-                }
-                Tok::FStringStart => {
-                    f_string_start_count += 1;
-                    if f_string_start_count == 1 {
-                        first_f_string_start_range = Some(*range);
-                    }
-                }
-                Tok::FStringEnd => {
-                    // This is always going to be > 0, because the lexer will only
-                    // emit the end token if there was a start token to begin with.
-                    f_string_start_count -= 1;
-                    if f_string_start_count == 0 {
-                        if let Some(start_range) = first_f_string_start_range.take() {
-                            let f_string_range = TextRange::new(start_range.start(), range.end());
-                            f_string_ranges.push(f_string_range);
-
-                            if matches!(locator.slice(range), "'''" | r#"""""#) {
-                                triple_quoted_string_ranges.push(f_string_range);
-                            }
-                        }
-                    }
-                }
-                _ => {}
+            if matches!(tok, Tok::Newline | Tok::NonLogicalNewline) {
+                line_start = range.end();
             }
 
             prev_token = Some(tok);
             prev_end = range.end();
         }
+
         Self {
             comment_ranges: comment_ranges_builder.finish(),
             continuation_lines,
-            f_string_ranges,
+            fstring_ranges: fstring_ranges_builder.finish(),
         }
     }
 
@@ -107,6 +82,11 @@ impl Indexer {
         &self.comment_ranges
     }
 
+    /// Returns the byte offset ranges of all f-strings, including nested ones.
+    pub const fn fstring_ranges(&self) -> &FStringRanges {
+        &self.fstring_ranges
+    }
+
     /// Returns the line start positions of continuations (backslash).
     pub fn continuation_line_starts(&self) -> &[TextSize] {
         &self.continuation_lines
@@ -118,22 +98,6 @@ impl Indexer {
         self.continuation_lines.binary_search(&line_start).is_ok()
     }
 
-    /// Return the [`TextRange`] of the f-string containing a given offset.
-    pub fn f_string_range(&self, offset: TextSize) -> Option<TextRange> {
-        let Ok(string_range_index) = self.f_string_ranges.binary_search_by(|range| {
-            if offset < range.start() {
-                std::cmp::Ordering::Greater
-            } else if range.contains(offset) {
-                std::cmp::Ordering::Equal
-            } else {
-                std::cmp::Ordering::Less
-            }
-        }) else {
-            return None;
-        };
-        Some(self.f_string_ranges[string_range_index])
-    }
-
     /// Returns `true` if a statement or expression includes at least one comment.
     pub fn has_comments<T>(&self, node: &T, locator: &Locator) -> bool
     where
@@ -269,7 +233,7 @@ mod tests {
     use ruff_python_parser::lexer::LexResult;
     use ruff_python_parser::{lexer, Mode};
     use ruff_source_file::Locator;
-    use ruff_text_size::TextSize;
+    use ruff_text_size::{TextRange, TextSize};
 
     use crate::Indexer;
 
@@ -353,4 +317,57 @@ import os
             ]
         );
     }
+
+    #[test]
+    fn test_f_string_ranges() {
+        let contents = r#"
+f"normal f-string"
+f"start {f"inner {f"another"}"} end"
+f"implicit " f"concatenation"
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+        assert_eq!(
+            indexer.fstring_ranges().ranges().collect::<Vec<_>>(),
+            &[
+                TextRange::new(TextSize::from(0), TextSize::from(18)), // f"normal f-string"
+                TextRange::new(TextSize::from(19), TextSize::from(55)), // f"start {...} end"
+                TextRange::new(TextSize::from(28), TextSize::from(49)), // f"inner {...}"
+                TextRange::new(TextSize::from(37), TextSize::from(47)), // f"another"
+                TextRange::new(TextSize::from(56), TextSize::from(68)), // f"implicit "
+                TextRange::new(TextSize::from(69), TextSize::from(85)), // f"concatenation"
+            ]
+        );
+    }
+
+    #[test]
+    fn test_triple_quoted_f_string_ranges() {
+        let contents = r#"
+f"""
+this is one
+multiline f-string
+"""
+f'''
+and this is
+another
+'''
+f"""
+this is a {f"""nested multiline
+f-string"""}
+"""
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+        assert_eq!(
+            indexer.fstring_ranges().ranges().collect::<Vec<_>>(),
+            &[
+                TextRange::new(TextSize::from(0), TextSize::from(39)), // first f""" block
+                TextRange::new(TextSize::from(40), TextSize::from(68)), // f''' block
+                TextRange::new(TextSize::from(69), TextSize::from(122)), // outer f""" block
+                TextRange::new(TextSize::from(85), TextSize::from(117)), // nested f""" inside it
+            ]
+        );
+    }
 }
diff --git a/crates/ruff_python_index/src/lib.rs b/crates/ruff_python_index/src/lib.rs
index 2e585ca5df0e1a..288009e90e80cf 100644
--- a/crates/ruff_python_index/src/lib.rs
+++ b/crates/ruff_python_index/src/lib.rs
@@ -1,4 +1,5 @@
 mod comment_ranges;
+mod fstring_ranges;
 mod indexer;
 
 pub use comment_ranges::CommentRangesBuilder;