From 42531d65a811e58fc9c737cdeacb4ee69a9f716b Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala
Date: Thu, 30 May 2024 11:22:23 +0530
Subject: [PATCH] Consider "gap" between tokens for range query

---
 crates/ruff_python_parser/src/lexer.rs |   4 +-
 crates/ruff_python_parser/src/lib.rs   | 238 ++++++++++++++++++++++---
 2 files changed, 215 insertions(+), 27 deletions(-)

diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index b70d2f2639d3d3..64c9f942a3584f 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -1386,7 +1386,7 @@ impl<'src> Lexer<'src> {
 }
 
 bitflags! {
-    #[derive(Clone, Copy, Debug)]
+    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub(crate) struct TokenFlags: u8 {
         /// The token is a string with double quotes (`"`).
         const DOUBLE_QUOTES = 1 << 0;
@@ -1468,7 +1468,7 @@ impl TokenFlags {
     }
 }
 
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct Token {
     /// The kind of the token.
     kind: TokenKind,
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index 5dfe22048372a5..819437c59cbd44 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -388,48 +388,94 @@ impl Tokens {
         &self.raw[..end]
     }
 
-    /// Returns a slice of the [`Token`] that are within the given `range`.
+    /// Returns a slice of [`Token`]s that are within the given `range`.
     ///
-    /// The start and end position of the given range should correspond to the start position of
-    /// the first token and the end position of the last token in the returned slice.
+    /// The start and end offset of the given range should each be either:
+    /// 1. At a token boundary
+    /// 2. In the gap between two tokens
     ///
     /// For example, considering the following tokens and their corresponding range:
     ///
-    /// ```txt
-    /// Def 0..3
-    /// Name 4..7
-    /// Lpar 7..8
-    /// Rpar 8..9
-    /// Colon 9..10
-    /// Ellipsis 11..14
-    /// Newline 14..14
-    /// ```
+    /// | Token               | Range    |
+    /// |---------------------|----------|
+    /// | `Def`               | `0..3`   |
+    /// | `Name`              | `4..7`   |
+    /// | `Lpar`              | `7..8`   |
+    /// | `Rpar`              | `8..9`   |
+    /// | `Colon`             | `9..10`  |
+    /// | `Newline`           | `10..11` |
+    /// | `Comment`           | `15..24` |
+    /// | `NonLogicalNewline` | `24..25` |
+    /// | `Indent`            | `25..29` |
+    /// | `Pass`              | `29..33` |
     ///
-    /// The range `4..10` would return a slice of `Name`, `Lpar`, `Rpar`, and `Colon` tokens. But,
-    /// if either the start or end position of the given range doesn't match any of the tokens
-    /// (like `5..10` or `4..12`), the returned slice will be empty.
+    /// Here, for (1) a token boundary is the start or end offset of any of the above tokens.
+    /// For (2), the gap is any offset strictly between the `Newline` and `Comment` tokens,
+    /// namely 12, 13, and 14.
+    ///
+    /// Examples:
+    /// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
+    /// 2) `11..25` would give `Comment`, `NonLogicalNewline`
+    /// 3) `12..25` would give the same as (2); the start offset 12 falls in the "gap"
+    /// 4) `9..12` would give `Colon`, `Newline`; the end offset 12 falls in the "gap"
+    /// 5) `18..27` would panic because both the start and end offsets fall within a token
     ///
     /// ## Note
     ///
     /// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
     /// error encountered within the given range.
+    ///
+    /// # Panics
+    ///
+    /// If either the start or end offset of the given range falls within a token's range.
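+    ///
+    /// ## Example
+    ///
+    /// An illustrative sketch (not a compiled doctest), assuming a [`Tokens`] value named
+    /// `tokens` that holds exactly the tokens from the table above:
+    ///
+    /// ```ignore
+    /// // Case (1): both offsets lie on token boundaries.
+    /// let slice = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
+    /// assert_eq!(slice.len(), 4); // Name, Lpar, Rpar, Colon
+    ///
+    /// // Case (3): the start offset 12 falls in the "gap" between `Newline` and `Comment`.
+    /// let slice = tokens.tokens_in_range(TextRange::new(12.into(), 25.into()));
+    /// assert_eq!(slice.len(), 2); // Comment, NonLogicalNewline
+    /// ```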
     pub fn tokens_in_range(&self, range: TextRange) -> &[Token] {
-        let Ok(start) = self.binary_search_by_key(&range.start(), Ranged::start) else {
-            return &[];
-        };
-        let Ok(end) = self[start..].binary_search_by_key(&range.end(), Ranged::end) else {
-            return &[];
-        };
-        &self[start..=start + end]
+        let tokens_after_start = self.after(range.start());
+        let end = tokens_after_start.partition_point(|token| token.end() <= range.end());
+        let result = &tokens_after_start[..end];
+
+        // If the end offset isn't the end of the last token in the slice...
+        if result.last().is_some_and(|last| last.end() != range.end()) {
+            // Panic if it's greater than the start offset of the next token. If it's equal to the
+            // start offset, then it's at a token boundary which is valid. If it's less than the
+            // start offset, then it's in the gap between the tokens which is valid as well.
+            if tokens_after_start
+                .get(end)
+                .is_some_and(|next| range.end() > next.start())
+            {
+                panic!("Offset cannot be in between a token")
+            }
+        }
+
+        result
     }
 
     /// Returns a slice of tokens after the given [`TextSize`] offset.
     ///
-    /// If the given offset lies within a token, the returned slice will start from the token after
-    /// that. In other words, the returned slice will not include the token containing the offset.
+    /// If the given offset is between two tokens, i.e., between the end of the previous token and
+    /// the start of the next token, the returned slice starts from the next token.
+    ///
+    /// # Panics
+    ///
+    /// If the given offset falls within a token's range.
     pub fn after(&self, offset: TextSize) -> &[Token] {
         let start = self.partition_point(|token| token.start() < offset);
-        &self[start..]
+        let result = &self[start..];
+
+        // If the offset isn't the start of the first token in the slice...
+        if result.first().is_some_and(|token| token.start() != offset) {
+            // Panic if it's less than the end offset of the previous token. If it's equal to the
+            // end offset, then it's at a token boundary which is valid. If it's greater than the
+            // end offset, then it's in the gap between the tokens which is valid as well.
+            if self
+                .get(start.saturating_sub(1))
+                .is_some_and(|prev| offset < prev.end())
+            {
+                panic!("Offset cannot be in between a token")
+            }
+        }
+
+        result
     }
 }
 
@@ -506,3 +552,145 @@ impl std::fmt::Display for ModeParseError {
         write!(f, r#"mode must be "exec", "eval", "ipython", or "single""#)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use crate::lexer::TokenFlags;
+
+    use super::*;
+
+    /// Test case containing a "gap" between two tokens.
+    ///
+    /// Code:
+    const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
+        (TokenKind::Def, 0..3),
+        (TokenKind::Name, 4..7),
+        (TokenKind::Lpar, 7..8),
+        (TokenKind::Rpar, 8..9),
+        (TokenKind::Colon, 9..10),
+        (TokenKind::Newline, 10..11),
+        // Gap between `Newline` and `Comment` (offsets 12, 13, and 14)
+        (TokenKind::Comment, 15..24),
+        (TokenKind::NonLogicalNewline, 24..25),
+        (TokenKind::Indent, 25..29),
+        (TokenKind::Pass, 29..33),
+        // No newline at the end to keep the token set full of unique tokens
+    ];
+
+    /// Test case containing [`TokenKind::Unknown`] token.
+    ///
+    /// Code:
+    const TEST_CASE_WITH_UNKNOWN: [(TokenKind, Range<u32>); 5] = [
+        (TokenKind::Name, 0..1),
+        (TokenKind::Equal, 2..3),
+        (TokenKind::Unknown, 4..11),
+        (TokenKind::Plus, 11..12),
+        (TokenKind::Int, 13..14),
+        // No newline at the end to keep the token set full of unique tokens
+    ];
+
+    /// Helper function to create [`Tokens`] from an iterator of (kind, range).
+    fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
+        Tokens::new(
+            tokens
+                .map(|(kind, range)| {
+                    Token::new(
+                        kind,
+                        TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
+                        TokenFlags::empty(),
+                    )
+                })
+                .collect(),
+        )
+    }
+
+    #[test]
+    fn test_tokens_up_to_first_unknown_empty() {
+        let tokens = Tokens::new(vec![]);
+        assert_eq!(tokens.up_to_first_unknown(), &[]);
+    }
+
+    #[test]
+    fn test_tokens_up_to_first_unknown() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        let up_to_first_unknown = tokens.up_to_first_unknown();
+        assert_eq!(up_to_first_unknown.len(), tokens.len());
+
+        let tokens = new_tokens(TEST_CASE_WITH_UNKNOWN.into_iter());
+        let up_to_first_unknown = tokens.up_to_first_unknown();
+        assert_eq!(up_to_first_unknown.len(), 2);
+    }
+
+    #[test]
+    fn test_tokens_after() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+
+        let after = tokens.after(TextSize::new(8));
+        assert_eq!(after.len(), 7);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
+
+        let after = tokens.after(TextSize::new(11));
+        assert_eq!(after.len(), 4);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
+
+        let after = tokens.after(TextSize::new(13));
+        assert_eq!(after.len(), 4);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
+
+        let after = tokens.after(TextSize::new(33));
+        assert_eq!(after.len(), 0);
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_after_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.after(TextSize::new(5));
+    }
+
+    #[test]
+    fn test_tokens_in_range() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+
+        let in_range = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
+        assert_eq!(in_range.len(), 4);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(11.into(), 29.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(9.into(), 13.into()));
+        assert_eq!(in_range.len(), 2);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(8.into(), 15.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(13.into(), 29.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_in_range_start_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.tokens_in_range(TextRange::new(5.into(), 10.into()));
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_in_range_end_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.tokens_in_range(TextRange::new(0.into(), 6.into()));
+    }
+}
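
A minimal usage sketch of the semantics this patch introduces. `parse_module` and the `tokens()` accessor on its result are assumed from the surrounding parser refactor (treat both names as assumptions at this exact commit); the source string is chosen so that its token offsets match the doc-comment table above:

```rust
use ruff_python_parser::{parse_module, TokenKind};
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Lexes to: Def 0..3, Name 4..7, ..., Newline 10..11, Comment 15..24,
    // NonLogicalNewline 24..25, Indent 25..29, Pass 29..33.
    let source = "def foo():\n    # comment\n    pass";
    let parsed = parse_module(source).expect("source should parse");
    let tokens = parsed.tokens();

    // Offset 11 is a token boundary (the end of `Newline`), so slicing out the
    // comment line is valid and yields `Comment` and `NonLogicalNewline`.
    let comment_line =
        tokens.tokens_in_range(TextRange::new(TextSize::new(11), TextSize::new(25)));
    assert_eq!(comment_line.len(), 2);

    // Offset 13 sits in the gap between `Newline` (10..11) and `Comment` (15..24);
    // the new semantics accept it and the slice starts from `Comment`.
    let after_gap = tokens.after(TextSize::new(13));
    assert_eq!(after_gap.first().map(|token| token.kind()), Some(TokenKind::Comment));

    // An offset inside a token, e.g. 5 (inside the `foo` name), would panic.
}
```

Both lookups stay `O(log n)` via `partition_point`; the added boundary checks inspect only a single neighbouring token, turning the old silent behaviour (an empty slice from `tokens_in_range`, a silently skipped token from `after`) into an explicit panic when an offset lands inside a token.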