From 42531d65a811e58fc9c737cdeacb4ee69a9f716b Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala
Date: Thu, 30 May 2024 11:22:23 +0530
Subject: [PATCH] Consider "gap" between tokens for range query

---
 crates/ruff_python_parser/src/lexer.rs |   4 +-
 crates/ruff_python_parser/src/lib.rs   | 238 ++++++++++++++++++++++---
 2 files changed, 215 insertions(+), 27 deletions(-)

diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index b70d2f2639d3d3..64c9f942a3584f 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -1386,7 +1386,7 @@ impl<'src> Lexer<'src> {
 }
 
 bitflags! {
-    #[derive(Clone, Copy, Debug)]
+    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
     pub(crate) struct TokenFlags: u8 {
         /// The token is a string with double quotes (`"`).
         const DOUBLE_QUOTES = 1 << 0;
@@ -1468,7 +1468,7 @@ impl TokenFlags {
     }
 }
 
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct Token {
     /// The kind of the token.
     kind: TokenKind,
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index 5dfe22048372a5..819437c59cbd44 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -388,48 +388,94 @@ impl Tokens {
         &self.raw[..end]
     }
 
-    /// Returns a slice of the [`Token`] that are within the given `range`.
+    /// Returns a slice of [`Token`]s that are within the given `range`.
     ///
-    /// The start and end position of the given range should correspond to the start position of
-    /// the first token and the end position of the last token in the returned slice.
+    /// The start and end offset of the given range should each be either:
+    /// 1. At a token boundary
+    /// 2. In the gap between two tokens
     ///
     /// For example, considering the following tokens and their corresponding range:
     ///
-    /// ```txt
-    /// Def 0..3
-    /// Name 4..7
-    /// Lpar 7..8
-    /// Rpar 8..9
-    /// Colon 9..10
-    /// Ellipsis 11..14
-    /// Newline 14..14
-    /// ```
+    /// | Token               | Range    |
+    /// |---------------------|----------|
+    /// | `Def`               | `0..3`   |
+    /// | `Name`              | `4..7`   |
+    /// | `Lpar`              | `7..8`   |
+    /// | `Rpar`              | `8..9`   |
+    /// | `Colon`             | `9..10`  |
+    /// | `Newline`           | `10..11` |
+    /// | `Comment`           | `15..24` |
+    /// | `NonLogicalNewline` | `24..25` |
+    /// | `Indent`            | `25..29` |
+    /// | `Pass`              | `29..33` |
     ///
-    /// The range `4..10` would return a slice of `Name`, `Lpar`, `Rpar`, and `Colon` tokens. But,
-    /// if either the start or end position of the given range doesn't match any of the tokens
-    /// (like `5..10` or `4..12`), the returned slice will be empty.
+    /// Here, for (1) a token boundary is the start or end offset of any of the above tokens.
+    /// For (2), the gap is any offset strictly between the `Newline` and `Comment` tokens,
+    /// namely 12, 13, and 14.
+    ///
+    /// Examples:
+    /// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
+    /// 2) `11..25` would give `Comment`, `NonLogicalNewline`
+    /// 3) `12..25` would give the same as (2); the start offset 12 falls in the "gap"
+    /// 4) `9..12` would give `Colon`, `Newline`; the end offset 12 falls in the "gap"
+    /// 5) `18..27` would panic because both the start and end offsets fall within a token
     ///
     /// ## Note
     ///
     /// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
     /// error encountered within the given range.
+    ///
+    /// # Panics
+    ///
+    /// If either the start or end offset of the given range falls within a token's range.
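+    ///
+    /// ## Example
+    ///
+    /// An illustrative sketch (not a compiled doctest), assuming a [`Tokens`] value named
+    /// `tokens` that holds exactly the tokens from the table above:
+    ///
+    /// ```ignore
+    /// // Case (1): both offsets lie on token boundaries.
+    /// let slice = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
+    /// assert_eq!(slice.len(), 4); // Name, Lpar, Rpar, Colon
+    ///
+    /// // Case (3): the start offset 12 falls in the "gap" between `Newline` and `Comment`.
+    /// let slice = tokens.tokens_in_range(TextRange::new(12.into(), 25.into()));
+    /// assert_eq!(slice.len(), 2); // Comment, NonLogicalNewline
+    /// ```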
     pub fn tokens_in_range(&self, range: TextRange) -> &[Token] {
-        let Ok(start) = self.binary_search_by_key(&range.start(), Ranged::start) else {
-            return &[];
-        };
-        let Ok(end) = self[start..].binary_search_by_key(&range.end(), Ranged::end) else {
-            return &[];
-        };
-        &self[start..=start + end]
+        let tokens_after_start = self.after(range.start());
+        let end = tokens_after_start.partition_point(|token| token.end() <= range.end());
+        let result = &tokens_after_start[..end];
+
+        // If the end offset isn't the end of the last token in the slice...
+        if result.last().is_some_and(|last| last.end() != range.end()) {
+            // Panic if it's greater than the start offset of the next token. If it's equal to the
+            // start offset, then it's at a token boundary which is valid. If it's less than the
+            // start offset, then it's in the gap between the tokens which is valid as well.
+            if tokens_after_start
+                .get(end)
+                .is_some_and(|next| range.end() > next.start())
+            {
+                panic!("Offset cannot be in between a token")
+            }
+        }
+
+        result
     }
 
     /// Returns a slice of tokens after the given [`TextSize`] offset.
     ///
-    /// If the given offset lies within a token, the returned slice will start from the token after
-    /// that. In other words, the returned slice will not include the token containing the offset.
+    /// If the given offset is between two tokens, i.e., between the end of the previous token and
+    /// the start of the next token, the returned slice starts from the next token.
+    ///
+    /// # Panics
+    ///
+    /// If the given offset falls within a token's range.
     pub fn after(&self, offset: TextSize) -> &[Token] {
         let start = self.partition_point(|token| token.start() < offset);
-        &self[start..]
+        let result = &self[start..];
+
+        // If the offset isn't the start of the first token in the slice...
+        if result.first().is_some_and(|token| token.start() != offset) {
+            // Panic if it's less than the end offset of the previous token. If it's equal to the
+            // end offset, then it's at a token boundary which is valid. If it's greater than the
+            // end offset, then it's in the gap between the tokens which is valid as well.
+            if self
+                .get(start.saturating_sub(1))
+                .is_some_and(|prev| offset < prev.end())
+            {
+                panic!("Offset cannot be in between a token")
+            }
+        }
+
+        result
     }
 }
 
@@ -506,3 +552,145 @@ impl std::fmt::Display for ModeParseError {
         write!(f, r#"mode must be "exec", "eval", "ipython", or "single""#)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use crate::lexer::TokenFlags;
+
+    use super::*;
+
+    /// Test case containing a "gap" between two tokens.
+    ///
+    /// Code:
+    const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
+        (TokenKind::Def, 0..3),
+        (TokenKind::Name, 4..7),
+        (TokenKind::Lpar, 7..8),
+        (TokenKind::Rpar, 8..9),
+        (TokenKind::Colon, 9..10),
+        (TokenKind::Newline, 10..11),
+        // Gap between `Newline` and `Comment` (offsets 12, 13, and 14)
+        (TokenKind::Comment, 15..24),
+        (TokenKind::NonLogicalNewline, 24..25),
+        (TokenKind::Indent, 25..29),
+        (TokenKind::Pass, 29..33),
+        // No newline at the end to keep the token set full of unique tokens
+    ];
+
+    /// Test case containing [`TokenKind::Unknown`] token.
+    ///
+    /// Code:
+    const TEST_CASE_WITH_UNKNOWN: [(TokenKind, Range<u32>); 5] = [
+        (TokenKind::Name, 0..1),
+        (TokenKind::Equal, 2..3),
+        (TokenKind::Unknown, 4..11),
+        (TokenKind::Plus, 11..12),
+        (TokenKind::Int, 13..14),
+        // No newline at the end to keep the token set full of unique tokens
+    ];
+
+    /// Helper function to create [`Tokens`] from an iterator of (kind, range).
+    fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
+        Tokens::new(
+            tokens
+                .map(|(kind, range)| {
+                    Token::new(
+                        kind,
+                        TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
+                        TokenFlags::empty(),
+                    )
+                })
+                .collect(),
+        )
+    }
+
+    #[test]
+    fn test_tokens_up_to_first_unknown_empty() {
+        let tokens = Tokens::new(vec![]);
+        assert_eq!(tokens.up_to_first_unknown(), &[]);
+    }
+
+    #[test]
+    fn test_tokens_up_to_first_unknown() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        let up_to_first_unknown = tokens.up_to_first_unknown();
+        assert_eq!(up_to_first_unknown.len(), tokens.len());
+
+        let tokens = new_tokens(TEST_CASE_WITH_UNKNOWN.into_iter());
+        let up_to_first_unknown = tokens.up_to_first_unknown();
+        assert_eq!(up_to_first_unknown.len(), 2);
+    }
+
+    #[test]
+    fn test_tokens_after() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+
+        let after = tokens.after(TextSize::new(8));
+        assert_eq!(after.len(), 7);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
+
+        let after = tokens.after(TextSize::new(11));
+        assert_eq!(after.len(), 4);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
+
+        let after = tokens.after(TextSize::new(13));
+        assert_eq!(after.len(), 4);
+        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
+
+        let after = tokens.after(TextSize::new(33));
+        assert_eq!(after.len(), 0);
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_after_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.after(TextSize::new(5));
+    }
+
+    #[test]
+    fn test_tokens_in_range() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+
+        let in_range = tokens.tokens_in_range(TextRange::new(4.into(), 10.into()));
+        assert_eq!(in_range.len(), 4);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(11.into(), 29.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(9.into(), 13.into()));
+        assert_eq!(in_range.len(), 2);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(8.into(), 15.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
+
+        let in_range = tokens.tokens_in_range(TextRange::new(13.into(), 29.into()));
+        assert_eq!(in_range.len(), 3);
+        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
+        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_in_range_start_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.tokens_in_range(TextRange::new(5.into(), 10.into()));
+    }
+
+    #[test]
+    #[should_panic(expected = "Offset cannot be in between a token")]
+    fn test_tokens_in_range_end_panic() {
+        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
+        tokens.tokens_in_range(TextRange::new(0.into(), 6.into()));
+    }
+}
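
A minimal usage sketch of the semantics this patch introduces. `parse_module` and the `tokens()` accessor on its result are assumed from the surrounding parser refactor (treat both names as assumptions at this exact commit); the source string is chosen so that its token offsets match the doc-comment table above:

```rust
use ruff_python_parser::{parse_module, TokenKind};
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Lexes to: Def 0..3, Name 4..7, ..., Newline 10..11, Comment 15..24,
    // NonLogicalNewline 24..25, Indent 25..29, Pass 29..33.
    let source = "def foo():\n    # comment\n    pass";
    let parsed = parse_module(source).expect("source should parse");
    let tokens = parsed.tokens();

    // Offset 11 is a token boundary (the end of `Newline`), so slicing out the
    // comment line is valid and yields `Comment` and `NonLogicalNewline`.
    let comment_line =
        tokens.tokens_in_range(TextRange::new(TextSize::new(11), TextSize::new(25)));
    assert_eq!(comment_line.len(), 2);

    // Offset 13 sits in the gap between `Newline` (10..11) and `Comment` (15..24);
    // the new semantics accept it and the slice starts from `Comment`.
    let after_gap = tokens.after(TextSize::new(13));
    assert_eq!(after_gap.first().map(|token| token.kind()), Some(TokenKind::Comment));

    // An offset inside a token, e.g. 5 (inside the `foo` name), would panic.
}
```

Both lookups stay `O(log n)` via `partition_point`; the added boundary checks inspect only a single neighbouring token, turning the old silent behaviour (an empty slice from `tokens_in_range`, a silently skipped token from `after`) into an explicit panic when an offset lands inside a token.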