No regex

astral-sh · Mar 26, 2023 · 2d1be26
1 parent 6ca18cc
commit 2d1be26
Show file tree

Hide file tree

Showing 6 changed files with 169 additions and 81 deletions.
diff --git a/crates/ruff/src/checkers/logical_lines.rs b/crates/ruff/src/checkers/logical_lines.rs
@@ -54,10 +54,8 @@ pub fn check_logical_lines(
         let indent_size = 4;
 
         if line.flags().contains(TokenFlags::OPERATOR) {
-            for (index, kind) in space_around_operator(line.text()) {
+            for (location, kind) in space_around_operator(line.tokens(), locator) {
                 if settings.rules.enabled(kind.rule()) {
-                    let (token_offset, pos) = line.mapping(index);
-                    let location = Location::new(pos.row(), pos.column() + index - token_offset);
                     diagnostics.push(Diagnostic {
                         kind,
                         location,
@@ -72,10 +70,8 @@ pub fn check_logical_lines(
             .flags()
             .contains(TokenFlags::OPERATOR | TokenFlags::PUNCTUATION)
         {
-            for (index, kind) in extraneous_whitespace(line.text()) {
+            for (location, kind) in extraneous_whitespace(line.tokens(), locator) {
                 if settings.rules.enabled(kind.rule()) {
-                    let (token_offset, pos) = line.mapping(index);
-                    let location = Location::new(pos.row(), pos.column() + index - token_offset);
                     diagnostics.push(Diagnostic {
                         kind,
                         location,
@@ -87,10 +83,8 @@ pub fn check_logical_lines(
             }
         }
         if line.flags().contains(TokenFlags::KEYWORD) {
-            for (index, kind) in whitespace_around_keywords(line.text()) {
+            for (location, kind) in whitespace_around_keywords(line.tokens(), locator) {
                 if settings.rules.enabled(kind.rule()) {
-                    let (token_offset, pos) = line.mapping(index);
-                    let location = Location::new(pos.row(), pos.column() + index - token_offset);
                     diagnostics.push(Diagnostic {
                         kind,
                         location,
@@ -127,9 +121,7 @@ pub fn check_logical_lines(
             }
         }
         if line.flags().contains(TokenFlags::OPERATOR) {
-            for (location, kind) in
-                whitespace_around_named_parameter_equals(line.tokens(), line.text())
-            {
+            for (location, kind) in whitespace_around_named_parameter_equals(line.tokens()) {
                 if settings.rules.enabled(kind.rule()) {
                     diagnostics.push(Diagnostic {
                         kind,

diff --git a/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs b/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
@@ -2,10 +2,14 @@
 
 use once_cell::sync::Lazy;
 use regex::Regex;
+use rustpython_parser::ast::Location;
+use rustpython_parser::Tok;
 
+use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
+use ruff_python_ast::source_code::Locator;
 
 /// ## What it does
 /// Checks for the use of extraneous whitespace after "(".
@@ -101,28 +105,55 @@ impl Violation for WhitespaceBeforePunctuation {
     }
 }
 
-// TODO(charlie): Pycodestyle has a negative lookahead on the end.
-static EXTRANEOUS_WHITESPACE_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"[\[({][ \t]|[ \t][]}),;:]").unwrap());
-
 /// E201, E202, E203
 #[cfg(feature = "logical_lines")]
-pub fn extraneous_whitespace(line: &str) -> Vec<(usize, DiagnosticKind)> {
+pub fn extraneous_whitespace(
+    tokens: &[(Location, &Tok, Location)],
+    locator: &Locator,
+) -> Vec<(Location, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in EXTRANEOUS_WHITESPACE_REGEX.find_iter(line) {
-        let text = &line[line_match.range()];
-        let char = text.trim();
-        let found = line_match.start();
-        if text.chars().last().unwrap().is_ascii_whitespace() {
-            diagnostics.push((found + 1, WhitespaceAfterOpenBracket.into()));
-        } else if line.chars().nth(found - 1).map_or(false, |c| c != ',') {
-            if char == "}" || char == "]" || char == ")" {
-                diagnostics.push((found, WhitespaceBeforeCloseBracket.into()));
-            } else {
-                diagnostics.push((found, WhitespaceBeforePunctuation.into()));
+    let mut last_token: Option<&Tok> = None;
+
+    for (start, token, end) in tokens {
+        match token {
+            Tok::Lbrace | Tok::Lpar | Tok::Lsqb => {
+                let after = &locator.contents()[locator.offset(*end)..];
+
+                if !matches!(Whitespace::leading(after), Whitespace::None) {
+                    diagnostics.push((
+                        Location::new(end.row(), end.column()),
+                        WhitespaceAfterOpenBracket.into(),
+                    ));
+                }
             }
+            Tok::Rbrace | Tok::Rpar | Tok::Rsqb | Tok::Comma | Tok::Semi | Tok::Colon => {
+                let before = &locator.contents()[..locator.offset(*start)];
+
+                let diagnostic_kind = if matches!(token, Tok::Comma | Tok::Semi | Tok::Colon) {
+                    DiagnosticKind::from(WhitespaceBeforePunctuation)
+                } else {
+                    DiagnosticKind::from(WhitespaceBeforeCloseBracket)
+                };
+
+                match Whitespace::trailing(before) {
+                    (Whitespace::None, _) => {}
+                    (_, offset) => {
+                        if !matches!(last_token, Some(Tok::Comma)) {
+                            diagnostics.push((
+                                Location::new(start.row(), start.column() - offset),
+                                diagnostic_kind,
+                            ));
+                        }
+                    }
+                }
+            }
+
+            _ => {}
         }
+
+        last_token = Some(token);
     }
+
     diagnostics
 }
 

diff --git a/crates/ruff/src/rules/pycodestyle/rules/mod.rs b/crates/ruff/src/rules/pycodestyle/rules/mod.rs
@@ -101,6 +101,8 @@ impl Whitespace {
         for c in content.chars() {
             if c == '\t' {
                 return Self::Tab;
+            } else if matches!(c, '\n' | '\r') {
+                break;
             } else if c.is_whitespace() {
                 count += 1;
             } else {
@@ -116,24 +118,25 @@ impl Whitespace {
     }
 
     fn trailing(content: &str) -> (Self, usize) {
-        let mut count = 0u32;
-        let mut offset = 0;
+        let mut count = 0;
 
         for c in content.chars().rev() {
             if c == '\t' {
-                return (Self::Tab, offset + 1);
+                return (Self::Tab, count + 1);
+            } else if matches!(c, '\n' | '\r') {
+                // Indent
+                return (Self::None, 0);
             } else if c.is_whitespace() {
                 count += 1;
-                offset += c.len_utf8();
             } else {
                 break;
             }
         }
 
         match count {
             0 => (Self::None, 0),
-            1 => (Self::Single, offset),
-            _ => (Self::Many, offset),
+            1 => (Self::Single, count),
+            _ => (Self::Many, count),
         }
     }
 }
diff --git a/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs b/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
@@ -5,12 +5,13 @@ use regex::Regex;
 use rustpython_parser::ast::Location;
 use rustpython_parser::Tok;
 
-use crate::rules::pycodestyle::helpers::is_op_token;
+use crate::rules::pycodestyle::helpers::{is_op_token, is_ws_needed_token};
 use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_ast::source_code::Locator;
+use ruff_python_ast::types::Range;
 
 /// ## What it does
 /// Checks for extraneous tabs before an operator.
@@ -128,37 +129,81 @@ impl Violation for MultipleSpacesAfterOperator {
     }
 }
 
-static OPERATOR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"[-+*/|!<=>%&^]+|:=").unwrap());
-
 /// E221, E222, E223, E224
 #[cfg(feature = "logical_lines")]
-pub fn space_around_operator(line: &str) -> Vec<(usize, DiagnosticKind)> {
+pub fn space_around_operator(
+    tokens: &[(Location, &Tok, Location)],
+    locator: &Locator,
+) -> Vec<(Location, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in OPERATOR_REGEX.find_iter(line) {
-        let before = &line[..line_match.start()];
-        match Whitespace::trailing(before) {
-            (Whitespace::Tab, offset) => {
-                diagnostics.push((line_match.start() - offset, TabBeforeOperator.into()))
+
+    for (start, token, end) in tokens {
+        if is_operator_token(token) {
+            let start_offset = locator.offset(*start);
+            let before = &locator.contents()[..start_offset];
+
+            match Whitespace::trailing(before) {
+                (Whitespace::Tab, offset) => diagnostics.push((
+                    Location::new(start.row(), start.column() - offset),
+                    TabBeforeOperator.into(),
+                )),
+                (Whitespace::Many, offset) => diagnostics.push((
+                    Location::new(start.row(), start.column() - offset),
+                    MultipleSpacesBeforeOperator.into(),
+                )),
+                _ => {}
             }
-            (Whitespace::Many, offset) => diagnostics.push((
-                line_match.start() - offset,
-                MultipleSpacesBeforeOperator.into(),
-            )),
-            _ => {}
-        }
 
-        let after = &line[line_match.end()..];
-        match Whitespace::leading(after) {
-            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterOperator.into())),
-            Whitespace::Many => {
-                diagnostics.push((line_match.end(), MultipleSpacesAfterOperator.into()))
+            let end_offset = locator.offset(*end);
+            let after = &locator.contents()[end_offset..];
+            match Whitespace::leading(after) {
+                Whitespace::Tab => diagnostics.push((*end, TabAfterOperator.into())),
+                Whitespace::Many => diagnostics.push((*end, MultipleSpacesAfterOperator.into())),
+                _ => {}
             }
-            _ => {}
         }
     }
+
     diagnostics
 }
 
+const fn is_operator_token(token: &Tok) -> bool {
+    matches!(
+        token,
+        Tok::Plus
+            | Tok::Minus
+            | Tok::Star
+            | Tok::Slash
+            | Tok::Vbar
+            | Tok::Amper
+            | Tok::Less
+            | Tok::Greater
+            | Tok::Equal
+            | Tok::Percent
+            | Tok::NotEqual
+            | Tok::LessEqual
+            | Tok::GreaterEqual
+            | Tok::CircumFlex
+            | Tok::LeftShift
+            | Tok::RightShift
+            | Tok::DoubleStar
+            | Tok::PlusEqual
+            | Tok::MinusEqual
+            | Tok::StarEqual
+            | Tok::SlashEqual
+            | Tok::PercentEqual
+            | Tok::AmperEqual
+            | Tok::VbarEqual
+            | Tok::CircumflexEqual
+            | Tok::LeftShiftEqual
+            | Tok::RightShiftEqual
+            | Tok::DoubleStarEqual
+            | Tok::DoubleSlash
+            | Tok::DoubleSlashEqual
+            | Tok::ColonEqual
+    )
+}
+
 #[cfg(not(feature = "logical_lines"))]
 pub fn space_around_operator(_line: &str) -> Vec<(usize, DiagnosticKind)> {
     vec![]

diff --git a/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs b/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
@@ -2,11 +2,15 @@
 
 use once_cell::sync::Lazy;
 use regex::Regex;
+use rustpython_parser::ast::Location;
+use rustpython_parser::Tok;
 
+use crate::rules::pycodestyle::helpers::is_keyword_token;
 use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
+use ruff_python_ast::source_code::Locator;
 
 /// ## What it does
 /// Checks for extraneous whitespace after keywords.
@@ -111,36 +115,41 @@ impl Violation for TabBeforeKeyword {
     }
 }
 
-static KEYWORD_REGEX: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"\b(False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b").unwrap()
-});
-
 /// E271, E272, E273, E274
 #[cfg(feature = "logical_lines")]
-pub fn whitespace_around_keywords(line: &str) -> Vec<(usize, DiagnosticKind)> {
+pub fn whitespace_around_keywords(
+    tokens: &[(Location, &Tok, Location)],
+    locator: &Locator,
+) -> Vec<(Location, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in KEYWORD_REGEX.find_iter(line) {
-        let before = &line[..line_match.start()];
-        match Whitespace::trailing(before) {
-            (Whitespace::Tab, offset) => {
-                diagnostics.push((line_match.start() - offset, TabBeforeKeyword.into()))
+
+    for (start, token, end) in tokens {
+        if is_keyword_token(token) {
+            let start_offset = locator.offset(*start);
+            let before = &locator.contents()[..start_offset];
+
+            match Whitespace::trailing(before) {
+                (Whitespace::Tab, offset) => diagnostics.push((
+                    Location::new(start.row(), start.column() - offset),
+                    TabBeforeKeyword.into(),
+                )),
+                (Whitespace::Many, offset) => diagnostics.push((
+                    Location::new(start.row(), start.column() - offset),
+                    MultipleSpacesBeforeKeyword.into(),
+                )),
+                _ => {}
             }
-            (Whitespace::Many, offset) => diagnostics.push((
-                line_match.start() - offset,
-                MultipleSpacesBeforeKeyword.into(),
-            )),
-            _ => {}
-        }
 
-        let after = &line[line_match.end()..];
-        match Whitespace::leading(after) {
-            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterKeyword.into())),
-            Whitespace::Many => {
-                diagnostics.push((line_match.end(), MultipleSpacesAfterKeyword.into()))
+            let end_offset = locator.offset(*end);
+            let after = &locator.contents()[end_offset..];
+            match Whitespace::leading(after) {
+                Whitespace::Tab => diagnostics.push((*end, TabAfterKeyword.into())),
+                Whitespace::Many => diagnostics.push((*end, MultipleSpacesAfterKeyword.into())),
+                _ => {}
             }
-            _ => {}
         }
     }
+
     diagnostics
 }