From 59e7f102a4d64f4d0e4c562d419e4071c17bcf87 Mon Sep 17 00:00:00 2001
From: Micha Reiser <micha@reiser.io>
Date: Sat, 25 Mar 2023 11:55:39 +0100
Subject: [PATCH] Avoid using Regex captures

---
 .../src/rules/pycodestyle/logical_lines.rs    | 13 ++--
 .../rules/extraneous_whitespace.rs            |  9 ++-
 .../ruff/src/rules/pycodestyle/rules/mod.rs   | 54 ++++++++++++++++
 .../rules/space_around_operator.rs            | 38 +++++++----
 .../rules/whitespace_around_keywords.rs       | 33 ++++++----
 ...ules__pycodestyle__tests__E221_E22.py.snap | 28 ++++++++
 ...ules__pycodestyle__tests__E272_E27.py.snap | 28 ++++++++
 ...ules__pycodestyle__tests__E274_E27.py.snap | 64 +++++++++++++++++--
 8 files changed, 227 insertions(+), 40 deletions(-)
diff --git a/crates/ruff/src/rules/pycodestyle/logical_lines.rs b/crates/ruff/src/rules/pycodestyle/logical_lines.rs
index 97b36ac9f42212..3c1e55da3b18e3 100644
--- a/crates/ruff/src/rules/pycodestyle/logical_lines.rs
+++ b/crates/ruff/src/rules/pycodestyle/logical_lines.rs
@@ -45,7 +45,8 @@ impl<'a> LogicalLines<'a> {
         assert!(u32::try_from(tokens.len()).is_ok());
 
         let single_token = tokens.len() == 1;
-        let mut builder = LogicalLinesBuilder::with_token_capacity(tokens.len());
+        let mut builder =
+            LogicalLinesBuilder::with_capacity(tokens.len(), locator.contents().len());
         let mut parens: u32 = 0;
 
         for (start, token, end) in tokens.iter().flatten() {
@@ -280,10 +281,11 @@ pub struct LogicalLinesBuilder<'a> {
 }
 
 impl<'a> LogicalLinesBuilder<'a> {
-    fn with_token_capacity(capacity: usize) -> Self {
+    fn with_capacity(tokens: usize, string: usize) -> Self {
         Self {
-            tokens: Vec::with_capacity(capacity),
-            mappings: Mappings::with_capacity(capacity + 1),
+            tokens: Vec::with_capacity(tokens),
+            mappings: Mappings::with_capacity(tokens + 1),
+            text: String::with_capacity(string),
             ..Self::default()
         }
     }
@@ -340,6 +342,9 @@ impl<'a> LogicalLinesBuilder<'a> {
 
         // TODO(charlie): "Mute" strings.
         let text = if let Tok::String { value, .. } = token {
+            // Replace the content of strings with a non-whitespace sequence because some lints
+            // search for whitespace in the document and whitespace inside of the string
+            // would complicate the search.
             Cow::Owned(format!("\"{}\"", "x".repeat(value.width())))
         } else {
             Cow::Borrowed(locator.slice(Range {
diff --git a/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs b/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
index 6cf15574577509..a7d8fba96d457c 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
@@ -103,17 +103,16 @@ impl Violation for WhitespaceBeforePunctuation {
 
 // TODO(charlie): Pycodestyle has a negative lookahead on the end.
 static EXTRANEOUS_WHITESPACE_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"([\[({][ \t]|[ \t][]}),;:])").unwrap());
+    Lazy::new(|| Regex::new(r"[\[({][ \t]|[ \t][]}),;:]").unwrap());
 
 /// E201, E202, E203
 #[cfg(feature = "logical_lines")]
 pub fn extraneous_whitespace(line: &str) -> Vec<(usize, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in EXTRANEOUS_WHITESPACE_REGEX.captures_iter(line) {
-        let match_ = line_match.get(1).unwrap();
-        let text = match_.as_str();
+    for line_match in EXTRANEOUS_WHITESPACE_REGEX.find_iter(line) {
+        let text = &line[line_match.range()];
         let char = text.trim();
-        let found = match_.start();
+        let found = line_match.start();
         if text.chars().last().unwrap().is_ascii_whitespace() {
             diagnostics.push((found + 1, WhitespaceAfterOpenBracket.into()));
         } else if line.chars().nth(found - 1).map_or(false, |c| c != ',') {
diff --git a/crates/ruff/src/rules/pycodestyle/rules/mod.rs b/crates/ruff/src/rules/pycodestyle/rules/mod.rs
index f36347fa4c8327..2e79852166483e 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/mod.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/mod.rs
@@ -86,3 +86,57 @@ mod whitespace_around_keywords;
 mod whitespace_around_named_parameter_equals;
 mod whitespace_before_comment;
 mod whitespace_before_parameters;
+
+#[allow(unused)] // NOTE(review): presumably needed because the callers are gated behind `feature = "logical_lines"` — confirm
+enum Whitespace { // Classification of the whitespace run at the start or end of a string slice.
+    None, // the run is empty
+    Single, // exactly one whitespace character, and it is not a tab
+    Many, // two or more whitespace characters, with no tab encountered
+    Tab, // a tab was encountered while scanning the run
+}
+
+impl Whitespace {
+    #[allow(dead_code)]
+    fn leading(content: &str) -> Self { // Classifies the whitespace run at the start of `content`.
+        let mut count = 0u32; // non-tab whitespace characters seen so far

+        for c in content.chars() {
+            if c == '\t' {
+                return Self::Tab; // a tab anywhere in the run takes precedence over Single/Many
+            } else if c.is_whitespace() {
+                count += 1;
+            } else {
+                break; // the first non-whitespace character ends the run
+            }
+        }

+        match count {
+            0 => Self::None,
+            1 => Self::Single,
+            _ => Self::Many,
+        }
+    }

+    #[allow(dead_code)]
+    fn trailing(content: &str) -> (Self, usize) { // Classifies the whitespace run at the end of `content`; the `usize` is a byte length measured back from the end.
+        let mut count = 0u32; // non-tab whitespace characters seen so far
+        let mut offset = 0; // bytes of whitespace scanned so far, counted from the end

+        for c in content.chars().rev() { // walk backwards from the end of `content`
+            if c == '\t' {
+                return (Self::Tab, offset + 1); // +1 is the tab's own byte; whitespace preceding this tab is not included
+            } else if c.is_whitespace() {
+                count += 1;
+                offset += c.len_utf8(); // byte length, because callers subtract the offset from a byte index
+            } else {
+                break; // the first non-whitespace character ends the run
+            }
+        }

+        match count {
+            0 => (Self::None, 0),
+            1 => (Self::Single, offset),
+            _ => (Self::Many, offset),
+        }
+    }
+}
diff --git a/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs b/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
index fedc6b69e2384d..08fff10c2217f3 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
@@ -2,10 +2,15 @@
 
 use once_cell::sync::Lazy;
 use regex::Regex;
+use rustpython_parser::ast::Location;
+use rustpython_parser::Tok;
 
+use crate::rules::pycodestyle::helpers::is_op_token;
+use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
+use ruff_python_ast::source_code::Locator;
 
 /// ## What it does
 /// Checks for extraneous tabs before an operator.
@@ -123,27 +128,32 @@ impl Violation for MultipleSpacesAfterOperator {
     }
 }
 
-static OPERATOR_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"[^,\s](\s*)(?:[-+*/|!<=>%&^]+|:=)(\s*)").unwrap());
+static OPERATOR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"[-+*/|!<=>%&^]+|:=").unwrap());
 
 /// E221, E222, E223, E224
 #[cfg(feature = "logical_lines")]
 pub fn space_around_operator(line: &str) -> Vec<(usize, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in OPERATOR_REGEX.captures_iter(line) {
-        let before = line_match.get(1).unwrap();
-        let after = line_match.get(2).unwrap();
-
-        if before.as_str().contains('\t') {
-            diagnostics.push((before.start(), TabBeforeOperator.into()));
-        } else if before.as_str().len() > 1 {
-            diagnostics.push((before.start(), MultipleSpacesBeforeOperator.into()));
+    for line_match in OPERATOR_REGEX.find_iter(line) {
+        let before = &line[..line_match.start()];
+        match Whitespace::trailing(before) {
+            (Whitespace::Tab, offset) => {
+                diagnostics.push((line_match.start() - offset, TabBeforeOperator.into()));
+            }
+            (Whitespace::Many, offset) => diagnostics.push((
+                line_match.start() - offset,
+                MultipleSpacesBeforeOperator.into(),
+            )),
+            _ => {}
         }
 
-        if after.as_str().contains('\t') {
-            diagnostics.push((after.start(), TabAfterOperator.into()));
-        } else if after.as_str().len() > 1 {
-            diagnostics.push((after.start(), MultipleSpacesAfterOperator.into()));
+        let after = &line[line_match.end()..];
+        match Whitespace::leading(after) {
+            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterOperator.into())),
+            Whitespace::Many => {
+                diagnostics.push((line_match.end(), MultipleSpacesAfterOperator.into()));
+            }
+            _ => {}
         }
     }
     diagnostics
diff --git a/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs b/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
index f7c056b917a6ef..1d3d16d11fb653 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
@@ -3,6 +3,7 @@
 use once_cell::sync::Lazy;
 use regex::Regex;
 
+use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
@@ -111,27 +112,33 @@ impl Violation for TabBeforeKeyword {
 }
 
 static KEYWORD_REGEX: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(\s*)\b(?:False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b(\s*)").unwrap()
+    Regex::new(r"\b(False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b").unwrap()
 });
 
 /// E271, E272, E273, E274
 #[cfg(feature = "logical_lines")]
 pub fn whitespace_around_keywords(line: &str) -> Vec<(usize, DiagnosticKind)> {
     let mut diagnostics = vec![];
-    for line_match in KEYWORD_REGEX.captures_iter(line) {
-        let before = line_match.get(1).unwrap();
-        let after = line_match.get(2).unwrap();
-
-        if before.as_str().contains('\t') {
-            diagnostics.push((before.start(), TabBeforeKeyword.into()));
-        } else if before.as_str().len() > 1 {
-            diagnostics.push((before.start(), MultipleSpacesBeforeKeyword.into()));
+    for line_match in KEYWORD_REGEX.find_iter(line) {
+        let before = &line[..line_match.start()];
+        match Whitespace::trailing(before) {
+            (Whitespace::Tab, offset) => {
+                diagnostics.push((line_match.start() - offset, TabBeforeKeyword.into()));
+            }
+            (Whitespace::Many, offset) => diagnostics.push((
+                line_match.start() - offset,
+                MultipleSpacesBeforeKeyword.into(),
+            )),
+            _ => {}
         }
 
-        if after.as_str().contains('\t') {
-            diagnostics.push((after.start(), TabAfterKeyword.into()));
-        } else if after.as_str().len() > 1 {
-            diagnostics.push((after.start(), MultipleSpacesAfterKeyword.into()));
+        let after = &line[line_match.end()..];
+        match Whitespace::leading(after) {
+            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterKeyword.into())),
+            Whitespace::Many => {
+                diagnostics.push((line_match.end(), MultipleSpacesAfterKeyword.into()));
+            }
+            _ => {}
         }
     }
     diagnostics
diff --git a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E221_E22.py.snap b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E221_E22.py.snap
index a980f134779b6c..9992378d62e7c9 100644
--- a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E221_E22.py.snap
+++ b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E221_E22.py.snap
@@ -114,4 +114,32 @@ expression: diagnostics
   fix:
     edits: []
   parent: ~
+- kind:
+    name: MultipleSpacesBeforeOperator
+    body: Multiple spaces before operator
+    suggestion: ~
+    fixable: false
+  location:
+    row: 31
+    column: 3
+  end_location:
+    row: 31
+    column: 3
+  fix:
+    edits: []
+  parent: ~
+- kind:
+    name: MultipleSpacesBeforeOperator
+    body: Multiple spaces before operator
+    suggestion: ~
+    fixable: false
+  location:
+    row: 32
+    column: 3
+  end_location:
+    row: 32
+    column: 3
+  fix:
+    edits: []
+  parent: ~
 
diff --git a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E272_E27.py.snap b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E272_E27.py.snap
index 7b082431d1ffee..f9cb57dfff6c0b 100644
--- a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E272_E27.py.snap
+++ b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E272_E27.py.snap
@@ -2,6 +2,34 @@
 source: crates/ruff/src/rules/pycodestyle/mod.rs
 expression: diagnostics
 ---
+- kind:
+    name: MultipleSpacesBeforeKeyword
+    body: Multiple spaces before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 4
+    column: 8
+  end_location:
+    row: 4
+    column: 8
+  fix:
+    edits: []
+  parent: ~
+- kind:
+    name: MultipleSpacesBeforeKeyword
+    body: Multiple spaces before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 6
+    column: 4
+  end_location:
+    row: 6
+    column: 4
+  fix:
+    edits: []
+  parent: ~
 - kind:
     name: MultipleSpacesBeforeKeyword
     body: Multiple spaces before keyword
diff --git a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E274_E27.py.snap b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E274_E27.py.snap
index 511063725035f3..73711d250914a8 100644
--- a/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E274_E27.py.snap
+++ b/crates/ruff/src/rules/pycodestyle/snapshots/ruff__rules__pycodestyle__tests__E274_E27.py.snap
@@ -2,6 +2,48 @@
 source: crates/ruff/src/rules/pycodestyle/mod.rs
 expression: diagnostics
 ---
+- kind:
+    name: TabBeforeKeyword
+    body: Tab before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 10
+    column: 9
+  end_location:
+    row: 10
+    column: 9
+  fix:
+    edits: []
+  parent: ~
+- kind:
+    name: TabBeforeKeyword
+    body: Tab before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 12
+    column: 5
+  end_location:
+    row: 12
+    column: 5
+  fix:
+    edits: []
+  parent: ~
+- kind:
+    name: TabBeforeKeyword
+    body: Tab before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 12
+    column: 9
+  end_location:
+    row: 12
+    column: 9
+  fix:
+    edits: []
+  parent: ~
 - kind:
     name: TabBeforeKeyword
     body: Tab before keyword
@@ -9,10 +51,24 @@ expression: diagnostics
     fixable: false
   location:
     row: 28
-    column: 1
+    column: 2
   end_location:
     row: 28
-    column: 1
+    column: 2
+  fix:
+    edits: []
+  parent: ~
+- kind:
+    name: TabBeforeKeyword
+    body: Tab before keyword
+    suggestion: ~
+    fixable: false
+  location:
+    row: 30
+    column: 5
+  end_location:
+    row: 30
+    column: 5
   fix:
     edits: []
   parent: ~
@@ -23,10 +79,10 @@ expression: diagnostics
     fixable: false
   location:
     row: 30
-    column: 4
+    column: 9
   end_location:
     row: 30
-    column: 4
+    column: 9
   fix:
     edits: []
   parent: ~