From 1bf62d1f98006429b1363bb7005dece47a8d4833 Mon Sep 17 00:00:00 2001
From: Alex Waygood
Date: Thu, 14 Mar 2024 17:26:04 +0000
Subject: [PATCH] Normalize unicode identifiers to NFKC in the lexer

---
 Cargo.lock                                    |  1 +
 Cargo.toml                                    |  1 +
 .../test/fixtures/pyflakes/F821_28.py         |  9 ++++
 crates/ruff_linter/src/rules/pyflakes/mod.rs  |  1 +
 ...les__pyflakes__tests__F821_F821_28.py.snap | 10 ++++
 crates/ruff_python_parser/Cargo.toml          |  1 +
 crates/ruff_python_parser/src/lexer.rs        | 47 ++++++++++++-------
 crates/ruff_python_parser/src/lexer/cursor.rs | 10 ----
 8 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
 create mode 100644 crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap

diff --git a/Cargo.lock b/Cargo.lock
index b8345646d05ace..0a72f59ee275c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2368,6 +2368,7 @@ dependencies = [
  "static_assertions",
  "tiny-keccak",
  "unicode-ident",
+ "unicode-normalization",
  "unicode_names2",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index d1de94534a0e71..c07df6dbc2ef37 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
diff --git a/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
new file mode 100644
index 00000000000000..2bdea407cbf0c8
--- /dev/null
+++ b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞)  # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟)  # F821
diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs
index aa08d9d32de65f..563c48422c1387 100644
--- a/crates/ruff_linter/src/rules/pyflakes/mod.rs
+++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
diff --git a/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
new file mode 100644
index 00000000000000..e8464267070eb8
--- /dev/null
+++ b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 |
+9 | print(𝒟)  # F821
+  |       ^ F821
+  |
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
index 886bb07fec0b6b..2ccf94a8b181fb 100644
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }
 
 [dev-dependencies]
 insta = { workspace = true }
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index bb6316eb641fa6..d43cf853059be4 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -29,9 +29,10 @@
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
 
 use std::iter::FusedIterator;
-use std::{char, cmp::Ordering, str::FromStr};
+use std::{borrow::Cow, char, cmp::Ordering, str::FromStr};
 
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;
 
 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -197,11 +198,37 @@ impl<'source> Lexer<'source> {
             _ => {}
         }
 
-        self.cursor.eat_while(is_identifier_continuation);
+        let mut is_ascii = first.is_ascii();
 
-        let text = self.token_text();
+        loop {
+            let c = self.cursor.first();
+            // Arrange things such that ASCII codepoints never
+            // result in the slower `is_xid_continue` getting called.
+            if c.is_ascii() {
+                if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') {
+                    break;
+                }
+            } else {
+                is_ascii = false;
+                if !is_xid_continue(c) {
+                    break;
+                }
+            }
+            if self.cursor.is_eof() {
+                break;
+            }
+            self.cursor.bump();
+        }
 
-        let keyword = match text {
+        let text = {
+            if is_ascii {
+                Cow::Borrowed(self.token_text())
+            } else {
+                Cow::Owned(self.token_text().nfkc().collect())
+            }
+        };
+
+        let keyword = match &*text {
             "False" => Tok::False,
             "None" => Tok::None,
             "True" => Tok::True,
@@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
-    // Arrange things such that ASCII codepoints never
-    // result in the slower `is_xid_continue` getting called.
-    if c.is_ascii() {
-        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
-    } else {
-        is_xid_continue(c)
-    }
-}
-
 /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
 /// characters.
 ///
diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs
index 6dd8e63d70ad89..eeac9b9228e709 100644
--- a/crates/ruff_python_parser/src/lexer/cursor.rs
+++ b/crates/ruff_python_parser/src/lexer/cursor.rs
@@ -119,16 +119,6 @@ impl<'a> Cursor<'a> {
         }
     }
 
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    #[inline]
-    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-
     /// Skips the next `count` bytes.
    ///
    /// ## Panics
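
As context for the lexer change above: Python's language reference specifies that identifiers are converted to NFKC normal form while parsing, so `𝒞`, `𝑪`, `𝓒`, and `𝕮` all denote the same name as plain `C`. The sketch below (not part of the commit) mirrors the borrow-or-normalize step the patch adds to the lexer's identifier handling, using the same `unicode-normalization` crate the patch pulls in; `normalize_identifier` is a hypothetical helper name chosen for illustration, not an API from the diff.

```rust
use std::borrow::Cow;

use unicode_normalization::UnicodeNormalization;

/// Illustrative sketch of the patch's fast path: pure-ASCII identifiers are
/// already in NFKC form, so they can be borrowed as-is; anything containing
/// a non-ASCII codepoint is normalized into a freshly allocated `String`.
fn normalize_identifier(text: &str) -> Cow<'_, str> {
    if text.is_ascii() {
        Cow::Borrowed(text)
    } else {
        Cow::Owned(text.nfkc().collect())
    }
}

fn main() {
    // U+1D49E MATHEMATICAL SCRIPT CAPITAL C and its bold/script/fraktur
    // relatives all NFKC-fold to plain `C`, which is why F821_28.py treats
    // `C`, `𝒞`, `𝑪`, `𝓒`, and `𝕮` as references to a single variable.
    for fancy in ["𝒞", "𝑪", "𝓒", "𝕮"] {
        assert_eq!(normalize_identifier(fancy), "C");
    }

    // The ASCII path allocates nothing.
    assert!(matches!(normalize_identifier("print"), Cow::Borrowed(_)));
}
```

The `Cow` is the point of the `is_ascii` bookkeeping in the diff: the overwhelmingly common ASCII case stays allocation-free, and only identifiers containing non-ASCII codepoints pay for the `.nfkc()` pass and the owned `String`.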