From 0d02da182c92129c980ec7697192634d647635d1 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Thu, 14 Mar 2024 17:26:04 +0000 Subject: [PATCH 1/6] fix the bug --- Cargo.lock | 1 + Cargo.toml | 1 + .../test/fixtures/pyflakes/F821_28.py | 9 ++++ crates/ruff_linter/src/rules/pyflakes/mod.rs | 1 + ...les__pyflakes__tests__F821_F821_28.py.snap | 10 ++++ crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 47 ++++++++++++------- crates/ruff_python_parser/src/lexer/cursor.rs | 10 ---- 8 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py create mode 100644 crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap diff --git a/Cargo.lock b/Cargo.lock index b8345646d05ace..0a72f59ee275c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2368,6 +2368,7 @@ dependencies = [ "static_assertions", "tiny-keccak", "unicode-ident", + "unicode-normalization", "unicode_names2", ] diff --git a/Cargo.toml b/Cargo.toml index d1de94534a0e71..c07df6dbc2ef37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" } unicode-ident = { version = "1.0.12" } unicode-width = { version = "0.1.11" } unicode_names2 = { version = "1.2.2" } +unicode-normalization = { version = "0.1.23" } ureq = { version = "2.9.6" } url = { version = "2.5.0" } uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] } diff --git a/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py new file mode 100644 index 00000000000000..2bdea407cbf0c8 --- /dev/null +++ b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py @@ -0,0 +1,9 @@ +"""Test that unicode identifiers are NFKC-normalised""" + +𝒞 = 500 +print(𝒞) +print(C + 𝒞) # 2 references to the same variable due to NFKC normalization +print(C / 𝒞) +print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮) + +print(𝒟) # F821 diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs index aa08d9d32de65f..563c48422c1387 100644 --- a/crates/ruff_linter/src/rules/pyflakes/mod.rs +++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs @@ -156,6 +156,7 @@ mod tests { #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))] #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))] #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))] + #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))] #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))] #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))] #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))] diff --git a/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap new file mode 100644 index 00000000000000..e8464267070eb8 --- /dev/null +++ b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap @@ -0,0 +1,10 @@ +--- +source: crates/ruff_linter/src/rules/pyflakes/mod.rs +--- +F821_28.py:9:7: F821 Undefined name `𝒟` + | +7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮) +8 | +9 | print(𝒟) # F821 + | ^ F821 + | diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 886bb07fec0b6b..2ccf94a8b181fb 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -28,6 +28,7 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } +unicode-normalization = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index bb6316eb641fa6..a139855bfb0c09 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -29,9 +29,10 @@ //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html use std::iter::FusedIterator; -use std::{char, cmp::Ordering, str::FromStr}; +use std::{borrow::Cow, char, cmp::Ordering, str::FromStr}; use unicode_ident::{is_xid_continue, is_xid_start}; +use unicode_normalization::UnicodeNormalization; use ruff_python_ast::{Int, IpyEscapeKind}; use ruff_text_size::{TextLen, TextRange, TextSize}; @@ -197,11 +198,37 @@ impl<'source> Lexer<'source> { _ => {} } - self.cursor.eat_while(is_identifier_continuation); + let mut is_ascii = first.is_ascii(); - let text = self.token_text(); + loop { + let c = self.cursor.first(); + // Arrange things such that ASCII codepoints never + // result in the slower `is_xid_continue` getting called. + if c.is_ascii() { + if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') { + break; + } + } else { + if !is_xid_continue(c) { + break; + } + is_ascii = false; + } + if self.cursor.is_eof() { + break; + } + self.cursor.bump(); + } - let keyword = match text { + let text = { + if is_ascii { + Cow::Borrowed(self.token_text()) + } else { + Cow::Owned(self.token_text().nfkc().collect()) + } + }; + + let keyword = match &*text { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } -// Checks if the character c is a valid continuation character as described -// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers -fn is_identifier_continuation(c: char) -> bool { - // Arrange things such that ASCII codepoints never - // result in the slower `is_xid_continue` getting called. - if c.is_ascii() { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') - } else { - is_xid_continue(c) - } -} - /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens) /// characters. /// diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index 6dd8e63d70ad89..eeac9b9228e709 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -119,16 +119,6 @@ impl<'a> Cursor<'a> { } } - /// Eats symbols while predicate returns true or until the end of file is reached. - #[inline] - pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - // It was tried making optimized version of this for eg. line comments, but - // LLVM can inline all of this and compile it down to fast iteration over bytes. - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } - /// Skips the next `count` bytes. /// /// ## Panics From a8a9863dfa6beba7156a2a95e393dbd32a945756 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Thu, 14 Mar 2024 17:31:18 +0000 Subject: [PATCH 2/6] remove a debug assertion that no longer passes --- .../src/expression/expr_name.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/ruff_python_formatter/src/expression/expr_name.rs b/crates/ruff_python_formatter/src/expression/expr_name.rs index f2014f6771f766..68cf5af95798e8 100644 --- a/crates/ruff_python_formatter/src/expression/expr_name.rs +++ b/crates/ruff_python_formatter/src/expression/expr_name.rs @@ -1,4 +1,4 @@ -use ruff_formatter::{write, FormatContext}; +use ruff_formatter::write; use ruff_python_ast::AnyNodeRef; use ruff_python_ast::ExprName; @@ -11,16 +11,11 @@ pub struct FormatExprName; impl FormatNodeRule for FormatExprName { fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> { - let ExprName { id, range, ctx: _ } = item; - - debug_assert_eq!( - id.as_str(), - f.context() - .source_code() - .slice(*range) - .text(f.context().source_code()) - ); - + let ExprName { + id: _, + range, + ctx: _, + } = item; write!(f, [source_text_slice(*range)]) } From 1646f0fb4a711614e5e1027472f56a1607a957d1 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 15:44:13 +0000 Subject: [PATCH 3/6] optimize a little bit --- crates/ruff_python_parser/src/lexer.rs | 31 ++++++++++++-------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index a139855bfb0c09..c3a42c0a1c05d8 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -29,7 +29,7 @@ //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html use std::iter::FusedIterator; -use std::{borrow::Cow, char, cmp::Ordering, str::FromStr}; +use std::{char, cmp::Ordering, str::FromStr}; use unicode_ident::{is_xid_continue, is_xid_start}; use unicode_normalization::UnicodeNormalization; @@ -170,7 +170,7 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self, first: char) -> Result { + fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -198,7 +198,7 @@ impl<'source> Lexer<'source> { _ => {} } - let mut is_ascii = first.is_ascii(); + let mut is_ascii = ascii_first_char; loop { let c = self.cursor.first(); @@ -220,15 +220,7 @@ impl<'source> Lexer<'source> { self.cursor.bump(); } - let text = { - if is_ascii { - Cow::Borrowed(self.token_text()) - } else { - Cow::Owned(self.token_text().nfkc().collect()) - } - }; - - let keyword = match &*text { + let keyword = match self.token_text() { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -267,10 +259,15 @@ impl<'source> Lexer<'source> { "while" => Tok::While, "with" => Tok::With, "yield" => Tok::Yield, - _ => { + text => { + let name = if is_ascii { + text.to_string() + } else { + text.nfkc().collect() + }; return Ok(Tok::Name { - name: text.to_string().into_boxed_str(), - }) + name: name.into_boxed_str(), + }); } }; @@ -933,7 +930,7 @@ impl<'source> Lexer<'source> { if c.is_ascii() { self.consume_ascii_character(c) } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c)?; + let identifier = self.lex_identifier(c, false)?; self.state = State::Other; Ok((identifier, self.token_range())) @@ -1093,7 +1090,7 @@ impl<'source> Lexer<'source> { // Dispatch based on the given character. fn consume_ascii_character(&mut self, c: char) -> Result { let token = match c { - c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, + c if is_ascii_identifier_start(c) => self.lex_identifier(c, true)?, '0'..='9' => self.lex_number(c)?, '#' => return Ok((self.lex_comment(), self.token_range())), '\'' | '"' => self.lex_string(None, c)?, From d9a68bad8afcb43bc9e0a62f409de7965247a276 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 18:05:00 +0000 Subject: [PATCH 4/6] Optimize more --- crates/ruff_python_parser/src/lexer.rs | 35 ++++++++----------- crates/ruff_python_parser/src/lexer/cursor.rs | 10 ++++++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index c3a42c0a1c05d8..4f42bd8c5670f4 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -199,26 +199,8 @@ impl<'source> Lexer<'source> { } let mut is_ascii = ascii_first_char; - - loop { - let c = self.cursor.first(); - // Arrange things such that ASCII codepoints never - // result in the slower `is_xid_continue` getting called. - if c.is_ascii() { - if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') { - break; - } - } else { - if !is_xid_continue(c) { - break; - } - is_ascii = false; - } - if self.cursor.is_eof() { - break; - } - self.cursor.bump(); - } + self.cursor + .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); let keyword = match self.token_text() { "False" => Tok::False, @@ -1607,6 +1589,19 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } +// Checks if the character c is a valid continuation character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool { + // Arrange things such that ASCII codepoints never + // result in the slower `is_xid_continue` getting called. + if c.is_ascii() { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') + } else { + *ascii_only_identifier = false; + is_xid_continue(c) + } +} + /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens) /// characters. /// diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index eeac9b9228e709..6dd8e63d70ad89 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -119,6 +119,16 @@ impl<'a> Cursor<'a> { } } + /// Eats symbols while predicate returns true or until the end of file is reached. + #[inline] + pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } + /// Skips the next `count` bytes. /// /// ## Panics From d785be50606df110d848606165e2f7a9aa406835 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 18:41:35 +0000 Subject: [PATCH 5/6] docs and better naming --- crates/ruff_python_parser/src/lexer.rs | 27 ++++++++++++++++++++------ crates/ruff_python_parser/src/token.rs | 3 +++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 4f42bd8c5670f4..b8bd7df29b7dd2 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -170,7 +170,11 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result { + fn lex_identifier( + &mut self, + first: char, + first_char_is_ascii: bool, + ) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -198,7 +202,14 @@ impl<'source> Lexer<'source> { _ => {} } - let mut is_ascii = ascii_first_char; + // Keep track of whether the identifier is ASCII-only or not. + // + // This is important because Python applies NFKC normalization to + // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. + // We need to therefore do the same in our lexer, but applying NFKC normalization + // unconditionally is extremely expensive. If we know an identifier is ASCII-only, + // (by far the most common case), we can skip NFKC normalization of the identifier. + let mut is_ascii = first_char_is_ascii; self.cursor .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); @@ -1589,15 +1600,19 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } -// Checks if the character c is a valid continuation character as described -// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers -fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool { +/// Checks if the character c is a valid continuation character as described +/// in . +/// +/// Additionally, this function also keeps track of whether or not the total +/// identifier is ASCII-only or not by mutably altering a reference to a +/// boolean value passed in. +fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool { // Arrange things such that ASCII codepoints never // result in the slower `is_xid_continue` getting called. if c.is_ascii() { matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') } else { - *ascii_only_identifier = false; + *identifier_is_ascii_only = false; is_xid_continue(c) } } diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index 84080c1b8cad07..892915718fa1d3 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -16,6 +16,9 @@ pub enum Tok { /// Token value for a name, commonly known as an identifier. Name { /// The name value. + /// + /// Unicode names are NFKC-normalized by the lexer, + /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers) name: Box, }, /// Token value for an integer. From 35500ab44ada0d9cd8f1f9667ca78bf61dc85b10 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Mon, 18 Mar 2024 11:36:46 +0000 Subject: [PATCH 6/6] address Micha's review --- crates/ruff_python_parser/src/lexer.rs | 44 ++++++++++++++++---------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index b8bd7df29b7dd2..e803c0263b1085 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -170,11 +170,7 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier( - &mut self, - first: char, - first_char_is_ascii: bool, - ) -> Result { + fn lex_identifier(&mut self, first: char) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -209,11 +205,19 @@ impl<'source> Lexer<'source> { // We need to therefore do the same in our lexer, but applying NFKC normalization // unconditionally is extremely expensive. If we know an identifier is ASCII-only, // (by far the most common case), we can skip NFKC normalization of the identifier. - let mut is_ascii = first_char_is_ascii; + let mut is_ascii = first.is_ascii(); self.cursor .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); - let keyword = match self.token_text() { + let text = self.token_text(); + + if !is_ascii { + return Ok(Tok::Name { + name: text.nfkc().collect::().into_boxed_str(), + }); + } + + let keyword = match text { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -252,15 +256,10 @@ impl<'source> Lexer<'source> { "while" => Tok::While, "with" => Tok::With, "yield" => Tok::Yield, - text => { - let name = if is_ascii { - text.to_string() - } else { - text.nfkc().collect() - }; + _ => { return Ok(Tok::Name { - name: name.into_boxed_str(), - }); + name: text.to_string().into_boxed_str(), + }) } }; @@ -923,7 +922,7 @@ impl<'source> Lexer<'source> { if c.is_ascii() { self.consume_ascii_character(c) } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c, false)?; + let identifier = self.lex_identifier(c)?; self.state = State::Other; Ok((identifier, self.token_range())) @@ -1083,7 +1082,7 @@ impl<'source> Lexer<'source> { // Dispatch based on the given character. fn consume_ascii_character(&mut self, c: char) -> Result { let token = match c { - c if is_ascii_identifier_start(c) => self.lex_identifier(c, true)?, + c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, '0'..='9' => self.lex_number(c)?, '#' => return Ok((self.lex_comment(), self.token_range())), '\'' | '"' => self.lex_string(None, c)?, @@ -2064,6 +2063,17 @@ def f(arg=%timeit a = b): assert_debug_snapshot!(lex_source(source)); } + fn get_tokens_only(source: &str) -> Vec { + lex_source(source).into_iter().map(|(tok, _)| tok).collect() + } + + #[test] + fn test_nfkc_normalization() { + let source1 = "𝒞 = 500"; + let source2 = "C = 500"; + assert_eq!(get_tokens_only(source1), get_tokens_only(source2)); + } + fn triple_quoted_eol(eol: &str) -> Vec { let source = format!("\"\"\"{eol} test string{eol} \"\"\""); lex_source(&source)