From 2567e14b7a550074c37fc5695e85510b5120a37a Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Tue, 4 Jun 2024 14:15:46 +0530 Subject: [PATCH] Lexer should consider BOM for the start offset (#11732) ## Summary This PR fixes a bug where the lexer didn't take the BOM into account for the start offset. fixes: #11731 ## Test Plan Add multiple lexer test cases that involve a BOM character in the source and verify the snapshots. --- crates/ruff_python_parser/src/lexer.rs | 55 ++++++++++++++----- ...ruff_python_parser__lexer__tests__bom.snap | 29 ++++++++++ ...parser__lexer__tests__bom_with_offset.snap | 29 ++++++++++ ...r__lexer__tests__bom_with_offset_edge.snap | 19 +++++++ 4 files changed, 117 insertions(+), 15 deletions(-) create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset_edge.snap diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index aa4eebdc683a2..1dc686f5af76e 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -30,6 +30,8 @@ mod cursor; mod fstring; mod indentation; +const BOM: char = '\u{feff}'; + /// A lexer for Python source code. #[derive(Debug)] pub struct Lexer<'src> { @@ -100,11 +102,10 @@ impl<'src> Lexer<'src> { errors: Vec::new(), }; - // TODO: Handle possible mismatch between BOM and explicit encoding declaration. - // spell-checker:ignore feff - lexer.cursor.eat_char('\u{feff}'); - - if start_offset > TextSize::new(0) { + if start_offset == TextSize::new(0) { + // TODO: Handle possible mismatch between BOM and explicit encoding declaration. 
+ lexer.cursor.eat_char(BOM); + } else { lexer.cursor.skip_bytes(start_offset.to_usize()); } @@ -1922,8 +1923,8 @@ mod tests { } } - fn lex(source: &str, mode: Mode) -> LexerOutput { - let mut lexer = Lexer::new(source, mode, TextSize::default()); + fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput { + let mut lexer = Lexer::new(source, mode, start_offset); let mut tokens = Vec::new(); loop { let kind = lexer.next_token(); @@ -1943,8 +1944,8 @@ mod tests { } } - fn lex_valid(source: &str, mode: Mode) -> LexerOutput { - let output = lex(source, mode); + fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput { + let output = lex(source, mode, start_offset); if !output.errors.is_empty() { let mut message = "Unexpected lexical errors for a valid source:\n".to_string(); @@ -1959,7 +1960,7 @@ mod tests { } fn lex_invalid(source: &str, mode: Mode) -> LexerOutput { - let output = lex(source, mode); + let output = lex(source, mode, TextSize::default()); assert!( !output.errors.is_empty(), @@ -1970,11 +1971,35 @@ mod tests { } fn lex_source(source: &str) -> LexerOutput { - lex_valid(source, Mode::Module) + lex_valid(source, Mode::Module, TextSize::default()) + } + + fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput { + lex_valid(source, Mode::Module, start_offset) } fn lex_jupyter_source(source: &str) -> LexerOutput { - lex_valid(source, Mode::Ipython) + lex_valid(source, Mode::Ipython, TextSize::default()) + } + + #[test] + fn bom() { + let source = "\u{feff}x = 1"; + assert_snapshot!(lex_source(source)); + } + + #[test] + fn bom_with_offset() { + let source = "\u{feff}x + y + z"; + assert_snapshot!(lex_source_with_offset(source, TextSize::new(7))); + } + + #[test] + fn bom_with_offset_edge() { + // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z) + // doesn't panic. 
Refer https://github.com/astral-sh/ruff/issues/11731 + let source = "\u{feff}x + y + z"; + assert_snapshot!(lex_source_with_offset(source, TextSize::new(11))); } fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput { @@ -2118,7 +2143,7 @@ foo = ,func def f(arg=%timeit a = b): pass" .trim(); - let output = lex(source, Mode::Ipython); + let output = lex(source, Mode::Ipython, TextSize::default()); assert!(output.errors.is_empty()); assert_no_ipython_escape_command(&output.tokens); } @@ -2351,7 +2376,7 @@ if first: } fn get_tokens_only(source: &str) -> Vec { - let output = lex(source, Mode::Module); + let output = lex(source, Mode::Module, TextSize::default()); assert!(output.errors.is_empty()); output.tokens.into_iter().map(|token| token.kind).collect() } @@ -2593,7 +2618,7 @@ f"{(lambda x:{x})}" } fn lex_fstring_error(source: &str) -> FStringErrorType { - let output = lex(source, Mode::Module); + let output = lex(source, Mode::Module, TextSize::default()); match output .errors .into_iter() diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom.snap new file mode 100644 index 0000000000000..ea400d2e3b47c --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom.snap @@ -0,0 +1,29 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +## Tokens +``` +[ + ( + Name( + "x", + ), + 3..4, + ), + ( + Equal, + 5..6, + ), + ( + Int( + 1, + ), + 7..8, + ), + ( + Newline, + 8..8, + ), +] +``` diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset.snap new file mode 100644 index 0000000000000..9ae6aaa3cfa24 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset.snap 
@@ -0,0 +1,29 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: "lex_source_with_offset(source, TextSize::new(7))" +--- +## Tokens +``` +[ + ( + Name( + "y", + ), + 7..8, + ), + ( + Plus, + 9..10, + ), + ( + Name( + "z", + ), + 11..12, + ), + ( + Newline, + 12..12, + ), +] +``` diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset_edge.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset_edge.snap new file mode 100644 index 0000000000000..a6e704c18f3fc --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__bom_with_offset_edge.snap @@ -0,0 +1,19 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: "lex_source_with_offset(source, TextSize::new(11))" +--- +## Tokens +``` +[ + ( + Name( + "z", + ), + 11..12, + ), + ( + Newline, + 12..12, + ), +] +```