Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the invalid-escape-sequence rule #5359

Merged
merged 1 commit into from
Jun 25, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 97 additions & 85 deletions crates/ruff/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
use anyhow::{bail, Result};
use log::error;
use ruff_text_size::{TextLen, TextRange, TextSize};

use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::str::{leading_quote, trailing_quote};

/// ## What it does
/// Checks for invalid escape sequences.
Expand All @@ -21,6 +20,9 @@ use ruff_python_ast::source_code::Locator;
/// ```python
/// regex = r"\.png$"
/// ```
///
/// ## References
/// - [Python documentation: String and Bytes literals](https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)
#[violation]
pub struct InvalidEscapeSequence(char);

Expand All @@ -36,24 +38,6 @@ impl AlwaysAutofixableViolation for InvalidEscapeSequence {
}
}

// Characters that are accepted immediately after a backslash; the W605 check looks up the
// character following each backslash in this table and skips it when it is present.
// See: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
const VALID_ESCAPE_SEQUENCES: &[char; 23] = &[
'\n', '\\', '\'', '"', 'a', 'b', 'f', 'n', 'r', 't', 'v', '0', '1', '2', '3', '4', '5', '6',
'7', 'x', // Escape sequences only recognized in string literals
'N', 'u', 'U',
];

/// Identify which quote delimiter terminates a String token.
///
/// Candidates are tried in a fixed order, triple quotes before single quotes, and the first
/// one that `text` ends with is returned. An error is produced when `text` ends with none of
/// them.
fn extract_quote(text: &str) -> Result<&str> {
    const CANDIDATES: [&str; 4] = ["'''", "\"\"\"", "'", "\""];

    let matched = CANDIDATES
        .iter()
        .copied()
        .find(|candidate| text.ends_with(*candidate));

    match matched {
        Some(quote) => Ok(quote),
        None => bail!("Unable to find quotation mark for String token"),
    }
}

/// W605
pub(crate) fn invalid_escape_sequence(
locator: &Locator,
Expand All @@ -65,84 +49,112 @@ pub(crate) fn invalid_escape_sequence(
let text = locator.slice(range);

// Determine whether the string is single- or triple-quoted.
let Ok(quote) = extract_quote(text) else {
error!("Unable to find quotation mark for string token");
let Some(leading_quote) = leading_quote(text) else {
return diagnostics;
};
let Some(trailing_quote) = trailing_quote(text) else {
return diagnostics;
};
let quote_pos = text.find(quote).unwrap();
let prefix = &text[..quote_pos];
let body = &text[quote_pos + quote.len()..text.len() - quote.len()];
let body = &text[leading_quote.len()..text.len() - trailing_quote.len()];

if !prefix.contains(['r', 'R']) {
let start_offset =
range.start() + TextSize::try_from(quote_pos).unwrap() + quote.text_len();
if leading_quote.contains(['r', 'R']) {
return diagnostics;
}

let mut chars_iter = body.char_indices().peekable();
let start_offset = range.start() + TextSize::try_from(leading_quote.len()).unwrap();

let mut contains_valid_escape_sequence = false;
let mut chars_iter = body.char_indices().peekable();

while let Some((i, c)) = chars_iter.next() {
if c != '\\' {
continue;
}
let mut contains_valid_escape_sequence = false;

// If the previous character was also a backslash, skip.
if i > 0 && body.as_bytes()[i - 1] == b'\\' {
continue;
}
while let Some((i, c)) = chars_iter.next() {
if c != '\\' {
continue;
}

// If we're at the end of the file, skip.
let Some((_, next_char)) = chars_iter.peek() else {
continue;
};
// If the previous character was also a backslash, skip.
if i > 0 && body.as_bytes()[i - 1] == b'\\' {
continue;
}

// If we're at the end of the line, skip
if matches!(next_char, '\n' | '\r') {
continue;
}
// If we're at the end of the file, skip.
let Some((_, next_char)) = chars_iter.peek() else {
continue;
};

// If the next character is a valid escape sequence, skip.
if VALID_ESCAPE_SEQUENCES.contains(next_char) {
contains_valid_escape_sequence = true;
continue;
}
// If we're at the end of the line, skip
if matches!(next_char, '\n' | '\r') {
continue;
}

let location = start_offset + TextSize::try_from(i).unwrap();
let range = TextRange::at(location, next_char.text_len() + TextSize::from(1));
let diagnostic = Diagnostic::new(InvalidEscapeSequence(*next_char), range);
diagnostics.push(diagnostic);
// If the next character is a valid escape sequence, skip.
// See: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals.
if matches!(
next_char,
'\n'
| '\\'
| '\''
| '"'
| 'a'
| 'b'
| 'f'
| 'n'
| 'r'
| 't'
| 'v'
| '0'
| '1'
| '2'
| '3'
| '4'
| '5'
| '6'
| '7'
| 'x'
// Escape sequences only recognized in string literals
| 'N'
| 'u'
| 'U'
) {
contains_valid_escape_sequence = true;
continue;
}

if autofix {
if contains_valid_escape_sequence {
// Escape with backslash.
for diagnostic in &mut diagnostics {
diagnostic.set_fix(Fix::automatic(Edit::insertion(
r"\".to_string(),
diagnostic.range().start() + TextSize::from(1),
)));
}
} else {
// Turn into raw string.
for diagnostic in &mut diagnostics {
// If necessary, add a space between any leading keyword (`return`, `yield`,
// `assert`, etc.) and the string. For example, `return"foo"` is valid, but
// `returnr"foo"` is not.
let requires_space = locator
.slice(TextRange::up_to(range.start()))
.chars()
.last()
.map_or(false, |char| char.is_ascii_alphabetic());

diagnostic.set_fix(Fix::automatic(Edit::insertion(
if requires_space {
" r".to_string()
} else {
"r".to_string()
},
range.start() + TextSize::try_from(quote_pos).unwrap(),
)));
}
let location = start_offset + TextSize::try_from(i).unwrap();
let range = TextRange::at(location, next_char.text_len() + TextSize::from(1));
let diagnostic = Diagnostic::new(InvalidEscapeSequence(*next_char), range);
diagnostics.push(diagnostic);
}

if autofix {
if contains_valid_escape_sequence {
// Escape with backslash.
for diagnostic in &mut diagnostics {
diagnostic.set_fix(Fix::automatic(Edit::insertion(
r"\".to_string(),
diagnostic.range().start() + TextSize::from(1),
)));
}
} else {
// Turn into raw string.
for diagnostic in &mut diagnostics {
// If necessary, add a space between any leading keyword (`return`, `yield`,
// `assert`, etc.) and the string. For example, `return"foo"` is valid, but
// `returnr"foo"` is not.
let requires_space = locator
.slice(TextRange::up_to(range.start()))
.chars()
.last()
.map_or(false, |char| char.is_ascii_alphabetic());

diagnostic.set_fix(Fix::automatic(Edit::insertion(
if requires_space {
" r".to_string()
} else {
"r".to_string()
},
range.start(),
)));
}
}
}
Expand Down