diff --git a/crates/ruff_python_parser/src/error.rs b/crates/ruff_python_parser/src/error.rs
index 782820e56fdf2..0cb0c2d7df659 100644
--- a/crates/ruff_python_parser/src/error.rs
+++ b/crates/ruff_python_parser/src/error.rs
@@ -2,7 +2,6 @@ use std::fmt;
 
 use ruff_text_size::TextRange;
 
-use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::TokenKind;
 
 /// Represents errors that occur during parsing and are
@@ -295,3 +294,135 @@ impl std::fmt::Display for ParseErrorType {
         }
     }
 }
+
+/// Represents an error that occurs during lexing and is
+/// returned by the `parse_*` functions and the iterator in the
+/// [lexer] implementation.
+///
+/// [lexer]: crate::lexer
+#[derive(Debug, Clone, PartialEq)]
+pub struct LexicalError {
+    /// The type of error that occurred.
+    error: LexicalErrorType,
+    /// The location of the error.
+    location: TextRange,
+}
+
+impl LexicalError {
+    /// Creates a new `LexicalError` with the given error type and location.
+    pub fn new(error: LexicalErrorType, location: TextRange) -> Self {
+        Self { error, location }
+    }
+
+    pub fn error(&self) -> &LexicalErrorType {
+        &self.error
+    }
+
+    pub fn into_error(self) -> LexicalErrorType {
+        self.error
+    }
+
+    pub fn location(&self) -> TextRange {
+        self.location
+    }
+}
+
+impl std::ops::Deref for LexicalError {
+    type Target = LexicalErrorType;
+
+    fn deref(&self) -> &Self::Target {
+        self.error()
+    }
+}
+
+impl std::error::Error for LexicalError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        Some(self.error())
+    }
+}
+
+impl std::fmt::Display for LexicalError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(
+            f,
+            "{} at byte offset {}",
+            self.error(),
+            u32::from(self.location().start())
+        )
+    }
+}
+
+/// Represents the different types of errors that can occur during lexing.
+#[derive(Debug, Clone, PartialEq)]
+pub enum LexicalErrorType {
+    // TODO: Can probably be removed, the places it is used seem to be able
+    // to use the `UnicodeError` variant instead.
+    #[doc(hidden)]
+    StringError,
+    /// A string literal without the closing quote.
+    UnclosedStringError,
+    /// Decoding of a unicode escape sequence in a string literal failed.
+    UnicodeError,
+    /// Missing the `{` for unicode escape sequence.
+    MissingUnicodeLbrace,
+    /// Missing the `}` for unicode escape sequence.
+    MissingUnicodeRbrace,
+    /// The indentation is not consistent.
+    IndentationError,
+    /// An unrecognized token was encountered.
+    UnrecognizedToken { tok: char },
+    /// An f-string error containing the [`FStringErrorType`].
+    FStringError(FStringErrorType),
+    /// Invalid character encountered in a byte literal.
+    InvalidByteLiteral,
+    /// An unexpected character was encountered after a line continuation.
+    LineContinuationError,
+    /// An unexpected end of file was encountered.
+    Eof,
+    /// An unexpected error occurred.
+    OtherError(Box<str>),
+}
+
+impl std::error::Error for LexicalErrorType {}
+
+impl std::fmt::Display for LexicalErrorType {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
+            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
+            LexicalErrorType::InvalidByteLiteral => {
+                write!(f, "bytes can only contain ASCII literal characters")
+            }
+            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
+            LexicalErrorType::IndentationError => {
+                write!(f, "unindent does not match any outer indentation level")
+            }
+            LexicalErrorType::UnrecognizedToken { tok } => {
+                write!(f, "Got unexpected token {tok}")
+            }
+            LexicalErrorType::LineContinuationError => {
+                write!(f, "unexpected character after line continuation character")
+            }
+            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
+            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
+            LexicalErrorType::UnclosedStringError => {
+                write!(f, "missing closing quote in string literal")
+            }
+            LexicalErrorType::MissingUnicodeLbrace => {
+                write!(f, "Missing `{{` in Unicode escape sequence")
+            }
+            LexicalErrorType::MissingUnicodeRbrace => {
+                write!(f, "Missing `}}` in Unicode escape sequence")
+            }
+        }
+    }
+}
+
+#[cfg(target_pointer_width = "64")]
+mod sizes {
+    use crate::error::{LexicalError, LexicalErrorType};
+    use static_assertions::assert_eq_size;
+
+    assert_eq_size!(LexicalErrorType, [u8; 24]);
+    assert_eq_size!(LexicalError, [u8; 32]);
+}
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index 46005529d5c09..4384df0da9c7e 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -9,23 +9,19 @@ use std::cmp::Ordering;
 use std::str::FromStr;
 
-use bitflags::bitflags;
 use unicode_ident::{is_xid_continue, is_xid_start};
 use unicode_normalization::UnicodeNormalization;
 
-use ruff_python_ast::str::Quote;
-use ruff_python_ast::str_prefix::{
-    AnyStringPrefix, ByteStringPrefix, FStringPrefix, StringLiteralPrefix,
-};
-use ruff_python_ast::{AnyStringFlags, Int, IpyEscapeKind, StringFlags};
+use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
 use ruff_python_trivia::is_python_whitespace;
-use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
+use ruff_text_size::{TextLen, TextRange, TextSize};
 
-use crate::error::FStringErrorType;
+use crate::error::{FStringErrorType, LexicalError, LexicalErrorType};
 use crate::lexer::cursor::{Cursor, EOF_CHAR};
 use crate::lexer::fstring::{FStringContext, FStrings, FStringsCheckpoint};
 use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
-use crate::{Mode, TokenKind};
+use crate::token::{TokenFlags, TokenKind, TokenValue};
+use crate::Mode;
 
 mod cursor;
 mod fstring;
@@ -1511,317 +1507,6 @@ impl<'src> Lexer<'src> {
     }
 }
 
-bitflags! {
-    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
-    pub(crate) struct TokenFlags: u8 {
-        /// The token is a string with double quotes (`"`).
-        const DOUBLE_QUOTES = 1 << 0;
-        /// The token is a triple-quoted string i.e., it starts and ends with three consecutive
-        /// quote characters (`"""` or `'''`).
-        const TRIPLE_QUOTED_STRING = 1 << 1;
-
-        /// The token is a unicode string i.e., prefixed with `u` or `U`
-        const UNICODE_STRING = 1 << 2;
-        /// The token is a byte string i.e., prefixed with `b` or `B`
-        const BYTE_STRING = 1 << 3;
-        /// The token is an f-string i.e., prefixed with `f` or `F`
-        const F_STRING = 1 << 4;
-        /// The token is a raw string and the prefix character is in lowercase.
-        const RAW_STRING_LOWERCASE = 1 << 5;
-        /// The token is a raw string and the prefix character is in uppercase.
-        const RAW_STRING_UPPERCASE = 1 << 6;
-
-        /// The token is a raw string i.e., prefixed with `r` or `R`
-        const RAW_STRING = Self::RAW_STRING_LOWERCASE.bits() | Self::RAW_STRING_UPPERCASE.bits();
-    }
-}
-
-impl StringFlags for TokenFlags {
-    fn quote_style(self) -> Quote {
-        if self.intersects(TokenFlags::DOUBLE_QUOTES) {
-            Quote::Double
-        } else {
-            Quote::Single
-        }
-    }
-
-    fn is_triple_quoted(self) -> bool {
-        self.intersects(TokenFlags::TRIPLE_QUOTED_STRING)
-    }
-
-    fn prefix(self) -> AnyStringPrefix {
-        if self.intersects(TokenFlags::F_STRING) {
-            if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
-                AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: false })
-            } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
-                AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: true })
-            } else {
-                AnyStringPrefix::Format(FStringPrefix::Regular)
-            }
-        } else if self.intersects(TokenFlags::BYTE_STRING) {
-            if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
-                AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: false })
-            } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
-                AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: true })
-            } else {
-                AnyStringPrefix::Bytes(ByteStringPrefix::Regular)
-            }
-        } else if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
-            AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
-        } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
-            AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: true })
-        } else if self.intersects(TokenFlags::UNICODE_STRING) {
-            AnyStringPrefix::Regular(StringLiteralPrefix::Unicode)
-        } else {
-            AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
-        }
-    }
-}
-
-impl TokenFlags {
-    /// Returns `true` if the token is an f-string.
-    const fn is_f_string(self) -> bool {
-        self.intersects(TokenFlags::F_STRING)
-    }
-
-    /// Returns `true` if the token is a triple-quoted f-string.
-    fn is_triple_quoted_fstring(self) -> bool {
-        self.contains(TokenFlags::F_STRING | TokenFlags::TRIPLE_QUOTED_STRING)
-    }
-
-    /// Returns `true` if the token is a raw string.
-    const fn is_raw_string(self) -> bool {
-        self.intersects(TokenFlags::RAW_STRING)
-    }
-
-    pub(crate) fn as_any_string_flags(self) -> AnyStringFlags {
-        AnyStringFlags::new(self.prefix(), self.quote_style(), self.is_triple_quoted())
-    }
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub struct Token {
-    /// The kind of the token.
-    kind: TokenKind,
-    /// The range of the token.
-    range: TextRange,
-    /// The set of flags describing this token.
-    flags: TokenFlags,
-}
-
-impl Token {
-    pub(crate) fn new(kind: TokenKind, range: TextRange, flags: TokenFlags) -> Token {
-        Self { kind, range, flags }
-    }
-
-    /// Returns the token kind.
-    #[inline]
-    pub const fn kind(&self) -> TokenKind {
-        self.kind
-    }
-
-    /// Returns the token as a tuple of (kind, range).
-    #[inline]
-    pub const fn as_tuple(&self) -> (TokenKind, TextRange) {
-        (self.kind, self.range)
-    }
-
-    /// Returns `true` if this is any kind of string token.
-    const fn is_any_string(self) -> bool {
-        matches!(
-            self.kind,
-            TokenKind::String
-                | TokenKind::FStringStart
-                | TokenKind::FStringMiddle
-                | TokenKind::FStringEnd
-        )
-    }
-
-    /// Returns `true` if the current token is a triple-quoted string of any kind.
-    ///
-    /// # Panics
-    ///
-    /// If it isn't a string or any f-string tokens.
-    pub fn is_triple_quoted_string(self) -> bool {
-        assert!(self.is_any_string());
-        self.flags.is_triple_quoted()
-    }
-
-    /// Returns the [`Quote`] style for the current string token of any kind.
-    ///
-    /// # Panics
-    ///
-    /// If it isn't a string or any f-string tokens.
-    pub fn string_quote_style(self) -> Quote {
-        assert!(self.is_any_string());
-        self.flags.quote_style()
-    }
-}
-
-impl Ranged for Token {
-    fn range(&self) -> TextRange {
-        self.range
-    }
-}
-
-/// Represents an error that occur during lexing and are
-/// returned by the `parse_*` functions in the iterator in the
-/// [lexer] implementation.
-///
-/// [lexer]: crate::lexer
-#[derive(Debug, Clone, PartialEq)]
-pub struct LexicalError {
-    /// The type of error that occurred.
-    error: LexicalErrorType,
-    /// The location of the error.
-    location: TextRange,
-}
-
-impl LexicalError {
-    /// Creates a new `LexicalError` with the given error type and location.
-    pub fn new(error: LexicalErrorType, location: TextRange) -> Self {
-        Self { error, location }
-    }
-
-    pub fn error(&self) -> &LexicalErrorType {
-        &self.error
-    }
-
-    pub fn into_error(self) -> LexicalErrorType {
-        self.error
-    }
-
-    pub fn location(&self) -> TextRange {
-        self.location
-    }
-}
-
-impl std::ops::Deref for LexicalError {
-    type Target = LexicalErrorType;
-
-    fn deref(&self) -> &Self::Target {
-        self.error()
-    }
-}
-
-impl std::error::Error for LexicalError {
-    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
-        Some(self.error())
-    }
-}
-
-impl std::fmt::Display for LexicalError {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(
-            f,
-            "{} at byte offset {}",
-            self.error(),
-            u32::from(self.location().start())
-        )
-    }
-}
-
-/// Represents the different types of errors that can occur during lexing.
-#[derive(Debug, Clone, PartialEq)]
-pub enum LexicalErrorType {
-    // TODO: Can probably be removed, the places it is used seem to be able
-    // to use the `UnicodeError` variant instead.
-    #[doc(hidden)]
-    StringError,
-    /// A string literal without the closing quote.
-    UnclosedStringError,
-    /// Decoding of a unicode escape sequence in a string literal failed.
-    UnicodeError,
-    /// Missing the `{` for unicode escape sequence.
-    MissingUnicodeLbrace,
-    /// Missing the `}` for unicode escape sequence.
-    MissingUnicodeRbrace,
-    /// The indentation is not consistent.
-    IndentationError,
-    /// An unrecognized token was encountered.
-    UnrecognizedToken { tok: char },
-    /// An f-string error containing the [`FStringErrorType`].
-    FStringError(FStringErrorType),
-    /// Invalid character encountered in a byte literal.
-    InvalidByteLiteral,
-    /// An unexpected character was encountered after a line continuation.
-    LineContinuationError,
-    /// An unexpected end of file was encountered.
-    Eof,
-    /// An unexpected error occurred.
-    OtherError(Box<str>),
-}
-
-impl std::error::Error for LexicalErrorType {}
-
-impl std::fmt::Display for LexicalErrorType {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
-            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
-            LexicalErrorType::InvalidByteLiteral => {
-                write!(f, "bytes can only contain ASCII literal characters")
-            }
-            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
-            LexicalErrorType::IndentationError => {
-                write!(f, "unindent does not match any outer indentation level")
-            }
-            LexicalErrorType::UnrecognizedToken { tok } => {
-                write!(f, "Got unexpected token {tok}")
-            }
-            LexicalErrorType::LineContinuationError => {
-                write!(f, "unexpected character after line continuation character")
-            }
-            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
-            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
-            LexicalErrorType::UnclosedStringError => {
-                write!(f, "missing closing quote in string literal")
-            }
-            LexicalErrorType::MissingUnicodeLbrace => {
-                write!(f, "Missing `{{` in Unicode escape sequence")
-            }
-            LexicalErrorType::MissingUnicodeRbrace => {
-                write!(f, "Missing `}}` in Unicode escape sequence")
-            }
-        }
-    }
-}
-
-#[derive(Clone, Debug, Default)]
-pub(crate) enum TokenValue {
-    #[default]
-    None,
-    /// Token value for a name, commonly known as an identifier.
-    ///
-    /// Unicode names are NFKC-normalized by the lexer,
-    /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers)
-    Name(Box<str>),
-    /// Token value for an integer.
-    Int(Int),
-    /// Token value for a floating point number.
-    Float(f64),
-    /// Token value for a complex number.
-    Complex {
-        /// The real part of the complex number.
-        real: f64,
-        /// The imaginary part of the complex number.
-        imag: f64,
-    },
-    /// Token value for a string.
-    String(Box<str>),
-    /// Token value that includes the portion of text inside the f-string that's not
-    /// part of the expression part and isn't an opening or closing brace.
-    FStringMiddle(Box<str>),
-    /// Token value for IPython escape commands. These are recognized by the lexer
-    /// only when the mode is [`Mode::Ipython`].
-    IpyEscapeCommand {
-        /// The magic command value.
-        value: Box<str>,
-        /// The kind of magic command.
-        kind: IpyEscapeKind,
-    },
-}
-
 pub(crate) struct LexerCheckpoint {
     value: TokenValue,
     current_kind: TokenKind,
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index 0add53e446260..ec1023e05f228 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -67,8 +67,7 @@ use std::ops::Deref;
 
 pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
-pub use crate::lexer::Token;
-pub use crate::token::TokenKind;
+pub use crate::token::{Token, TokenKind};
 
 use crate::parser::Parser;
 
@@ -592,7 +591,7 @@ impl std::fmt::Display for ModeParseError {
 mod tests {
     use std::ops::Range;
 
-    use crate::lexer::TokenFlags;
+    use crate::token::TokenFlags;
 
     use super::*;
 
diff --git a/crates/ruff_python_parser/src/parser/expression.rs b/crates/ruff_python_parser/src/parser/expression.rs
index 3ca0a44741bd8..eb018f6a5c2d6 100644
--- a/crates/ruff_python_parser/src/parser/expression.rs
+++ b/crates/ruff_python_parser/src/parser/expression.rs
@@ -11,12 +11,12 @@ use ruff_python_ast::{
 };
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 
-use crate::lexer::TokenValue;
 use crate::parser::progress::ParserProgress;
 use crate::parser::{helpers, FunctionKind, Parser};
 use crate::string::{parse_fstring_literal_element, parse_string_literal, StringType};
+use crate::token::{TokenKind, TokenValue};
 use crate::token_set::TokenSet;
-use crate::{FStringErrorType, Mode, ParseErrorType, TokenKind};
+use crate::{FStringErrorType, Mode, ParseErrorType};
 
 use super::{FStringElementsKind, Parenthesized, RecoveryContextKind};
 
diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
index 0e766f06841b8..08c85f7a07148 100644
--- a/crates/ruff_python_parser/src/parser/mod.rs
+++ b/crates/ruff_python_parser/src/parser/mod.rs
@@ -5,9 +5,9 @@ use bitflags::bitflags;
 use ruff_python_ast::{Mod, ModExpression, ModModule};
 use ruff_text_size::{Ranged, TextRange, TextSize};
 
-use crate::lexer::TokenValue;
 use crate::parser::expression::ExpressionContext;
 use crate::parser::progress::{ParserProgress, TokenId};
+use crate::token::TokenValue;
 use crate::token_set::TokenSet;
 use crate::token_source::{TokenSource, TokenSourceCheckpoint};
 use crate::{Mode, ParseError, ParseErrorType, TokenKind};
diff --git a/crates/ruff_python_parser/src/parser/pattern.rs b/crates/ruff_python_parser/src/parser/pattern.rs
index c0fc818ca0931..88079c60ed11d 100644
--- a/crates/ruff_python_parser/src/parser/pattern.rs
+++ b/crates/ruff_python_parser/src/parser/pattern.rs
@@ -1,11 +1,11 @@
 use ruff_python_ast::{self as ast, Expr, ExprContext, Number, Operator, Pattern, Singleton};
 use ruff_text_size::{Ranged, TextSize};
 
-use crate::lexer::TokenValue;
 use crate::parser::progress::ParserProgress;
 use crate::parser::{recovery, Parser, RecoveryContextKind, SequenceMatchPatternParentheses};
+use crate::token::{TokenKind, TokenValue};
 use crate::token_set::TokenSet;
-use crate::{ParseErrorType, TokenKind};
+use crate::ParseErrorType;
 
 use super::expression::ExpressionContext;
 
diff --git a/crates/ruff_python_parser/src/parser/statement.rs b/crates/ruff_python_parser/src/parser/statement.rs
index d10599bdf176b..5cd056805ca27 100644
--- a/crates/ruff_python_parser/src/parser/statement.rs
+++ b/crates/ruff_python_parser/src/parser/statement.rs
@@ -8,14 +8,14 @@ use ruff_python_ast::{
 };
 use ruff_text_size::{Ranged, TextSize};
 
-use crate::lexer::TokenValue;
 use crate::parser::expression::{ParsedExpr, EXPR_SET};
 use crate::parser::progress::ParserProgress;
 use crate::parser::{
     helpers, FunctionKind, Parser, RecoveryContext, RecoveryContextKind, WithItemKind,
 };
+use crate::token::{TokenKind, TokenValue};
 use crate::token_set::TokenSet;
-use crate::{Mode, ParseErrorType, TokenKind};
+use crate::{Mode, ParseErrorType};
 
 use super::expression::ExpressionContext;
 use super::Parenthesized;
diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 3976da33876ee..8c9d61ba91b79 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -5,7 +5,7 @@ use bstr::ByteSlice;
 use ruff_python_ast::{self as ast, AnyStringFlags, Expr, StringFlags};
 use ruff_text_size::{Ranged, TextRange, TextSize};
 
-use crate::lexer::{LexicalError, LexicalErrorType};
+use crate::error::{LexicalError, LexicalErrorType};
 
 #[derive(Debug)]
 pub(crate) enum StringType {
@@ -471,7 +471,7 @@ pub(crate) fn parse_fstring_literal_element(
 mod tests {
     use ruff_python_ast::Suite;
 
-    use crate::lexer::LexicalErrorType;
+    use crate::error::LexicalErrorType;
     use crate::{parse_module, FStringErrorType, ParseError, ParseErrorType, Parsed};
 
     const WINDOWS_EOL: &str = "\r\n";
diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs
index f5c3e6ba8b19f..ee209b9a9f85d 100644
--- a/crates/ruff_python_parser/src/token.rs
+++ b/crates/ruff_python_parser/src/token.rs
@@ -7,7 +7,85 @@
 
 use std::fmt;
 
-use ruff_python_ast::{BoolOp, Operator, UnaryOp};
+use bitflags::bitflags;
+
+use ruff_python_ast::str::Quote;
+use ruff_python_ast::str_prefix::{
+    AnyStringPrefix, ByteStringPrefix, FStringPrefix, StringLiteralPrefix,
+};
+use ruff_python_ast::{AnyStringFlags, BoolOp, Int, IpyEscapeKind, Operator, StringFlags, UnaryOp};
+use ruff_text_size::{Ranged, TextRange};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Token {
+    /// The kind of the token.
+    kind: TokenKind,
+    /// The range of the token.
+    range: TextRange,
+    /// The set of flags describing this token.
+    flags: TokenFlags,
+}
+
+impl Token {
+    pub(crate) fn new(kind: TokenKind, range: TextRange, flags: TokenFlags) -> Token {
+        Self { kind, range, flags }
+    }
+
+    /// Returns the token kind.
+    #[inline]
+    pub const fn kind(&self) -> TokenKind {
+        self.kind
+    }
+
+    /// Returns the token as a tuple of (kind, range).
+    #[inline]
+    pub const fn as_tuple(&self) -> (TokenKind, TextRange) {
+        (self.kind, self.range)
+    }
+
+    /// Returns `true` if this is a trivia token.
+    #[inline]
+    pub const fn is_trivia(self) -> bool {
+        matches!(self.kind, TokenKind::Comment | TokenKind::NonLogicalNewline)
+    }
+
+    /// Returns `true` if the current token is a triple-quoted string of any kind.
+    ///
+    /// # Panics
+    ///
+    /// If it isn't a string or any f-string tokens.
+    pub fn is_triple_quoted_string(self) -> bool {
+        assert!(self.is_any_string());
+        self.flags.is_triple_quoted()
+    }
+
+    /// Returns the [`Quote`] style for the current string token of any kind.
+    ///
+    /// # Panics
+    ///
+    /// If it isn't a string or any f-string tokens.
+    pub fn string_quote_style(self) -> Quote {
+        assert!(self.is_any_string());
+        self.flags.quote_style()
+    }
+
+    /// Returns `true` if this is any kind of string token.
+    const fn is_any_string(self) -> bool {
+        matches!(
+            self.kind,
+            TokenKind::String
+                | TokenKind::FStringStart
+                | TokenKind::FStringMiddle
+                | TokenKind::FStringEnd
+        )
+    }
+}
+
+impl Ranged for Token {
+    fn range(&self) -> TextRange {
+        self.range
+    }
+}
 
 /// A kind of a token.
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
@@ -591,11 +669,126 @@ impl fmt::Display for TokenKind {
     }
 }
 
-#[cfg(target_pointer_width = "64")]
-mod sizes {
-    use crate::lexer::{LexicalError, LexicalErrorType};
-    use static_assertions::assert_eq_size;
+bitflags! {
+    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
+    pub(crate) struct TokenFlags: u8 {
+        /// The token is a string with double quotes (`"`).
+        const DOUBLE_QUOTES = 1 << 0;
+        /// The token is a triple-quoted string i.e., it starts and ends with three consecutive
+        /// quote characters (`"""` or `'''`).
+        const TRIPLE_QUOTED_STRING = 1 << 1;
+
+        /// The token is a unicode string i.e., prefixed with `u` or `U`
+        const UNICODE_STRING = 1 << 2;
+        /// The token is a byte string i.e., prefixed with `b` or `B`
+        const BYTE_STRING = 1 << 3;
+        /// The token is an f-string i.e., prefixed with `f` or `F`
+        const F_STRING = 1 << 4;
+        /// The token is a raw string and the prefix character is in lowercase.
+        const RAW_STRING_LOWERCASE = 1 << 5;
+        /// The token is a raw string and the prefix character is in uppercase.
+        const RAW_STRING_UPPERCASE = 1 << 6;
+
+        /// The token is a raw string i.e., prefixed with `r` or `R`
+        const RAW_STRING = Self::RAW_STRING_LOWERCASE.bits() | Self::RAW_STRING_UPPERCASE.bits();
+    }
+}
+
+impl StringFlags for TokenFlags {
+    fn quote_style(self) -> Quote {
+        if self.intersects(TokenFlags::DOUBLE_QUOTES) {
+            Quote::Double
+        } else {
+            Quote::Single
+        }
+    }
+
+    fn is_triple_quoted(self) -> bool {
+        self.intersects(TokenFlags::TRIPLE_QUOTED_STRING)
+    }
 
-    assert_eq_size!(LexicalErrorType, [u8; 24]);
-    assert_eq_size!(LexicalError, [u8; 32]);
+    fn prefix(self) -> AnyStringPrefix {
+        if self.intersects(TokenFlags::F_STRING) {
+            if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
+                AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: false })
+            } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
+                AnyStringPrefix::Format(FStringPrefix::Raw { uppercase_r: true })
+            } else {
+                AnyStringPrefix::Format(FStringPrefix::Regular)
+            }
+        } else if self.intersects(TokenFlags::BYTE_STRING) {
+            if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
+                AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: false })
+            } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
+                AnyStringPrefix::Bytes(ByteStringPrefix::Raw { uppercase_r: true })
+            } else {
+                AnyStringPrefix::Bytes(ByteStringPrefix::Regular)
+            }
+        } else if self.intersects(TokenFlags::RAW_STRING_LOWERCASE) {
+            AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
+        } else if self.intersects(TokenFlags::RAW_STRING_UPPERCASE) {
+            AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: true })
+        } else if self.intersects(TokenFlags::UNICODE_STRING) {
+            AnyStringPrefix::Regular(StringLiteralPrefix::Unicode)
+        } else {
+            AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
+        }
+    }
+}
+
+impl TokenFlags {
+    /// Returns `true` if the token is an f-string.
+    pub(crate) const fn is_f_string(self) -> bool {
+        self.intersects(TokenFlags::F_STRING)
+    }
+
+    /// Returns `true` if the token is a triple-quoted f-string.
+    pub(crate) fn is_triple_quoted_fstring(self) -> bool {
+        self.contains(TokenFlags::F_STRING | TokenFlags::TRIPLE_QUOTED_STRING)
+    }
+
+    /// Returns `true` if the token is a raw string.
+    pub(crate) const fn is_raw_string(self) -> bool {
+        self.intersects(TokenFlags::RAW_STRING)
+    }
+
+    /// Converts this type to [`AnyStringFlags`], setting the equivalent flags.
+    pub(crate) fn as_any_string_flags(self) -> AnyStringFlags {
+        AnyStringFlags::new(self.prefix(), self.quote_style(), self.is_triple_quoted())
+    }
+}
+
+#[derive(Clone, Debug, Default)]
+pub(crate) enum TokenValue {
+    #[default]
+    None,
+    /// Token value for a name, commonly known as an identifier.
+    ///
+    /// Unicode names are NFKC-normalized by the lexer,
+    /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers)
+    Name(Box<str>),
+    /// Token value for an integer.
+    Int(Int),
+    /// Token value for a floating point number.
+    Float(f64),
+    /// Token value for a complex number.
+    Complex {
+        /// The real part of the complex number.
+        real: f64,
+        /// The imaginary part of the complex number.
+        imag: f64,
+    },
+    /// Token value for a string.
+    String(Box<str>),
+    /// Token value that includes the portion of text inside the f-string that's not
+    /// part of the expression part and isn't an opening or closing brace.
+    FStringMiddle(Box<str>),
+    /// Token value for IPython escape commands. These are recognized by the lexer
+    /// only when the mode is [`Mode::Ipython`].
+    IpyEscapeCommand {
+        /// The magic command value.
+        value: Box<str>,
+        /// The kind of magic command.
+        kind: IpyEscapeKind,
+    },
 }
diff --git a/crates/ruff_python_parser/src/token_source.rs b/crates/ruff_python_parser/src/token_source.rs
index 2719abdd646e7..c9c9fa3ce69ad 100644
--- a/crates/ruff_python_parser/src/token_source.rs
+++ b/crates/ruff_python_parser/src/token_source.rs
@@ -1,7 +1,9 @@
 use ruff_text_size::{Ranged, TextRange, TextSize};
 
-use crate::lexer::{Lexer, LexerCheckpoint, LexicalError, Token, TokenFlags, TokenValue};
-use crate::{Mode, TokenKind};
+use crate::error::LexicalError;
+use crate::lexer::{Lexer, LexerCheckpoint};
+use crate::token::{Token, TokenFlags, TokenKind, TokenValue};
+use crate::Mode;
 
 /// Token source for the parser that skips over any trivia tokens.
 #[derive(Debug)]
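
After this change, `LexicalError` is consumed through `crate::error`, but its behaviour is unchanged. The sketch below shows how the moved API reads. It is a hypothetical in-crate unit test: the test name is invented, and in-crate placement is assumed because `TokenValue` and `Token::new` remain `pub(crate)`. It exercises only the `new`, `Display`, `Deref`, and `into_error` surface visible in the diff above.

```rust
use ruff_text_size::{TextRange, TextSize};

use crate::error::{LexicalError, LexicalErrorType};

#[test]
fn lexical_error_display_and_deref() {
    // An error at byte offset 0, as the lexer would report an unexpected EOF.
    let error = LexicalError::new(LexicalErrorType::Eof, TextRange::empty(TextSize::new(0)));

    // `Display` appends the byte offset of the error to the message of the
    // underlying `LexicalErrorType`.
    assert_eq!(
        error.to_string(),
        "unexpected EOF while parsing at byte offset 0"
    );

    // The `Deref<Target = LexicalErrorType>` impl lets callers match on the
    // error kind without going through `error()`.
    assert!(matches!(*error, LexicalErrorType::Eof));

    // `into_error` recovers the owned error type.
    assert_eq!(error.into_error(), LexicalErrorType::Eof);
}
```

That `Deref` impl is why most call sites are untouched by the move: they keep matching on `LexicalErrorType` directly, and only the `use` paths change, as the import-only hunks in `string.rs` and `token_source.rs` show.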