Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reserve guarded string literals (RFC 3593) #123951

Merged
merged 1 commit into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 85 additions & 8 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ pub enum TokenKind {
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Guarded string literal prefix: `#"` or `##`.
///
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
/// Split into the component tokens on older editions.
GuardedStrPrefix,

/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case.
Expand Down Expand Up @@ -191,30 +197,41 @@ pub enum DocStyle {
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99", "1f32".
/// `12_u8`, `0o100`, `0b120i99`, `1f32`.
estebank marked this conversation as resolved.
Show resolved Hide resolved
Int { base: Base, empty_int: bool },
/// "12.34f32", "1e3", but not "1f32".
/// `12.34f32`, `1e3`, but not `1f32`.
Float { base: Base, empty_exponent: bool },
/// "'a'", "'\\'", "'''", "';"
/// `'a'`, `'\\'`, `'''`, `';`
Char { terminated: bool },
/// "b'a'", "b'\\'", "b'''", "b';"
/// `b'a'`, `b'\\'`, `b'''`, `b';`
Byte { terminated: bool },
/// ""abc"", ""abc"
/// `"abc"`, `"abc`
Str { terminated: bool },
/// "b"abc"", "b"abc"
/// `b"abc"`, `b"abc`
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
RawCStr { n_hashes: Option<u8> },
}

/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
pub n_hashes: u32,
pub terminated: bool,
pub token_len: u32,
petrochenkov marked this conversation as resolved.
Show resolved Hide resolved
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
/// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
Expand Down Expand Up @@ -403,6 +420,12 @@ impl Cursor<'_> {
TokenKind::Literal { kind: literal_kind, suffix_start }
}

// Guarded string literal prefix: `#"` or `##`
'#' if matches!(self.first(), '"' | '#') => {
self.bump();
TokenKind::GuardedStrPrefix
}

// One-symbol tokens.
';' => Semi,
',' => Comma,
Expand Down Expand Up @@ -780,6 +803,60 @@ impl Cursor<'_> {
false
}

/// Attempt to lex for a guarded string literal.
///
/// Used by `rustc_parse::lexer` to lex for guarded strings
/// conditionally based on edition.
///
/// Note: this will not reset the `Cursor` when a
/// guarded string is not found. It is the caller's
/// responsibility to do so.
pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
debug_assert!(self.prev() != '#');

let mut n_start_hashes: u32 = 0;
while self.first() == '#' {
n_start_hashes += 1;
self.bump();
}

if self.first() != '"' {
return None;
}
self.bump();
debug_assert!(self.prev() == '"');

// Lex the string itself as a normal string literal
// so we can recover that for older editions later.
let terminated = self.double_quoted_string();
if !terminated {
let token_len = self.pos_within_token();
self.reset_pos_within_token();

return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
}

// Consume closing '#' symbols.
// Note that this will not consume extra trailing `#` characters:
// `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
// followed by a `#` token.
let mut n_end_hashes = 0;
while self.first() == '#' && n_end_hashes < n_start_hashes {
n_end_hashes += 1;
self.bump();
}

// Reserved syntax, always an error, so it doesn't matter if
// `n_start_hashes != n_end_hashes`.

self.eat_literal_suffix();

let token_len = self.pos_within_token();
self.reset_pos_within_token();

Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,9 @@ lint_reserved_prefix = prefix `{$prefix}` is unknown
.label = unknown prefix
.suggestion = insert whitespace here to avoid this being parsed as a prefix in Rust 2021

lint_reserved_string = will be parsed as a guarded string in Rust 2024
.suggestion = insert whitespace here to avoid this being parsed as a guarded string in Rust 2024

lint_shadowed_into_iter =
this method call resolves to `<&{$target} as IntoIterator>::into_iter` (due to backwards compatibility), but will resolve to `<{$target} as IntoIterator>::into_iter` in Rust {$edition}
.use_iter_suggestion = use `.iter()` instead of `.into_iter()` to avoid ambiguity
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/src/context/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ pub(super) fn decorate_lint(sess: &Session, diagnostic: BuiltinLintDiag, diag: &
lints::RawPrefix { label: label_span, suggestion: label_span.shrink_to_hi() }
.decorate_lint(diag);
}
BuiltinLintDiag::ReservedString(suggestion) => {
lints::ReservedString { suggestion }.decorate_lint(diag);
}
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
lints::UnusedBuiltinAttribute { invoc_span, attr_name, macro_name }.decorate_lint(diag);
}
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_lint/src/lints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3053,3 +3053,10 @@ pub(crate) enum MutRefSugg {
#[derive(LintDiagnostic)]
#[diag(lint_unqualified_local_imports)]
pub(crate) struct UnqualifiedLocalImportsDiag {}

#[derive(LintDiagnostic)]
#[diag(lint_reserved_string)]
pub(crate) struct ReservedString {
#[suggestion(code = " ", applicability = "machine-applicable")]
pub suggestion: Span,
}
41 changes: 41 additions & 0 deletions compiler/rustc_lint_defs/src/builtin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ declare_lint_pass! {
RUST_2021_INCOMPATIBLE_OR_PATTERNS,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
RUST_2021_PRELUDE_COLLISIONS,
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
RUST_2024_INCOMPATIBLE_PAT,
RUST_2024_PRELUDE_COLLISIONS,
SELF_CONSTRUCTOR_FROM_OUTER_ITEM,
Expand Down Expand Up @@ -4996,3 +4997,43 @@ declare_lint! {
Warn,
"detects pointer to integer transmutes in const functions and associated constants",
}

declare_lint! {
/// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
/// that will be parsed as part of a guarded string literal in Rust 2024.
///
/// ### Example
///
/// ```rust,edition2021,compile_fail
/// #![deny(rust_2024_guarded_string_incompatible_syntax)]
///
/// macro_rules! m {
/// (# $x:expr #) => ();
/// (# $x:expr) => ();
/// }
///
/// m!(#"hey"#);
/// m!(#"hello");
/// ```
///
/// {{produces}}
///
/// ### Explanation
///
/// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
/// followed by the string literal `"hey"` then the final `#`.
/// In Rust 2024, the whole sequence is considered a single token.
///
/// This lint suggests to add whitespace between the leading `#`
/// and the string to keep them separated in Rust 2024.
// Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
#[allow(rustdoc::invalid_rust_codeblocks)]
pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
Allow,
"will be parsed as a guarded string in Rust 2024",
@future_incompatible = FutureIncompatibleInfo {
reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
};
crate_level_only
}
2 changes: 2 additions & 0 deletions compiler/rustc_lint_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,8 @@ pub enum BuiltinLintDiag {
ReservedPrefix(Span, String),
/// `'r#` in edition < 2021.
RawPrefix(Span),
/// `##` or `#"` is edition < 2024.
ReservedString(Span),
TrailingMacro(bool, Ident),
BreakWithLabelAndLoop(Span),
UnicodeTextFlow(Span, String),
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_parse/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
.label = the label
.suggestion = add `:` after the label

parse_reserved_string = invalid string literal
.note = unprefixed guarded string literals are reserved for future use since Rust 2024
.suggestion_whitespace = consider inserting whitespace here

parse_return_types_use_thin_arrow = return types are denoted using `->`
.suggestion = use `->` instead

Expand Down
18 changes: 18 additions & 0 deletions compiler/rustc_parse/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2110,6 +2110,24 @@ pub(crate) enum UnknownPrefixSugg {
},
}

#[derive(Diagnostic)]
#[diag(parse_reserved_string)]
#[note]
pub(crate) struct ReservedString {
#[primary_span]
pub span: Span,
#[subdiagnostic]
pub sugg: Option<GuardedStringSugg>,
}
#[derive(Subdiagnostic)]
#[suggestion(
parse_suggestion_whitespace,
code = " ",
applicability = "maybe-incorrect",
style = "verbose"
)]
pub(crate) struct GuardedStringSugg(#[primary_span] pub Span);

#[derive(Diagnostic)]
#[diag(parse_too_many_hashes)]
pub(crate) struct TooManyHashes {
Expand Down
84 changes: 83 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
use rustc_session::lint::BuiltinLintDiag;
use rustc_session::lint::builtin::{
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::Symbol;
Expand Down Expand Up @@ -251,6 +252,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -781,6 +783,86 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
}
}

/// Detect guarded string literal syntax
///
/// RFC 3598 reserved this syntax for future use. As of Rust 2024,
/// using this syntax produces an error. In earlier editions, however, it
/// only results in an (allowed by default) lint, and is treated as
/// separate tokens.
fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
let span = self.mk_sp(start, self.pos);
let edition2024 = span.edition().at_least_rust_2024();

let space_pos = start + BytePos(1);
let space_span = self.mk_sp(space_pos, space_pos);

let mut cursor = Cursor::new(str_before);

let (span, unterminated) = match cursor.guarded_double_quoted_string() {
Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
let end = start + BytePos(token_len);
let span = self.mk_sp(start, end);
let str_start = start + BytePos(n_hashes);

if edition2024 {
self.cursor = cursor;
self.pos = end;
}

let unterminated = if terminated { None } else { Some(str_start) };

(span, unterminated)
}
_ => {
// We should only get here in the `##+` case.
debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

(span, None)
}
};
if edition2024 {
if let Some(str_start) = unterminated {
// Only a fatal error if string is unterminated.
self.dcx()
.struct_span_fatal(
self.mk_sp(str_start, self.pos),
"unterminated double quote string",
)
.with_code(E0765)
.emit()
petrochenkov marked this conversation as resolved.
Show resolved Hide resolved
}

let sugg = if span.from_expansion() {
None
} else {
Some(errors::GuardedStringSugg(space_span))
};

// In Edition 2024 and later, emit a hard error.
let err = self.dcx().emit_err(errors::ReservedString { span, sugg });

token::Literal(token::Lit {
kind: token::Err(err),
symbol: self.symbol_from_to(start, self.pos),
suffix: None,
})
} else {
// Before Rust 2024, only emit a lint for migration.
self.psess.buffer_lint(
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
span,
ast::CRATE_NODE_ID,
BuiltinLintDiag::ReservedString(space_span),
);

// For backwards compatibility, roll back to after just the first `#`
// and return the `Pound` token.
self.pos = start + BytePos(1);
self.cursor = Cursor::new(&str_before[1..]);
token::Pound
}
}

fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
}
Expand Down
1 change: 1 addition & 0 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,7 @@ impl<'src> Classifier<'src> {
// Number literals.
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
},
TokenKind::GuardedStrPrefix => return no_highlight(sink),
TokenKind::Ident | TokenKind::RawIdent if lookahead == Some(TokenKind::Bang) => {
self.in_macro = true;
sink(Highlight::EnterSpan { class: Class::Macro(self.new_span(before, text)) });
Expand Down
6 changes: 6 additions & 0 deletions src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,12 @@ impl<'a> Converter<'a> {
}

rustc_lexer::TokenKind::RawIdent => IDENT,

rustc_lexer::TokenKind::GuardedStrPrefix => {
err = "Invalid string literal (reserved syntax)";
ERROR
},

rustc_lexer::TokenKind::Literal { kind, .. } => {
self.extend_literal(token_text.len(), kind);
return;
Expand Down
Loading
Loading