From c8844dfdc00fbfe4b5e7839635214b744410312f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 20 May 2024 08:45:54 +1000 Subject: [PATCH 1/6] Clarify the meaning of the span within `mbe::TokenTree::MetaVar`. --- compiler/rustc_expand/src/mbe.rs | 3 ++- compiler/rustc_expand/src/mbe/quoted.rs | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/compiler/rustc_expand/src/mbe.rs b/compiler/rustc_expand/src/mbe.rs index a805c4fcf7b97..32d12c4fd0cc9 100644 --- a/compiler/rustc_expand/src/mbe.rs +++ b/compiler/rustc_expand/src/mbe.rs @@ -73,7 +73,8 @@ enum TokenTree { Delimited(DelimSpan, DelimSpacing, Delimited), /// A kleene-style repetition sequence, e.g. `$($e:expr)*` (RHS) or `$($e),*` (LHS). Sequence(DelimSpan, SequenceRepetition), - /// e.g., `$var`. + /// e.g., `$var`. The span covers the leading dollar and the ident. (The span within the ident + /// only covers the ident, e.g. `var`.) MetaVar(Span, Ident), /// e.g., `$var:expr`. Only appears on the LHS. MetaVarDecl(Span, Ident /* name to bind */, Option), diff --git a/compiler/rustc_expand/src/mbe/quoted.rs b/compiler/rustc_expand/src/mbe/quoted.rs index d3ea48e2e2a8e..2e37fd70df9a7 100644 --- a/compiler/rustc_expand/src/mbe/quoted.rs +++ b/compiler/rustc_expand/src/mbe/quoted.rs @@ -176,7 +176,7 @@ fn parse_tree<'a>( // Depending on what `tree` is, we could be parsing different parts of a macro match tree { // `tree` is a `$` token. Look at the next token in `trees` - &tokenstream::TokenTree::Token(Token { kind: token::Dollar, span }, _) => { + &tokenstream::TokenTree::Token(Token { kind: token::Dollar, span: dollar_span }, _) => { // FIXME: Handle `Invisible`-delimited groups in a more systematic way // during parsing. let mut next = outer_trees.next(); @@ -209,7 +209,7 @@ fn parse_tree<'a>( err.emit(); // Returns early the same read `$` to avoid spanning // unrelated diagnostics that could be performed afterwards - return TokenTree::token(token::Dollar, span); + return TokenTree::token(token::Dollar, dollar_span); } Ok(elem) => { maybe_emit_macro_metavar_expr_feature( @@ -251,7 +251,7 @@ fn parse_tree<'a>( // special metavariable that names the crate of the invocation. Some(tokenstream::TokenTree::Token(token, _)) if token.is_ident() => { let (ident, is_raw) = token.ident().unwrap(); - let span = ident.span.with_lo(span.lo()); + let span = ident.span.with_lo(dollar_span.lo()); if ident.name == kw::Crate && matches!(is_raw, IdentIsRaw::No) { TokenTree::token(token::Ident(kw::DollarCrate, is_raw), span) } else { @@ -260,16 +260,19 @@ fn parse_tree<'a>( } // `tree` is followed by another `$`. This is an escaped `$`. - Some(&tokenstream::TokenTree::Token(Token { kind: token::Dollar, span }, _)) => { + Some(&tokenstream::TokenTree::Token( + Token { kind: token::Dollar, span: dollar_span2 }, + _, + )) => { if parsing_patterns { span_dollar_dollar_or_metavar_in_the_lhs_err( sess, - &Token { kind: token::Dollar, span }, + &Token { kind: token::Dollar, span: dollar_span2 }, ); } else { - maybe_emit_macro_metavar_expr_feature(features, sess, span); + maybe_emit_macro_metavar_expr_feature(features, sess, dollar_span2); } - TokenTree::token(token::Dollar, span) + TokenTree::token(token::Dollar, dollar_span2) } // `tree` is followed by some other token. This is an error. @@ -281,7 +284,7 @@ fn parse_tree<'a>( } // There are no more tokens. Just return the `$` we already have. - None => TokenTree::token(token::Dollar, span), + None => TokenTree::token(token::Dollar, dollar_span), } } From 3fc8f8998c45fbc211b98b5bae04cc87f8eb1619 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 20 May 2024 09:20:49 +1000 Subject: [PATCH 2/6] Clarify `parse` a little. - Name the colon span as `colon_span` to distinguish it from the other `span` local variable. - Just use basic pattern matching, which is easier to read than `map_or`. --- compiler/rustc_expand/src/mbe/quoted.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_expand/src/mbe/quoted.rs b/compiler/rustc_expand/src/mbe/quoted.rs index 2e37fd70df9a7..8ad7cb15c92a9 100644 --- a/compiler/rustc_expand/src/mbe/quoted.rs +++ b/compiler/rustc_expand/src/mbe/quoted.rs @@ -62,7 +62,10 @@ pub(super) fn parse( match tree { TokenTree::MetaVar(start_sp, ident) if parsing_patterns => { let span = match trees.next() { - Some(&tokenstream::TokenTree::Token(Token { kind: token::Colon, span }, _)) => { + Some(&tokenstream::TokenTree::Token( + Token { kind: token::Colon, span: colon_span }, + _, + )) => { match trees.next() { Some(tokenstream::TokenTree::Token(token, _)) => match token.ident() { Some((fragment, _)) => { @@ -126,10 +129,12 @@ pub(super) fn parse( } _ => token.span, }, - tree => tree.map_or(span, tokenstream::TokenTree::span), + Some(tree) => tree.span(), + None => colon_span, } } - tree => tree.map_or(start_sp, tokenstream::TokenTree::span), + Some(tree) => tree.span(), + None => start_sp, }; result.push(TokenTree::MetaVarDecl(span, ident, None)); From b6de7821982caf99f58e2925ae3cb78673b31e24 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 17 May 2024 09:54:53 +1000 Subject: [PATCH 3/6] Clarify a comment. --- compiler/rustc_expand/src/proc_macro_server.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs index 1f3547c841a90..ec7e4416b9130 100644 --- a/compiler/rustc_expand/src/proc_macro_server.rs +++ b/compiler/rustc_expand/src/proc_macro_server.rs @@ -309,10 +309,10 @@ impl ToInternal> use rustc_ast::token::*; // The code below is conservative, using `token_alone`/`Spacing::Alone` - // in most places. When the resulting code is pretty-printed by - // `print_tts` it ends up with spaces between most tokens, which is - // safe but ugly. It's hard in general to do better when working at the - // token level. + // in most places. It's hard in general to do better when working at + // the token level. When the resulting code is pretty-printed by + // `print_tts` the `space_between` function helps avoid a lot of + // unnecessary whitespace, so the results aren't too bad. let (tree, rustc) = self; match tree { TokenTree::Punct(Punct { ch, joint, span }) => { From c679a5510222cb861a2a23088d1365707d1e0d6f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 17 May 2024 09:49:09 +1000 Subject: [PATCH 4/6] Convert some `token_joint_hidden` calls to `token_joint`. This has no noticeable effect, but it makes these cases follow the guidelines in the comments on `Spacing`, which say that `Joint` should be used "for each token that (a) should be pretty-printed without a space after it, and (b) is followed by a punctuation token". These two tokens are both followed by a comma, which is a punctuation token. --- compiler/rustc_builtin_macros/src/assert/context.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_builtin_macros/src/assert/context.rs b/compiler/rustc_builtin_macros/src/assert/context.rs index 085ea3458bbfa..7814422611439 100644 --- a/compiler/rustc_builtin_macros/src/assert/context.rs +++ b/compiler/rustc_builtin_macros/src/assert/context.rs @@ -153,7 +153,7 @@ impl<'cx, 'a> Context<'cx, 'a> { fn build_panic(&self, expr_str: &str, panic_path: Path) -> P { let escaped_expr_str = escape_to_fmt(expr_str); let initial = [ - TokenTree::token_joint_hidden( + TokenTree::token_joint( token::Literal(token::Lit { kind: token::LitKind::Str, symbol: Symbol::intern(&if self.fmt_string.is_empty() { @@ -172,7 +172,7 @@ impl<'cx, 'a> Context<'cx, 'a> { ]; let captures = self.capture_decls.iter().flat_map(|cap| { [ - TokenTree::token_joint_hidden( + TokenTree::token_joint( token::Ident(cap.ident.name, IdentIsRaw::No), cap.ident.span, ), From a1b6d46e040176e63954ba3ba5bb18cd4ed3691a Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 17 May 2024 08:23:24 +1000 Subject: [PATCH 5/6] Use `JointHidden` in a couple of suitable places. This has no notable effect, but it's appropriate because the relevant tokens are followed by delimiters. --- compiler/rustc_ast/src/tokenstream.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_ast/src/tokenstream.rs b/compiler/rustc_ast/src/tokenstream.rs index fb666550e9302..3d46415507def 100644 --- a/compiler/rustc_ast/src/tokenstream.rs +++ b/compiler/rustc_ast/src/tokenstream.rs @@ -661,11 +661,11 @@ impl TokenStream { if attr_style == AttrStyle::Inner { vec![ TokenTree::token_joint(token::Pound, span), - TokenTree::token_alone(token::Not, span), + TokenTree::token_joint_hidden(token::Not, span), body, ] } else { - vec![TokenTree::token_alone(token::Pound, span), body] + vec![TokenTree::token_joint_hidden(token::Pound, span), body] } } } From 4d513cb4bf7d8c3151594096e97cd848f5fcab77 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 20 May 2024 12:54:38 +1000 Subject: [PATCH 6/6] Add some comments. --- compiler/rustc_ast_pretty/src/pprust/state.rs | 40 ++++++++++++++----- compiler/rustc_expand/src/mbe.rs | 2 + compiler/rustc_expand/src/mbe/transcribe.rs | 15 +++++++ 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_ast_pretty/src/pprust/state.rs b/compiler/rustc_ast_pretty/src/pprust/state.rs index 545b98a9135af..f02fe4cf0a728 100644 --- a/compiler/rustc_ast_pretty/src/pprust/state.rs +++ b/compiler/rustc_ast_pretty/src/pprust/state.rs @@ -681,22 +681,40 @@ pub trait PrintState<'a>: std::ops::Deref + std::ops::Dere } } + // The easiest way to implement token stream pretty printing would be to + // print each token followed by a single space. But that would produce ugly + // output, so we go to some effort to do better. + // + // First, we track whether each token that appears in source code is + // followed by a space, with `Spacing`, and reproduce that in the output. + // This works well in a lot of cases. E.g. `stringify!(x + y)` produces + // "x + y" and `stringify!(x+y)` produces "x+y". + // + // But this doesn't work for code produced by proc macros (which have no + // original source text representation) nor for code produced by decl + // macros (which are tricky because the whitespace after tokens appearing + // in macro rules isn't always what you want in the produced output). For + // these we mostly use `Spacing::Alone`, which is the conservative choice. + // + // So we have a backup mechanism for when `Spacing::Alone` occurs between a + // pair of tokens: we check if that pair of tokens can obviously go + // together without a space between them. E.g. token `x` followed by token + // `,` is better printed as `x,` than `x ,`. (Even if the original source + // code was `x ,`.) + // + // Finally, we must be careful about changing the output. Token pretty + // printing is used by `stringify!` and `impl Display for + // proc_macro::TokenStream`, and some programs rely on the output having a + // particular form, even though they shouldn't. In particular, some proc + // macros do `format!({stream})` on a token stream and then "parse" the + // output with simple string matching that can't handle whitespace changes. + // E.g. we have seen cases where a proc macro can handle `a :: b` but not + // `a::b`. See #117433 for some examples. fn print_tts(&mut self, tts: &TokenStream, convert_dollar_crate: bool) { let mut iter = tts.trees().peekable(); while let Some(tt) = iter.next() { let spacing = self.print_tt(tt, convert_dollar_crate); if let Some(next) = iter.peek() { - // Should we print a space after `tt`? There are two guiding - // factors. - // - `spacing` is the more important and accurate one. Most - // tokens have good spacing information, and - // `Joint`/`JointHidden` get used a lot. - // - `space_between` is the backup. Code produced by proc - // macros has worse spacing information, with no - // `JointHidden` usage and too much `Alone` usage, which - // would result in over-spaced output such as - // `( x () , y . z )`. `space_between` avoids some of the - // excess whitespace. if spacing == Spacing::Alone && space_between(tt, next) { self.space(); } diff --git a/compiler/rustc_expand/src/mbe.rs b/compiler/rustc_expand/src/mbe.rs index 32d12c4fd0cc9..08d4a03945489 100644 --- a/compiler/rustc_expand/src/mbe.rs +++ b/compiler/rustc_expand/src/mbe.rs @@ -68,6 +68,8 @@ pub(crate) enum KleeneOp { /// `MetaVarExpr` are "first-class" token trees. Useful for parsing macros. #[derive(Debug, PartialEq, Encodable, Decodable)] enum TokenTree { + /// A token. Unlike `tokenstream::TokenTree::Token` this lacks a `Spacing`. + /// See the comments about `Spacing` in the `transcribe` function. Token(Token), /// A delimited sequence, e.g. `($e:expr)` (RHS) or `{ $e }` (LHS). Delimited(DelimSpan, DelimSpacing, Delimited), diff --git a/compiler/rustc_expand/src/mbe/transcribe.rs b/compiler/rustc_expand/src/mbe/transcribe.rs index 3901b82eb52ec..8a084dcb4fe3a 100644 --- a/compiler/rustc_expand/src/mbe/transcribe.rs +++ b/compiler/rustc_expand/src/mbe/transcribe.rs @@ -253,8 +253,23 @@ pub(super) fn transcribe<'a>( mbe::TokenTree::MetaVar(mut sp, mut original_ident) => { // Find the matched nonterminal from the macro invocation, and use it to replace // the meta-var. + // + // We use `Spacing::Alone` everywhere here, because that's the conservative choice + // and spacing of declarative macros is tricky. E.g. in this macro: + // ``` + // macro_rules! idents { + // ($($a:ident,)*) => { stringify!($($a)*) } + // } + // ``` + // `$a` has no whitespace after it and will be marked `JointHidden`. If you then + // call `idents!(x,y,z,)`, each of `x`, `y`, and `z` will be marked as `Joint`. So + // if you choose to use `$x`'s spacing or the identifier's spacing, you'll end up + // producing "xyz", which is bad because it effectively merges tokens. + // `Spacing::Alone` is the safer option. Fortunately, `space_between` will avoid + // some of the unnecessary whitespace. let ident = MacroRulesNormalizedIdent::new(original_ident); if let Some(cur_matched) = lookup_cur_matched(ident, interp, &repeats) { + // njn: explain the use of alone here let tt = match cur_matched { MatchedSingle(ParseNtResult::Tt(tt)) => { // `tt`s are emitted into the output stream directly as "raw tokens",