From deab741ab4c91c740b265db4d4950f31fc4f6cc3 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 21 Aug 2024 15:32:57 +1000
Subject: [PATCH 1/8] Clarify a comment.

---
 compiler/rustc_parse/src/parser/attr_wrapper.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index 49df2811d525b..cfbac4876d537 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -510,9 +510,11 @@ fn make_attr_token_stream(
 }
 
 /// Tokens are needed if:
-/// - any non-single-segment attributes (other than doc comments) are present; or
-/// - any `cfg_attr` attributes are present;
-/// - any single-segment, non-builtin attributes are present.
+/// - any non-single-segment attributes (other than doc comments) are present,
+///   e.g. `rustfmt::skip`; or
+/// - any `cfg_attr` attributes are present; or
+/// - any single-segment, non-builtin attributes are present, e.g. `derive`,
+///   `test`, `global_allocator`.
 fn needs_tokens(attrs: &[ast::Attribute]) -> bool {
     attrs.iter().any(|attr| match attr.ident() {
         None => !attr.is_doc_comment(),

From 312ecdb2ed37249ad539bc0732278b80b4f3c7f8 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 20 Aug 2024 12:25:31 +1000
Subject: [PATCH 2/8] Avoid unnecessary `cloned`.

---
 compiler/rustc_parse/src/parser/attr_wrapper.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index cfbac4876d537..67d1e410c589d 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -363,7 +363,7 @@ impl<'a> Parser<'a> {
                 [parser_replacements_start..parser_replacements_end]
                 .iter()
                 .cloned()
-                .chain(inner_attr_parser_replacements.iter().cloned())
+                .chain(inner_attr_parser_replacements.into_iter())
                 .map(|(parser_range, data)| {
                     (NodeRange::new(parser_range, collect_pos.start_pos), data)
                 })

From 1ae521e9d57ba3a67b1007204da2836d8b19b4a2 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 20 Aug 2024 17:47:53 +1000
Subject: [PATCH 3/8] Return earlier in some cases in `collect_token`.

This example triggers an assertion failure:
```
fn f() -> u32 {
    #[cfg_eval] #[cfg(not(FALSE))] 0
}
```
The sequence of events:
- `configure_annotatable` calls `parse_expr_force_collect`, which calls
  `collect_tokens`.
- Within that, we end up in `parse_expr_dot_or_call`, which again calls
  `collect_tokens`.
  - The return value of the `f` call is the expression `0`.
  - This inner call collects tokens for `0` (parser range 10..11) and
    creates a replacement covering `#[cfg(not(FALSE))] 0` (parser range
    0..11).
- We return to the outer `collect_tokens` call. The return value of the
  `f` call is *again* the expression `0`, again with the range 10..11,
  but the replacement from earlier covers the range 0..11. The code
  mistakenly assumes that any attributes from an inner `collect_tokens`
  call fit entirely within the body of the result of an outer
  `collect_tokens` call. So it adjusts the replacement parser range
  0..11 to a node range by subtracting 10, resulting in -10..1. This is
  an invalid range and triggers an assertion failure.

It's tricky to follow, but basically things get complicated when an AST
node is returned from an inner `collect_tokens` call and then returned
again from an outer `collect_token` node without being wrapped in any
kind of additional layer.

This commit changes `collect_tokens` to return early in some extra cases,
avoiding the construction of lazy tokens. In the example above, the
outer `collect_tokens` returns earlier because the `0` token already has
tokens and `self.capture_state.capturing` is `Capturing::No`. This early
return avoids the creation of the invalid range and the assertion
failure.

Fixes #129166. Note: these invalid ranges have been happening for a long
time. #128725 looks like it's at fault only because it introduced the
assertion that catches the invalid ranges.
---
 .../rustc_parse/src/parser/attr_wrapper.rs    | 39 +++++++++++--------
 tests/crashes/129166.rs                       |  7 ----
 .../invalid-node-range-issue-129166.rs        | 11 ++++++
 .../invalid-node-range-issue-129166.stderr    |  8 ++++
 4 files changed, 41 insertions(+), 24 deletions(-)
 delete mode 100644 tests/crashes/129166.rs
 create mode 100644 tests/ui/conditional-compilation/invalid-node-range-issue-129166.rs
 create mode 100644 tests/ui/conditional-compilation/invalid-node-range-issue-129166.stderr

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index 67d1e410c589d..81b683705f308 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -234,6 +234,8 @@ impl<'a> Parser<'a> {
         force_collect: ForceCollect,
         f: impl FnOnce(&mut Self, AttrVec) -> PResult<'a, (R, Trailing, UsePreAttrPos)>,
     ) -> PResult<'a, R> {
+        let possible_capture_mode = self.capture_cfg;
+
         // We must collect if anything could observe the collected tokens, i.e.
         // if any of the following conditions hold.
         // - We are force collecting tokens (because force collection requires
@@ -244,9 +246,9 @@ impl<'a> Parser<'a> {
             // - Our target supports custom inner attributes (custom
             //   inner attribute invocation might require token capturing).
             || R::SUPPORTS_CUSTOM_INNER_ATTRS
-            // - We are in `capture_cfg` mode (which requires tokens if
+            // - We are in "possible capture mode" (which requires tokens if
             //   the parsed node has `#[cfg]` or `#[cfg_attr]` attributes).
-            || self.capture_cfg;
+            || possible_capture_mode;
         if !needs_collection {
             return Ok(f(self, attrs.attrs)?.0);
         }
@@ -267,7 +269,7 @@ impl<'a> Parser<'a> {
             res?
         };
 
-        // When we're not in `capture_cfg` mode, then skip collecting and
+        // When we're not in "definite capture mode", then skip collecting and
         // return early if either of the following conditions hold.
         // - `None`: Our target doesn't support tokens at all (e.g. `NtIdent`).
         // - `Some(Some(_))`: Our target already has tokens set (e.g. we've
@@ -278,7 +280,10 @@ impl<'a> Parser<'a> {
         // Note that this check is independent of `force_collect`. There's no
         // need to collect tokens when we don't support tokens or already have
         // tokens.
-        if !self.capture_cfg && matches!(ret.tokens_mut(), None | Some(Some(_))) {
+        let definite_capture_mode = self.capture_cfg
+            && matches!(self.capture_state.capturing, Capturing::Yes)
+            && has_cfg_or_cfg_attr(ret.attrs());
+        if !definite_capture_mode && matches!(ret.tokens_mut(), None | Some(Some(_))) {
             return Ok(ret);
         }
 
@@ -298,11 +303,11 @@ impl<'a> Parser<'a> {
             //   the earlier `needs_tokens` check, and we don't need to
             //   check `R::SUPPORTS_CUSTOM_INNER_ATTRS`.)
             || needs_tokens(ret.attrs())
-            // - We are in `capture_cfg` mode and there are `#[cfg]` or
-            //   `#[cfg_attr]` attributes. (During normal non-`capture_cfg`
-            //   parsing, we don't need any special capturing for those
-            //   attributes, because they're builtin.)
-            || (self.capture_cfg && has_cfg_or_cfg_attr(ret.attrs()));
+            // - We are in "definite capture mode", which requires that there
+            //   are `#[cfg]` or `#[cfg_attr]` attributes. (During normal
+            //   non-`capture_cfg` parsing, we don't need any special capturing
+            //   for those attributes, because they're builtin.)
+            || definite_capture_mode;
         if !needs_collection {
             return Ok(ret);
         }
@@ -399,20 +404,18 @@ impl<'a> Parser<'a> {
             break_last_token: self.break_last_token,
             node_replacements,
         });
+        let mut tokens_used = false;
 
         // If we support tokens and don't already have them, store the newly captured tokens.
         if let Some(target_tokens @ None) = ret.tokens_mut() {
+            tokens_used = true;
             *target_tokens = Some(tokens.clone());
         }
 
-        // If `capture_cfg` is set and we're inside a recursive call to
-        // `collect_tokens`, then we need to register a replace range if we
-        // have `#[cfg]` or `#[cfg_attr]`. This allows us to run eager
-        // cfg-expansion on the captured token stream.
-        if self.capture_cfg
-            && matches!(self.capture_state.capturing, Capturing::Yes)
-            && has_cfg_or_cfg_attr(ret.attrs())
-        {
+        // If in "definite capture mode" we need to register a replace range
+        // for the `#[cfg]` and/or `#[cfg_attr]` attrs. This allows us to run
+        // eager cfg-expansion on the captured token stream.
+        if definite_capture_mode {
             assert!(!self.break_last_token, "Should not have unglued last token with cfg attr");
 
             // What is the status here when parsing the example code at the top of this method?
@@ -430,6 +433,7 @@ impl<'a> Parser<'a> {
             let start_pos =
                 if has_outer_attrs { attrs.start_pos.unwrap() } else { collect_pos.start_pos };
             let target = AttrsTarget { attrs: ret.attrs().iter().cloned().collect(), tokens };
+            tokens_used = true;
             self.capture_state
                 .parser_replacements
                 .push((ParserRange(start_pos..end_pos), Some(target)));
@@ -439,6 +443,7 @@ impl<'a> Parser<'a> {
             self.capture_state.parser_replacements.clear();
             self.capture_state.inner_attr_parser_ranges.clear();
         }
+        assert!(tokens_used); // check we didn't create `tokens` unnecessarily
         Ok(ret)
     }
 }
diff --git a/tests/crashes/129166.rs b/tests/crashes/129166.rs
deleted file mode 100644
index d3635d410db5e..0000000000000
--- a/tests/crashes/129166.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-//@ known-bug: rust-lang/rust#129166
-
-fn main() {
-    #[cfg_eval]
-    #[cfg]
-    0
-}
diff --git a/tests/ui/conditional-compilation/invalid-node-range-issue-129166.rs b/tests/ui/conditional-compilation/invalid-node-range-issue-129166.rs
new file mode 100644
index 0000000000000..794e6fad3fc22
--- /dev/null
+++ b/tests/ui/conditional-compilation/invalid-node-range-issue-129166.rs
@@ -0,0 +1,11 @@
+// This was triggering an assertion failure in `NodeRange::new`.
+
+#![feature(cfg_eval)]
+#![feature(stmt_expr_attributes)]
+
+fn f() -> u32 {
+    #[cfg_eval] #[cfg(not(FALSE))] 0
+    //~^ ERROR removing an expression is not supported in this position
+}
+
+fn main() {}
diff --git a/tests/ui/conditional-compilation/invalid-node-range-issue-129166.stderr b/tests/ui/conditional-compilation/invalid-node-range-issue-129166.stderr
new file mode 100644
index 0000000000000..0699e182bd5f2
--- /dev/null
+++ b/tests/ui/conditional-compilation/invalid-node-range-issue-129166.stderr
@@ -0,0 +1,8 @@
+error: removing an expression is not supported in this position
+  --> $DIR/invalid-node-range-issue-129166.rs:7:17
+   |
+LL |     #[cfg_eval] #[cfg(not(FALSE))] 0
+   |                 ^^^^^^^^^^^^^^^^^^
+
+error: aborting due to 1 previous error
+

From 0bae33fcd503473aec70aef28b0e08abce965557 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 20 Aug 2024 12:28:39 +1000
Subject: [PATCH 4/8] Avoid nested replacement ranges.

In a case like this:
```
mod a {
    mod b {
        #[cfg_attr(unix, inline)]
        fn f() {
            #[cfg_attr(linux, inline)]
            fn g1() {}
            #[cfg_attr(linux, inline)]
            fn g2() {}
        }
    }
}
```
We currently end up with the following replacement ranges.
- The lazy tokens for `f` has replacement ranges for `g1` and `g2`.
- The lazy tokens for `a` has replacement ranges for `f`, `g1`, and
  `g2`.

I.e. the replacement ranges for `g1` and `g2` are duplicated. In
general, replacement ranges for inner AST nodes are duplicated up the
chain for each nested `collect_tokens` call. And the code that processes
the replacements is careful about the ordering in which the replacements
are applied, to ensure that inner replacements are applied before outer
replacements.

But all of this is unnecessary. If you apply an inner replacement and
then an outer replacement, the outer replacement completely overwrites
the inner replacement.

This commit avoids the duplication by removing replacements from
`self.capture_state.parser_replacements` when they are used. (The effect
on the example above is that the lazy tokesn for `a` no longer include
replacement ranges for `g1` and `g2`.) This eliminates the possibility
of nested replacements on individual AST nodes, which avoids the need
for careful ordering of replacements.
---
 .../rustc_parse/src/parser/attr_wrapper.rs    | 28 +++++--------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index 81b683705f308..a74c87ca2a71e 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -134,9 +134,8 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
                 node_replacements.array_windows()
             {
                 assert!(
-                    node_range.0.end <= next_node_range.0.start
-                        || node_range.0.end >= next_node_range.0.end,
-                    "Node ranges should be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
+                    node_range.0.end <= next_node_range.0.start,
+                    "Node ranges should be disjoint: ({:?}, {:?}) ({:?}, {:?})",
                     node_range,
                     tokens,
                     next_node_range,
@@ -144,20 +143,8 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
                 );
             }
 
-            // Process the replace ranges, starting from the highest start
-            // position and working our way back. If have tokens like:
-            //
-            // `#[cfg(FALSE)] struct Foo { #[cfg(FALSE)] field: bool }`
-            //
-            // Then we will generate replace ranges for both
-            // the `#[cfg(FALSE)] field: bool` and the entire
-            // `#[cfg(FALSE)] struct Foo { #[cfg(FALSE)] field: bool }`
-            //
-            // By starting processing from the replace range with the greatest
-            // start position, we ensure that any (outer) replace range which
-            // encloses another (inner) replace range will fully overwrite the
-            // inner range's replacement.
-            for (node_range, target) in node_replacements.into_iter().rev() {
+            // Process the replace ranges.
+            for (node_range, target) in node_replacements.into_iter() {
                 assert!(
                     !node_range.0.is_empty(),
                     "Cannot replace an empty node range: {:?}",
@@ -364,10 +351,9 @@ impl<'a> Parser<'a> {
             // from `ParserRange` form to `NodeRange` form. We will perform the actual
             // replacement only when we convert the `LazyAttrTokenStream` to an
             // `AttrTokenStream`.
-            self.capture_state.parser_replacements
-                [parser_replacements_start..parser_replacements_end]
-                .iter()
-                .cloned()
+            self.capture_state
+                .parser_replacements
+                .drain(parser_replacements_start..parser_replacements_end)
                 .chain(inner_attr_parser_replacements.into_iter())
                 .map(|(parser_range, data)| {
                     (NodeRange::new(parser_range, collect_pos.start_pos), data)

From 39b38a94e397bb2f55bb7b8e2a5cd69f649cadc1 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 22 Aug 2024 06:58:31 +1000
Subject: [PATCH 5/8] Split the assertion in `NodeRange::new`.

---
 compiler/rustc_parse/src/parser/mod.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
index 61e3fa2e6c560..b72957ace521e 100644
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@@ -238,7 +238,8 @@ impl NodeRange {
     // is the position of the function's start token. This gives
     // `NodeRange(10..15)`.
     fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange {
-        assert!(parser_range.start >= start_pos && parser_range.end >= start_pos);
+        assert!(!parser_range.is_empty());
+        assert!(parser_range.start >= start_pos);
         NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos))
     }
 }

From b7e7f6e903ac28021a4b099d63eefd35f125c2a6 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 21 Aug 2024 15:01:10 +1000
Subject: [PATCH 6/8] Tweak `tests/ui/proc-macro/macro-rules-derive-cfg.rs`.

- Trim some unnecessary fat from the type declaration.
- Add another attribute, to make it a stronger test of `cfg_attr`
  processing. Note that the current output is incorrect, because it
  duplicates the added attribute. The next commit will fix this.
---
 tests/ui/proc-macro/macro-rules-derive-cfg.rs |  12 +-
 .../proc-macro/macro-rules-derive-cfg.stdout  | 188 +++++++++---------
 2 files changed, 102 insertions(+), 98 deletions(-)

diff --git a/tests/ui/proc-macro/macro-rules-derive-cfg.rs b/tests/ui/proc-macro/macro-rules-derive-cfg.rs
index 68026a60be686..c34f1695761cf 100644
--- a/tests/ui/proc-macro/macro-rules-derive-cfg.rs
+++ b/tests/ui/proc-macro/macro-rules-derive-cfg.rs
@@ -14,17 +14,15 @@ extern crate test_macros;
 macro_rules! produce_it {
     ($expr:expr) => {
         #[derive(Print)]
-        struct Foo {
-            val: [bool; {
-                let a = #[cfg_attr(not(FALSE), rustc_dummy(first))] $expr;
-                0
-            }]
-        }
+        struct Foo(
+            [bool; #[cfg_attr(not(FALSE), rustc_dummy(first))] $expr]
+        );
     }
 }
 
 produce_it!(#[cfg_attr(not(FALSE), rustc_dummy(second))] {
-    #![cfg_attr(not(FALSE), allow(unused))]
+    #![cfg_attr(not(FALSE), rustc_dummy(third))]
+    #[cfg_attr(not(FALSE), rustc_dummy(fourth))]
     30
 });
 
diff --git a/tests/ui/proc-macro/macro-rules-derive-cfg.stdout b/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
index fadf210127e4d..5205a4adf2547 100644
--- a/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
+++ b/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
@@ -1,21 +1,11 @@
-PRINT-DERIVE INPUT (DISPLAY): struct Foo
+PRINT-DERIVE INPUT (DISPLAY): struct
+Foo([bool; #[rustc_dummy(first)] #[rustc_dummy(second)]
+{ #![rustc_dummy(third)] #[rustc_dummy(fourth)] #[rustc_dummy(fourth)] 30 }]);
+PRINT-DERIVE DEEP-RE-COLLECTED (DISPLAY): struct
+Foo([bool; #[rustc_dummy(first)] #[rustc_dummy(second)]
 {
-    val :
-    [bool;
-    {
-        let a = #[rustc_dummy(first)] #[rustc_dummy(second)]
-        { #![allow(unused)] 30 }; 0
-    }]
-}
-PRINT-DERIVE DEEP-RE-COLLECTED (DISPLAY): struct Foo
-{
-    val :
-    [bool;
-    {
-        let a = #[rustc_dummy(first)] #[rustc_dummy(second)]
-        { #! [allow(unused)] 30 }; 0
-    }]
-}
+    #! [rustc_dummy(third)] #[rustc_dummy(fourth)] #[rustc_dummy(fourth)] 30
+}]);
 PRINT-DERIVE INPUT (DEBUG): TokenStream [
     Ident {
         ident: "struct",
@@ -26,155 +16,171 @@ PRINT-DERIVE INPUT (DEBUG): TokenStream [
         span: $DIR/macro-rules-derive-cfg.rs:17:16: 17:19 (#3),
     },
     Group {
-        delimiter: Brace,
+        delimiter: Parenthesis,
         stream: TokenStream [
-            Ident {
-                ident: "val",
-                span: $DIR/macro-rules-derive-cfg.rs:18:13: 18:16 (#3),
-            },
-            Punct {
-                ch: ':',
-                spacing: Alone,
-                span: $DIR/macro-rules-derive-cfg.rs:18:16: 18:17 (#3),
-            },
             Group {
                 delimiter: Bracket,
                 stream: TokenStream [
                     Ident {
                         ident: "bool",
-                        span: $DIR/macro-rules-derive-cfg.rs:18:19: 18:23 (#3),
+                        span: $DIR/macro-rules-derive-cfg.rs:18:14: 18:18 (#3),
                     },
                     Punct {
                         ch: ';',
                         spacing: Alone,
-                        span: $DIR/macro-rules-derive-cfg.rs:18:23: 18:24 (#3),
+                        span: $DIR/macro-rules-derive-cfg.rs:18:18: 18:19 (#3),
+                    },
+                    Punct {
+                        ch: '#',
+                        spacing: Alone,
+                        span: $DIR/macro-rules-derive-cfg.rs:18:20: 18:21 (#3),
                     },
                     Group {
-                        delimiter: Brace,
+                        delimiter: Bracket,
                         stream: TokenStream [
                             Ident {
-                                ident: "let",
-                                span: $DIR/macro-rules-derive-cfg.rs:19:17: 19:20 (#3),
+                                ident: "rustc_dummy",
+                                span: $DIR/macro-rules-derive-cfg.rs:18:43: 18:54 (#3),
+                            },
+                            Group {
+                                delimiter: Parenthesis,
+                                stream: TokenStream [
+                                    Ident {
+                                        ident: "first",
+                                        span: $DIR/macro-rules-derive-cfg.rs:18:55: 18:60 (#3),
+                                    },
+                                ],
+                                span: $DIR/macro-rules-derive-cfg.rs:18:54: 18:61 (#3),
                             },
+                        ],
+                        span: $DIR/macro-rules-derive-cfg.rs:18:21: 18:63 (#3),
+                    },
+                    Punct {
+                        ch: '#',
+                        spacing: Alone,
+                        span: $DIR/macro-rules-derive-cfg.rs:23:13: 23:14 (#0),
+                    },
+                    Group {
+                        delimiter: Bracket,
+                        stream: TokenStream [
                             Ident {
-                                ident: "a",
-                                span: $DIR/macro-rules-derive-cfg.rs:19:21: 19:22 (#3),
+                                ident: "rustc_dummy",
+                                span: $DIR/macro-rules-derive-cfg.rs:23:36: 23:47 (#0),
                             },
-                            Punct {
-                                ch: '=',
-                                spacing: Alone,
-                                span: $DIR/macro-rules-derive-cfg.rs:19:23: 19:24 (#3),
+                            Group {
+                                delimiter: Parenthesis,
+                                stream: TokenStream [
+                                    Ident {
+                                        ident: "second",
+                                        span: $DIR/macro-rules-derive-cfg.rs:23:48: 23:54 (#0),
+                                    },
+                                ],
+                                span: $DIR/macro-rules-derive-cfg.rs:23:47: 23:55 (#0),
                             },
+                        ],
+                        span: $DIR/macro-rules-derive-cfg.rs:23:14: 23:57 (#0),
+                    },
+                    Group {
+                        delimiter: Brace,
+                        stream: TokenStream [
                             Punct {
                                 ch: '#',
+                                spacing: Joint,
+                                span: $DIR/macro-rules-derive-cfg.rs:24:5: 24:6 (#0),
+                            },
+                            Punct {
+                                ch: '!',
                                 spacing: Alone,
-                                span: $DIR/macro-rules-derive-cfg.rs:19:25: 19:26 (#3),
+                                span: $DIR/macro-rules-derive-cfg.rs:24:6: 24:7 (#0),
                             },
                             Group {
                                 delimiter: Bracket,
                                 stream: TokenStream [
                                     Ident {
                                         ident: "rustc_dummy",
-                                        span: $DIR/macro-rules-derive-cfg.rs:19:48: 19:59 (#3),
+                                        span: $DIR/macro-rules-derive-cfg.rs:24:29: 24:40 (#0),
                                     },
                                     Group {
                                         delimiter: Parenthesis,
                                         stream: TokenStream [
                                             Ident {
-                                                ident: "first",
-                                                span: $DIR/macro-rules-derive-cfg.rs:19:60: 19:65 (#3),
+                                                ident: "third",
+                                                span: $DIR/macro-rules-derive-cfg.rs:24:41: 24:46 (#0),
                                             },
                                         ],
-                                        span: $DIR/macro-rules-derive-cfg.rs:19:59: 19:66 (#3),
+                                        span: $DIR/macro-rules-derive-cfg.rs:24:40: 24:47 (#0),
                                     },
                                 ],
-                                span: $DIR/macro-rules-derive-cfg.rs:19:26: 19:68 (#3),
+                                span: $DIR/macro-rules-derive-cfg.rs:24:7: 24:49 (#0),
                             },
                             Punct {
                                 ch: '#',
                                 spacing: Alone,
-                                span: $DIR/macro-rules-derive-cfg.rs:26:13: 26:14 (#0),
+                                span: $DIR/macro-rules-derive-cfg.rs:25:5: 25:6 (#0),
                             },
                             Group {
                                 delimiter: Bracket,
                                 stream: TokenStream [
                                     Ident {
                                         ident: "rustc_dummy",
-                                        span: $DIR/macro-rules-derive-cfg.rs:26:36: 26:47 (#0),
+                                        span: $DIR/macro-rules-derive-cfg.rs:25:28: 25:39 (#0),
                                     },
                                     Group {
                                         delimiter: Parenthesis,
                                         stream: TokenStream [
                                             Ident {
-                                                ident: "second",
-                                                span: $DIR/macro-rules-derive-cfg.rs:26:48: 26:54 (#0),
+                                                ident: "fourth",
+                                                span: $DIR/macro-rules-derive-cfg.rs:25:40: 25:46 (#0),
                                             },
                                         ],
-                                        span: $DIR/macro-rules-derive-cfg.rs:26:47: 26:55 (#0),
+                                        span: $DIR/macro-rules-derive-cfg.rs:25:39: 25:47 (#0),
                                     },
                                 ],
-                                span: $DIR/macro-rules-derive-cfg.rs:26:14: 26:57 (#0),
+                                span: $DIR/macro-rules-derive-cfg.rs:25:6: 25:49 (#0),
+                            },
+                            Punct {
+                                ch: '#',
+                                spacing: Alone,
+                                span: $DIR/macro-rules-derive-cfg.rs:25:5: 25:6 (#0),
                             },
                             Group {
-                                delimiter: Brace,
+                                delimiter: Bracket,
                                 stream: TokenStream [
-                                    Punct {
-                                        ch: '#',
-                                        spacing: Joint,
-                                        span: $DIR/macro-rules-derive-cfg.rs:27:5: 27:6 (#0),
-                                    },
-                                    Punct {
-                                        ch: '!',
-                                        spacing: Alone,
-                                        span: $DIR/macro-rules-derive-cfg.rs:27:6: 27:7 (#0),
+                                    Ident {
+                                        ident: "rustc_dummy",
+                                        span: $DIR/macro-rules-derive-cfg.rs:25:28: 25:39 (#0),
                                     },
                                     Group {
-                                        delimiter: Bracket,
+                                        delimiter: Parenthesis,
                                         stream: TokenStream [
                                             Ident {
-                                                ident: "allow",
-                                                span: $DIR/macro-rules-derive-cfg.rs:27:29: 27:34 (#0),
-                                            },
-                                            Group {
-                                                delimiter: Parenthesis,
-                                                stream: TokenStream [
-                                                    Ident {
-                                                        ident: "unused",
-                                                        span: $DIR/macro-rules-derive-cfg.rs:27:35: 27:41 (#0),
-                                                    },
-                                                ],
-                                                span: $DIR/macro-rules-derive-cfg.rs:27:34: 27:42 (#0),
+                                                ident: "fourth",
+                                                span: $DIR/macro-rules-derive-cfg.rs:25:40: 25:46 (#0),
                                             },
                                         ],
-                                        span: $DIR/macro-rules-derive-cfg.rs:27:7: 27:44 (#0),
-                                    },
-                                    Literal {
-                                        kind: Integer,
-                                        symbol: "30",
-                                        suffix: None,
-                                        span: $DIR/macro-rules-derive-cfg.rs:28:5: 28:7 (#0),
+                                        span: $DIR/macro-rules-derive-cfg.rs:25:39: 25:47 (#0),
                                     },
                                 ],
-                                span: $DIR/macro-rules-derive-cfg.rs:26:58: 29:2 (#0),
-                            },
-                            Punct {
-                                ch: ';',
-                                spacing: Alone,
-                                span: $DIR/macro-rules-derive-cfg.rs:19:74: 19:75 (#3),
+                                span: $DIR/macro-rules-derive-cfg.rs:25:6: 25:49 (#0),
                             },
                             Literal {
                                 kind: Integer,
-                                symbol: "0",
+                                symbol: "30",
                                 suffix: None,
-                                span: $DIR/macro-rules-derive-cfg.rs:20:17: 20:18 (#3),
+                                span: $DIR/macro-rules-derive-cfg.rs:26:5: 26:7 (#0),
                             },
                         ],
-                        span: $DIR/macro-rules-derive-cfg.rs:18:25: 21:14 (#3),
+                        span: $DIR/macro-rules-derive-cfg.rs:23:58: 27:2 (#0),
                     },
                 ],
-                span: $DIR/macro-rules-derive-cfg.rs:18:18: 21:15 (#3),
+                span: $DIR/macro-rules-derive-cfg.rs:18:13: 18:70 (#3),
             },
         ],
-        span: $DIR/macro-rules-derive-cfg.rs:17:20: 22:10 (#3),
+        span: $DIR/macro-rules-derive-cfg.rs:17:19: 19:10 (#3),
+    },
+    Punct {
+        ch: ';',
+        spacing: Alone,
+        span: $DIR/macro-rules-derive-cfg.rs:19:10: 19:11 (#3),
     },
 ]

From 1fdabfbebbb12b2836f91aeec3157fd092f9a8ad Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 21 Aug 2024 14:16:42 +1000
Subject: [PATCH 7/8] Avoid double-handling of attributes in `collect_tokens`.

By keeping track of attributes that have been previously processed.

This fixes the `macro-rules-derive-cfg.stdout` test, and is necessary
for #124141 which removes nonterminals.

Also shrink the `SmallVec` inline size used in `IntervalSet`. 2 gives
slightly better perf than 4 now that there's an `IntervalSet` in
`Parser`, which is cloned reasonably often.
---
 Cargo.lock                                    |  1 +
 compiler/rustc_index/src/interval.rs          |  2 +-
 compiler/rustc_parse/Cargo.toml               |  1 +
 .../rustc_parse/src/parser/attr_wrapper.rs    | 23 +++++++++++---
 compiler/rustc_parse/src/parser/mod.rs        |  7 ++++-
 .../proc-macro/macro-rules-derive-cfg.stdout  | 31 ++-----------------
 6 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a18219b56837e..4ce6029afe8a5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4172,6 +4172,7 @@ dependencies = [
  "rustc_errors",
  "rustc_feature",
  "rustc_fluent_macro",
+ "rustc_index",
  "rustc_lexer",
  "rustc_macros",
  "rustc_session",
diff --git a/compiler/rustc_index/src/interval.rs b/compiler/rustc_index/src/interval.rs
index be028feca605f..503470f896d09 100644
--- a/compiler/rustc_index/src/interval.rs
+++ b/compiler/rustc_index/src/interval.rs
@@ -18,7 +18,7 @@ mod tests;
 #[derive(Debug, Clone)]
 pub struct IntervalSet<I> {
     // Start, end
-    map: SmallVec<[(u32, u32); 4]>,
+    map: SmallVec<[(u32, u32); 2]>,
     domain: usize,
     _data: PhantomData<I>,
 }
diff --git a/compiler/rustc_parse/Cargo.toml b/compiler/rustc_parse/Cargo.toml
index b33896154f239..c59ae48a07d03 100644
--- a/compiler/rustc_parse/Cargo.toml
+++ b/compiler/rustc_parse/Cargo.toml
@@ -12,6 +12,7 @@ rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_errors = { path = "../rustc_errors" }
 rustc_feature = { path = "../rustc_feature" }
 rustc_fluent_macro = { path = "../rustc_fluent_macro" }
+rustc_index = { path = "../rustc_index" }
 rustc_lexer = { path = "../rustc_lexer" }
 rustc_macros = { path = "../rustc_macros" }
 rustc_session = { path = "../rustc_session" }
diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index a74c87ca2a71e..b2abe5464b463 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -256,6 +256,20 @@ impl<'a> Parser<'a> {
             res?
         };
 
+        // Ignore any attributes we've previously processed. This happens when
+        // an inner call to `collect_tokens` returns an AST node and then an
+        // outer call ends up with the same AST node without any additional
+        // wrapping layer.
+        let ret_attrs: AttrVec = ret
+            .attrs()
+            .iter()
+            .cloned()
+            .filter(|attr| {
+                let is_unseen = self.capture_state.seen_attrs.insert(attr.id);
+                is_unseen
+            })
+            .collect();
+
         // When we're not in "definite capture mode", then skip collecting and
         // return early if either of the following conditions hold.
         // - `None`: Our target doesn't support tokens at all (e.g. `NtIdent`).
@@ -269,7 +283,7 @@ impl<'a> Parser<'a> {
         // tokens.
         let definite_capture_mode = self.capture_cfg
             && matches!(self.capture_state.capturing, Capturing::Yes)
-            && has_cfg_or_cfg_attr(ret.attrs());
+            && has_cfg_or_cfg_attr(&ret_attrs);
         if !definite_capture_mode && matches!(ret.tokens_mut(), None | Some(Some(_))) {
             return Ok(ret);
         }
@@ -289,7 +303,7 @@ impl<'a> Parser<'a> {
             //   outer and inner attributes. So this check is more precise than
             //   the earlier `needs_tokens` check, and we don't need to
             //   check `R::SUPPORTS_CUSTOM_INNER_ATTRS`.)
-            || needs_tokens(ret.attrs())
+            || needs_tokens(&ret_attrs)
             // - We are in "definite capture mode", which requires that there
             //   are `#[cfg]` or `#[cfg_attr]` attributes. (During normal
             //   non-`capture_cfg` parsing, we don't need any special capturing
@@ -328,7 +342,7 @@ impl<'a> Parser<'a> {
         // `Parser::parse_inner_attributes`, and pair them in a `ParserReplacement` with `None`,
         // which means the relevant tokens will be removed. (More details below.)
         let mut inner_attr_parser_replacements = Vec::new();
-        for attr in ret.attrs() {
+        for attr in ret_attrs.iter() {
             if attr.style == ast::AttrStyle::Inner {
                 if let Some(inner_attr_parser_range) =
                     self.capture_state.inner_attr_parser_ranges.remove(&attr.id)
@@ -418,7 +432,7 @@ impl<'a> Parser<'a> {
             // cfg-expand this AST node.
             let start_pos =
                 if has_outer_attrs { attrs.start_pos.unwrap() } else { collect_pos.start_pos };
-            let target = AttrsTarget { attrs: ret.attrs().iter().cloned().collect(), tokens };
+            let target = AttrsTarget { attrs: ret_attrs, tokens };
             tokens_used = true;
             self.capture_state
                 .parser_replacements
@@ -428,6 +442,7 @@ impl<'a> Parser<'a> {
             // the outermost call to this method.
             self.capture_state.parser_replacements.clear();
             self.capture_state.inner_attr_parser_ranges.clear();
+            self.capture_state.seen_attrs.clear();
         }
         assert!(tokens_used); // check we didn't create `tokens` unnecessarily
         Ok(ret)
diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
index b72957ace521e..3ffc5fa401100 100644
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@@ -35,6 +35,7 @@ use rustc_ast_pretty::pprust;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{Applicability, Diag, FatalError, MultiSpan, PResult};
+use rustc_index::interval::IntervalSet;
 use rustc_session::parse::ParseSess;
 use rustc_span::symbol::{kw, sym, Ident, Symbol};
 use rustc_span::{Span, DUMMY_SP};
@@ -183,7 +184,7 @@ pub struct Parser<'a> {
 // This type is used a lot, e.g. it's cloned when matching many declarative macro rules with nonterminals. Make sure
 // it doesn't unintentionally get bigger.
 #[cfg(target_pointer_width = "64")]
-rustc_data_structures::static_assert_size!(Parser<'_>, 256);
+rustc_data_structures::static_assert_size!(Parser<'_>, 288);
 
 /// Stores span information about a closure.
 #[derive(Clone, Debug)]
@@ -261,6 +262,9 @@ struct CaptureState {
     capturing: Capturing,
     parser_replacements: Vec<ParserReplacement>,
     inner_attr_parser_ranges: FxHashMap<AttrId, ParserRange>,
+    // `IntervalSet` is good for perf because attrs are mostly added to this
+    // set in contiguous ranges.
+    seen_attrs: IntervalSet<AttrId>,
 }
 
 /// Iterator over a `TokenStream` that produces `Token`s. It's a bit odd that
@@ -458,6 +462,7 @@ impl<'a> Parser<'a> {
                 capturing: Capturing::No,
                 parser_replacements: Vec::new(),
                 inner_attr_parser_ranges: Default::default(),
+                seen_attrs: IntervalSet::new(u32::MAX as usize),
             },
             current_closure: None,
             recovery: Recovery::Allowed,
diff --git a/tests/ui/proc-macro/macro-rules-derive-cfg.stdout b/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
index 5205a4adf2547..c1e46b50d40c2 100644
--- a/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
+++ b/tests/ui/proc-macro/macro-rules-derive-cfg.stdout
@@ -1,11 +1,9 @@
 PRINT-DERIVE INPUT (DISPLAY): struct
 Foo([bool; #[rustc_dummy(first)] #[rustc_dummy(second)]
-{ #![rustc_dummy(third)] #[rustc_dummy(fourth)] #[rustc_dummy(fourth)] 30 }]);
+{ #![rustc_dummy(third)] #[rustc_dummy(fourth)] 30 }]);
 PRINT-DERIVE DEEP-RE-COLLECTED (DISPLAY): struct
 Foo([bool; #[rustc_dummy(first)] #[rustc_dummy(second)]
-{
-    #! [rustc_dummy(third)] #[rustc_dummy(fourth)] #[rustc_dummy(fourth)] 30
-}]);
+{ #! [rustc_dummy(third)] #[rustc_dummy(fourth)] 30 }]);
 PRINT-DERIVE INPUT (DEBUG): TokenStream [
     Ident {
         ident: "struct",
@@ -138,31 +136,6 @@ PRINT-DERIVE INPUT (DEBUG): TokenStream [
                                 ],
                                 span: $DIR/macro-rules-derive-cfg.rs:25:6: 25:49 (#0),
                             },
-                            Punct {
-                                ch: '#',
-                                spacing: Alone,
-                                span: $DIR/macro-rules-derive-cfg.rs:25:5: 25:6 (#0),
-                            },
-                            Group {
-                                delimiter: Bracket,
-                                stream: TokenStream [
-                                    Ident {
-                                        ident: "rustc_dummy",
-                                        span: $DIR/macro-rules-derive-cfg.rs:25:28: 25:39 (#0),
-                                    },
-                                    Group {
-                                        delimiter: Parenthesis,
-                                        stream: TokenStream [
-                                            Ident {
-                                                ident: "fourth",
-                                                span: $DIR/macro-rules-derive-cfg.rs:25:40: 25:46 (#0),
-                                            },
-                                        ],
-                                        span: $DIR/macro-rules-derive-cfg.rs:25:39: 25:47 (#0),
-                                    },
-                                ],
-                                span: $DIR/macro-rules-derive-cfg.rs:25:6: 25:49 (#0),
-                            },
                             Literal {
                                 kind: Integer,
                                 symbol: "30",

From d4bf28c014772f7233fc396992164c72c08fbfd9 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 22 Aug 2024 13:14:40 +1000
Subject: [PATCH 8/8] Optimize `collect_tokens` a little.

Use `Cow` to avoid cloning `ret.attrs()` unless necessary. This requires
moving some things around to satisfy the borrow checker.
---
 .../rustc_parse/src/parser/attr_wrapper.rs    | 63 ++++++++++++-------
 1 file changed, 40 insertions(+), 23 deletions(-)

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index b2abe5464b463..205cca830b2e0 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::{iter, mem};
 
 use rustc_ast::token::{Delimiter, Token, TokenKind};
@@ -6,6 +7,7 @@ use rustc_ast::tokenstream::{
     Spacing, ToAttrTokenStream,
 };
 use rustc_ast::{self as ast, AttrVec, Attribute, HasAttrs, HasTokens};
+use rustc_data_structures::fx::FxHashSet;
 use rustc_errors::PResult;
 use rustc_session::parse::ParseSess;
 use rustc_span::{sym, Span, DUMMY_SP};
@@ -256,27 +258,40 @@ impl<'a> Parser<'a> {
             res?
         };
 
+        // - `None`: Our target doesn't support tokens at all (e.g. `NtIdent`).
+        // - `Some(None)`: Our target supports tokens and has none.
+        // - `Some(Some(_))`: Our target already has tokens set (e.g. we've
+        //   parsed something like `#[my_attr] $item`).
+        let ret_can_hold_tokens = matches!(ret.tokens_mut(), Some(None));
+
         // Ignore any attributes we've previously processed. This happens when
         // an inner call to `collect_tokens` returns an AST node and then an
         // outer call ends up with the same AST node without any additional
         // wrapping layer.
-        let ret_attrs: AttrVec = ret
-            .attrs()
-            .iter()
-            .cloned()
-            .filter(|attr| {
-                let is_unseen = self.capture_state.seen_attrs.insert(attr.id);
-                is_unseen
-            })
-            .collect();
+        let mut seen_indices = FxHashSet::default();
+        for (i, attr) in ret.attrs().iter().enumerate() {
+            let is_unseen = self.capture_state.seen_attrs.insert(attr.id);
+            if !is_unseen {
+                seen_indices.insert(i);
+            }
+        }
+        let ret_attrs: Cow<'_, [Attribute]> =
+            if seen_indices.is_empty() {
+                Cow::Borrowed(ret.attrs())
+            } else {
+                let ret_attrs =
+                    ret.attrs()
+                        .iter()
+                        .enumerate()
+                        .filter_map(|(i, attr)| {
+                            if seen_indices.contains(&i) { None } else { Some(attr.clone()) }
+                        })
+                        .collect();
+                Cow::Owned(ret_attrs)
+            };
 
         // When we're not in "definite capture mode", then skip collecting and
-        // return early if either of the following conditions hold.
-        // - `None`: Our target doesn't support tokens at all (e.g. `NtIdent`).
-        // - `Some(Some(_))`: Our target already has tokens set (e.g. we've
-        //   parsed something like `#[my_attr] $item`). The actual parsing code
-        //   takes care of prepending any attributes to the nonterminal, so we
-        //   don't need to modify the already captured tokens.
+        // return early if `ret` doesn't support tokens or already has some.
         //
         // Note that this check is independent of `force_collect`. There's no
         // need to collect tokens when we don't support tokens or already have
@@ -284,7 +299,7 @@ impl<'a> Parser<'a> {
         let definite_capture_mode = self.capture_cfg
             && matches!(self.capture_state.capturing, Capturing::Yes)
             && has_cfg_or_cfg_attr(&ret_attrs);
-        if !definite_capture_mode && matches!(ret.tokens_mut(), None | Some(Some(_))) {
+        if !definite_capture_mode && !ret_can_hold_tokens {
             return Ok(ret);
         }
 
@@ -406,12 +421,6 @@ impl<'a> Parser<'a> {
         });
         let mut tokens_used = false;
 
-        // If we support tokens and don't already have them, store the newly captured tokens.
-        if let Some(target_tokens @ None) = ret.tokens_mut() {
-            tokens_used = true;
-            *target_tokens = Some(tokens.clone());
-        }
-
         // If in "definite capture mode" we need to register a replace range
         // for the `#[cfg]` and/or `#[cfg_attr]` attrs. This allows us to run
         // eager cfg-expansion on the captured token stream.
@@ -432,7 +441,8 @@ impl<'a> Parser<'a> {
             // cfg-expand this AST node.
             let start_pos =
                 if has_outer_attrs { attrs.start_pos.unwrap() } else { collect_pos.start_pos };
-            let target = AttrsTarget { attrs: ret_attrs, tokens };
+            let target =
+                AttrsTarget { attrs: ret_attrs.iter().cloned().collect(), tokens: tokens.clone() };
             tokens_used = true;
             self.capture_state
                 .parser_replacements
@@ -444,6 +454,13 @@ impl<'a> Parser<'a> {
             self.capture_state.inner_attr_parser_ranges.clear();
             self.capture_state.seen_attrs.clear();
         }
+
+        // If we support tokens and don't already have them, store the newly captured tokens.
+        if let Some(target_tokens @ None) = ret.tokens_mut() {
+            tokens_used = true;
+            *target_tokens = Some(tokens);
+        }
+
         assert!(tokens_used); // check we didn't create `tokens` unnecessarily
         Ok(ret)
     }