From b7e6717679567ae8bd03cd1fe671fb94f419e53f Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 10 Oct 2023 15:08:26 +0200
Subject: [PATCH] Add `_CCCL_IMPLICIT_SYSTEM_HEADER` to cub headers

---
 cub/cub/agent/agent_adjacent_difference.cuh   |    5 +-
 cub/cub/agent/agent_batch_memcpy.cuh          |    4 +
 cub/cub/agent/agent_histogram.cuh             |   11 +-
 cub/cub/agent/agent_merge_sort.cuh            |    3 +
 cub/cub/agent/agent_radix_sort_downsweep.cuh  |   15 +-
 cub/cub/agent/agent_radix_sort_histogram.cuh  |   11 +-
 cub/cub/agent/agent_radix_sort_onesweep.cuh   |   29 +-
 cub/cub/agent/agent_radix_sort_upsweep.cuh    |    9 +-
 cub/cub/agent/agent_reduce.cuh                |   17 +-
 cub/cub/agent/agent_reduce_by_key.cuh         |   11 +-
 cub/cub/agent/agent_rle.cuh                   |   39 +-
 cub/cub/agent/agent_scan.cuh                  |   19 +-
 cub/cub/agent/agent_scan_by_key.cuh           |   19 +-
 cub/cub/agent/agent_segment_fixup.cuh         |    7 +-
 cub/cub/agent/agent_segmented_radix_sort.cuh  |    7 +-
 cub/cub/agent/agent_select_if.cuh             |   21 +-
 cub/cub/agent/agent_spmv_orig.cuh             |    5 +-
 cub/cub/agent/agent_sub_warp_merge_sort.cuh   |   13 +-
 cub/cub/agent/agent_three_way_partition.cuh   |    5 +-
 cub/cub/agent/agent_unique_by_key.cuh         |   16 +-
 cub/cub/agent/single_pass_scan_operators.cuh  |   63 +-
 cub/cub/block/block_adjacent_difference.cuh   |   23 +-
 cub/cub/block/block_discontinuity.cuh         |    7 +-
 cub/cub/block/block_exchange.cuh              |    9 +-
 cub/cub/block/block_histogram.cuh             |    9 +-
 cub/cub/block/block_load.cuh                  |    5 +-
 cub/cub/block/block_merge_sort.cuh            |    4 +
 cub/cub/block/block_radix_rank.cuh            |   43 +-
 cub/cub/block/block_radix_sort.cuh            |  363 ++---
 cub/cub/block/block_raking_layout.cuh         |    7 +-
 cub/cub/block/block_reduce.cuh                |   15 +-
 cub/cub/block/block_run_length_decode.cuh     |    3 +
 cub/cub/block/block_scan.cuh                  |   27 +-
 cub/cub/block/block_shuffle.cuh               |    7 +-
 cub/cub/block/block_store.cuh                 |   11 +-
 cub/cub/block/radix_rank_sort_operations.cuh  |   23 +-
 .../block_histogram_atomic.cuh                |    8 +-
 .../specializations/block_histogram_sort.cuh  |    9 +-
 .../specializations/block_reduce_raking.cuh   |    9 +-
 .../block_reduce_raking_commutative_only.cuh  |    9 +-
 .../block_reduce_warp_reductions.cuh          |    9 +-
 .../specializations/block_scan_raking.cuh     |   10 +-
 .../specializations/block_scan_warp_scans.cuh |    9 +-
 cub/cub/config.cuh                            |    5 +
 cub/cub/cub.cuh                               |    4 +-
 cub/cub/detail/choose_offset.cuh              |    5 +-
 cub/cub/detail/cpp_compatibility.cuh          |    4 +-
 cub/cub/detail/detect_cuda_runtime.cuh        |    5 +
 cub/cub/detail/device_double_buffer.cuh       |    4 +
 cub/cub/detail/device_synchronize.cuh         |    6 +-
 cub/cub/detail/exec_check_disable.cuh         |    4 +-
 cub/cub/detail/strong_load.cuh                |    5 +-
 cub/cub/detail/strong_store.cuh               |    5 +-
 cub/cub/detail/temporary_storage.cuh          |    4 +
 cub/cub/detail/type_traits.cuh                |    6 +-
 cub/cub/detail/uninitialized_copy.cuh         |   15 +-
 cub/cub/device/device_adjacent_difference.cuh |    5 +-
 cub/cub/device/device_copy.cuh                |    5 +-
 cub/cub/device/device_histogram.cuh           |  631 ++++----
 cub/cub/device/device_memcpy.cuh              |    5 +-
 cub/cub/device/device_merge_sort.cuh          |    5 +-
 cub/cub/device/device_partition.cuh           |   23 +-
 cub/cub/device/device_radix_sort.cuh          | 1347 +++++++++--------
 cub/cub/device/device_reduce.cuh              |  399 ++---
 cub/cub/device/device_run_length_encode.cuh   |    5 +-
 cub/cub/device/device_scan.cuh                |  741 ++++-----
 .../device/device_segmented_radix_sort.cuh    |  773 +++++-----
 cub/cub/device/device_segmented_reduce.cuh    |    5 +-
 cub/cub/device/device_segmented_sort.cuh      |  169 ++-
 cub/cub/device/device_select.cuh              |  377 ++---
 cub/cub/device/device_spmv.cuh                |    4 +
 .../dispatch/dispatch_adjacent_difference.cuh |   13 +-
 .../device/dispatch/dispatch_batch_memcpy.cuh |    5 +-
 .../device/dispatch/dispatch_histogram.cuh    |    9 +-
 .../device/dispatch/dispatch_merge_sort.cuh   |    4 +
 .../device/dispatch/dispatch_radix_sort.cuh   |   85 +-
 cub/cub/device/dispatch/dispatch_reduce.cuh   |    5 +-
 .../dispatch/dispatch_reduce_by_key.cuh       |    9 +-
 cub/cub/device/dispatch/dispatch_rle.cuh      |    9 +-
 cub/cub/device/dispatch/dispatch_scan.cuh     |   17 +-
 .../device/dispatch/dispatch_scan_by_key.cuh  |   17 +-
 .../dispatch/dispatch_segmented_sort.cuh      |    4 +
 .../device/dispatch/dispatch_select_if.cuh    |  103 +-
 .../device/dispatch/dispatch_spmv_orig.cuh    |   11 +-
 .../dispatch/dispatch_three_way_partition.cuh |    5 +-
 .../dispatch/dispatch_unique_by_key.cuh       |    2 +-
 .../dispatch/tuning/tuning_histogram.cuh      |    5 +-
 .../dispatch/tuning/tuning_reduce_by_key.cuh  |    5 +-
 .../tuning/tuning_run_length_encode.cuh       |   19 +-
 .../device/dispatch/tuning/tuning_scan.cuh    |    9 +-
 .../dispatch/tuning/tuning_scan_by_key.cuh    |   25 +-
 .../dispatch/tuning/tuning_select_if.cuh      |   25 +-
 .../tuning/tuning_three_way_partition.cuh     |    5 +-
 .../dispatch/tuning/tuning_unique_by_key.cuh  |    7 +-
 cub/cub/grid/grid_barrier.cuh                 |   13 +-
 cub/cub/grid/grid_even_share.cuh              |    9 +-
 cub/cub/grid/grid_mapping.cuh                 |    3 +
 cub/cub/grid/grid_queue.cuh                   |    5 +-
 cub/cub/host/mutex.cuh                        |    9 +-
 cub/cub/iterator/arg_index_input_iterator.cuh |    9 +-
 .../cache_modified_input_iterator.cuh         |    5 +-
 .../cache_modified_output_iterator.cuh        |    9 +-
 cub/cub/iterator/constant_input_iterator.cuh  |    9 +-
 cub/cub/iterator/counting_input_iterator.cuh  |    9 +-
 cub/cub/iterator/discard_output_iterator.cuh  |   10 +-
 cub/cub/iterator/tex_obj_input_iterator.cuh   |    5 +-
 cub/cub/iterator/tex_ref_input_iterator.cuh   |    5 +-
 cub/cub/iterator/transform_input_iterator.cuh |    9 +-
 cub/cub/thread/thread_load.cuh                |    5 +-
 cub/cub/thread/thread_operators.cuh           |   33 +-
 cub/cub/thread/thread_reduce.cuh              |   15 +-
 cub/cub/thread/thread_scan.cuh                |    7 +-
 cub/cub/thread/thread_search.cuh              |    9 +-
 cub/cub/thread/thread_sort.cuh                |    3 +
 cub/cub/thread/thread_store.cuh               |    5 +-
 cub/cub/util_allocator.cuh                    |   36 +-
 cub/cub/util_arch.cuh                         |   12 +-
 cub/cub/util_compiler.cuh                     |    5 +
 cub/cub/util_cpp_dialect.cuh                  |    4 +
 cub/cub/util_debug.cuh                        |   27 +-
 cub/cub/util_deprecated.cuh                   |    6 +-
 cub/cub/util_device.cuh                       |   12 +-
 cub/cub/util_macro.cuh                        |   10 +-
 cub/cub/util_math.cuh                         |    7 +-
 cub/cub/util_namespace.cuh                    |    5 +-
 cub/cub/util_ptx.cuh                          |   50 +-
 cub/cub/util_temporary_storage.cuh            |    4 +
 cub/cub/util_type.cuh                         |   15 +-
 cub/cub/version.cuh                           |    5 +
 .../specializations/warp_exchange_shfl.cuh    |    9 +-
 .../specializations/warp_exchange_smem.cuh    |    5 +-
 .../warp/specializations/warp_reduce_shfl.cuh |   27 +-
 .../warp/specializations/warp_reduce_smem.cuh |    7 +-
 .../warp/specializations/warp_scan_shfl.cuh   |    7 +-
 .../warp/specializations/warp_scan_smem.cuh   |    7 +-
 cub/cub/warp/warp_exchange.cuh                |    5 +-
 cub/cub/warp/warp_load.cuh                    |    5 +-
 cub/cub/warp/warp_merge_sort.cuh              |    5 +-
 cub/cub/warp/warp_reduce.cuh                  |    5 +-
 cub/cub/warp/warp_scan.cuh                    |    5 +-
 cub/cub/warp/warp_store.cuh                   |    5 +-
 thrust/thrust/system/cuda/detail/util.h       |    2 +-
 142 files changed, 3402 insertions(+), 2988 deletions(-)

diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh
index b135fbbf53b..2855242e7eb 100644
--- a/cub/cub/agent/agent_adjacent_difference.cuh
+++ b/cub/cub/agent/agent_adjacent_difference.cuh
@@ -28,6 +28,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_type.cuh"
 #include "../util_namespace.cuh"
 #include "../block/block_load.cuh"
@@ -159,7 +162,7 @@ struct AgentDifference
       }
       else
       {
-        InputT tile_prev_input = MayAlias 
+        InputT tile_prev_input = MayAlias
                                ? first_tile_previous[tile_idx]
                                : *(input_it + tile_base - 1);
 
diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh
index 20db5bb6aec..a7602f7786c 100644
--- a/cub/cub/agent/agent_batch_memcpy.cuh
+++ b/cub/cub/agent/agent_batch_memcpy.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_exchange.cuh>
 #include <cub/block/block_load.cuh>
diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh
index 94c2efcaf4e..ddae3ec9cad 100644
--- a/cub/cub/agent/agent_histogram.cuh
+++ b/cub/cub/agent/agent_histogram.cuh
@@ -33,11 +33,14 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include "../util_type.cuh"
 #include "../block/block_load.cuh"
-#include "../config.cuh"
 #include "../grid/grid_queue.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
 
@@ -301,8 +304,8 @@ struct AgentHistogram
         for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
         {
             int channel_bins = num_privatized_bins[CHANNEL];
-            for (int privatized_bin = threadIdx.x; 
-                    privatized_bin < channel_bins;  
+            for (int privatized_bin = threadIdx.x;
+                    privatized_bin < channel_bins;
                     privatized_bin += BLOCK_THREADS)
             {
                 int         output_bin  = -1;
@@ -631,7 +634,7 @@ struct AgentHistogram
                 // Consume a partially-full tile at the end of the row
                 OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
                 ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-            } 
+            }
             else
             {
                 // Consume full tile
diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh
index 2e994dd97e1..adbaa572d2b 100644
--- a/cub/cub/agent/agent_merge_sort.cuh
+++ b/cub/cub/agent/agent_merge_sort.cuh
@@ -28,6 +28,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_type.cuh"
 #include "../util_namespace.cuh"
 #include "../block/block_load.cuh"
diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh
index d7e77bb882c..b66ad972a5c 100644
--- a/cub/cub/agent/agent_radix_sort_downsweep.cuh
+++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -35,6 +35,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdint.h>
 #include <type_traits>
 
@@ -44,7 +48,6 @@
 #include <cub/block/block_radix_rank.cuh>
 #include <cub/block/block_exchange.cuh>
 #include <cub/block/radix_rank_sort_operations.cuh>
-#include <cub/config.cuh>
 #include <cub/util_type.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 
@@ -135,7 +138,7 @@ struct AgentRadixSortDownsweep
     using ValuesItr = CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>;
 
     // Radix ranking type to use
-    using BlockRadixRankT = 
+    using BlockRadixRankT =
       cub::detail::block_radix_rank_t<
         RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>;
 
@@ -202,7 +205,7 @@ struct AgentRadixSortDownsweep
     // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
     OffsetT bin_offset[BINS_TRACKED_PER_THREAD];
 
-    std::uint32_t current_bit; 
+    std::uint32_t current_bit;
     std::uint32_t num_bits;
 
     // Whether to short-cirucit
@@ -488,15 +491,15 @@ struct AgentRadixSortDownsweep
         OffsetT           relative_bin_offsets[ITEMS_PER_THREAD];
 
         // Assign default (min/max) value to all keys
-        bit_ordered_type default_key = IS_DESCENDING 
-                                     ? traits::min_raw_binary_key(decomposer) 
+        bit_ordered_type default_key = IS_DESCENDING
+                                     ? traits::min_raw_binary_key(decomposer)
                                      : traits::max_raw_binary_key(decomposer);
 
         // Load tile of keys
         LoadKeys(
             keys,
             block_offset,
-            valid_items, 
+            valid_items,
             default_key,
             Int2Type<FULL_TILE>(),
             Int2Type<LOAD_WARP_STRIPED>());
diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh
index 9e895b8d0d2..b5af14e8145 100644
--- a/cub/cub/agent/agent_radix_sort_histogram.cuh
+++ b/cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -34,9 +34,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../block/block_load.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
-#include "../config.cuh"
 #include "../thread/thread_reduce.cuh"
 #include "../util_math.cuh"
 #include "../util_type.cuh"
@@ -120,7 +123,7 @@ struct AgentRadixSortHistogram
     // thread fields
     // shared memory storage
     _TempStorage& s;
-  
+
     // bins for the histogram
     OffsetT* d_bins_out;
 
@@ -175,7 +178,7 @@ struct AgentRadixSortHistogram
     }
 
     __device__ __forceinline__
-    void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])    
+    void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
     {
         // tile_offset < num_items always, hence the line below works
         bool full_tile = num_items - tile_offset >= TILE_ITEMS;
@@ -264,7 +267,7 @@ struct AgentRadixSortHistogram
                 AccumulateSharedHistograms(tile_offset, keys);
             }
             CTA_SYNC();
-            
+
             // Accumulate the result in global memory.
             AccumulateGlobalHistograms();
             CTA_SYNC();
diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh
index 6c57d1f0215..e4d47c9797f 100644
--- a/cub/cub/agent/agent_radix_sort_onesweep.cuh
+++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../block/block_radix_rank.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
 #include "../block/block_store.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
@@ -64,7 +67,7 @@ template <
     int NOMINAL_BLOCK_THREADS_4B,
     int NOMINAL_ITEMS_PER_THREAD_4B,
     typename ComputeT,
-    /** \brief Number of private histograms to use in the ranker; 
+    /** \brief Number of private histograms to use in the ranker;
         ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
     int _RANK_NUM_PARTS,
     /** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
@@ -106,7 +109,7 @@ struct AgentRadixSortOnesweep
         RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
         TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
         RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS,
-        RADIX_DIGITS = 1 << RADIX_BITS,        
+        RADIX_DIGITS = 1 << RADIX_BITS,
         BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS,
         FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS,
         WARP_THREADS = CUB_PTX_WARP_THREADS,
@@ -127,7 +130,7 @@ struct AgentRadixSortOnesweep
       typename traits::template digit_extractor_t<fundamental_digit_extractor_t, DecomposerT>;
 
     typedef PortionOffsetT AtomicOffsetT;
-  
+
     static constexpr RadixRankAlgorithm RANK_ALGORITHM =
                                     AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
     static constexpr BlockScanAlgorithm SCAN_ALGORITHM =
@@ -224,7 +227,7 @@ struct AgentRadixSortOnesweep
     __device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD])
     {
         #pragma unroll
-        for (int u = 0; u < BINS_PER_THREAD; ++u) 
+        for (int u = 0; u < BINS_PER_THREAD; ++u)
         {
             int bin = ThreadBin(u);
             if (FULL_BINS || bin < RADIX_DIGITS)
@@ -260,7 +263,7 @@ struct AgentRadixSortOnesweep
             agent.TryShortCircuit(keys, bins);
         }
     };
-  
+
     __device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD])
     {
         #pragma unroll
@@ -452,7 +455,7 @@ struct AgentRadixSortOnesweep
             {
                 s.global_offsets[bin] = d_bins_in[bin] - offsets[u];
             }
-        }        
+        }
     }
 
     __device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD],
@@ -512,7 +515,7 @@ struct AgentRadixSortOnesweep
         constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS;
         constexpr int ALIGN = 8;
         constexpr auto CACHE_MODIFIER = STORE_CG;
-        
+
         int warp_start = warp * ITEMS_PER_WARP;
         int warp_end = (warp + 1) * ITEMS_PER_WARP;
         int warp_offset = warp_start;
@@ -596,11 +599,11 @@ struct AgentRadixSortOnesweep
         // compute digits corresponding to the keys
         int digits[ITEMS_PER_THREAD];
         ComputeKeyDigits(digits);
-        
+
         // load values
         ValueT values[ITEMS_PER_THREAD];
         LoadValues(block_idx * TILE_ITEMS, values);
-        
+
         // scatter values
         CTA_SYNC();
         ScatterValuesShared(values, ranks);
@@ -608,7 +611,7 @@ struct AgentRadixSortOnesweep
         CTA_SYNC();
         ScatterValuesGlobal(digits);
     }
-        
+
 
     __device__ __forceinline__ void GatherScatterValues(
         int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
@@ -628,7 +631,7 @@ struct AgentRadixSortOnesweep
         BlockRadixRankT(s.rank_temp_storage).RankKeys(
             keys, ranks, digit_extractor(), exclusive_digit_prefix,
             CountsCallback(*this, bins, keys));
-        
+
         // scatter keys in shared memory
         CTA_SYNC();
         ScatterKeysShared(keys, ranks);
@@ -637,7 +640,7 @@ struct AgentRadixSortOnesweep
         LoadBinsToOffsetsGlobal(exclusive_digit_prefix);
         LookbackGlobal(bins);
         UpdateBinsGlobal(bins, exclusive_digit_prefix);
-                
+
         // scatter keys in global memory
         CTA_SYNC();
         ScatterKeysGlobal();
diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh
index 1a534454627..6b387b0b771 100644
--- a/cub/cub/agent/agent_radix_sort_upsweep.cuh
+++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../thread/thread_reduce.cuh"
 #include "../thread/thread_load.cuh"
 #include "../warp/warp_reduce.cuh"
 #include "../block/block_load.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
-#include "../config.cuh"
 #include "../util_type.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
 
@@ -321,7 +324,7 @@ struct AgentRadixSortUpsweep
         const OffsetT &block_end)
     {
         // Process partial tile if necessary using single loads
-        for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS) 
+        for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
         {
             // Load and bucket key
             bit_ordered_type key = d_keys_in[block_offset + offset];
@@ -346,7 +349,7 @@ struct AgentRadixSortUpsweep
     :
         temp_storage(temp_storage.Alias()),
         d_keys_in(reinterpret_cast<const bit_ordered_type*>(d_keys_in)),
-        current_bit(current_bit), 
+        current_bit(current_bit),
         num_bits(num_bits),
         decomposer(decomposer)
     {}
diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh
index 4a29d707b30..15da056b57b 100644
--- a/cub/cub/agent/agent_reduce.cuh
+++ b/cub/cub/agent/agent_reduce.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,17 +27,20 @@
  ******************************************************************************/
 
 /**
- * @file cub::AgentReduce implements a stateful abstraction of CUDA thread 
+ * @file cub::AgentReduce implements a stateful abstraction of CUDA thread
  *       blocks for participating in device-wide reduction.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_reduce.cuh>
-#include <cub/config.cuh>
 #include <cub/detail/type_traits.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/grid/grid_mapping.cuh>
@@ -368,7 +371,7 @@ struct AgentReduce
         .Reduce(thread_aggregate, reduction_op, valid_items);
     }
 
-    // Extracting this into a function saves 8% of generated kernel size by allowing to reuse 
+    // Extracting this into a function saves 8% of generated kernel size by allowing to reuse
     // the block reduction below. This also workaround hang in nvcc.
     ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);
 
@@ -439,7 +442,7 @@ private:
 
     even_share.block_offset += even_share.block_stride;
 
-    // Consume subsequent full tiles of input, at least one full tile was processed, so 
+    // Consume subsequent full tiles of input, at least one full tile was processed, so
     // `even_share.block_end >= TILE_ITEMS`
     while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
     {
diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh
index f04fb73f053..428bc06591e 100644
--- a/cub/cub/agent/agent_reduce_by_key.cuh
+++ b/cub/cub/agent/agent_reduce_by_key.cuh
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_discontinuity.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
 
@@ -68,8 +71,8 @@ CUB_NAMESPACE_BEGIN
  * @tparam _SCAN_ALGORITHM
  *   The BlockScan algorithm to use
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int _BLOCK_THREADS,
@@ -95,7 +98,7 @@ struct AgentReduceByKeyPolicy
   ///< The BlockScan algorithm to use
   static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
 
-  struct detail 
+  struct detail
   {
     using delay_constructor_t = DelayConstructorT;
   };
diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh
index ea09b1e02c7..0082dd00b90 100644
--- a/cub/cub/agent/agent_rle.cuh
+++ b/cub/cub/agent/agent_rle.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include "single_pass_scan_operators.cuh"
@@ -41,7 +45,6 @@
 #include "../block/block_scan.cuh"
 #include "../block/block_exchange.cuh"
 #include "../block/block_discontinuity.cuh"
-#include "../config.cuh"
 #include "../grid/grid_queue.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
 #include "../iterator/constant_input_iterator.cuh"
@@ -56,28 +59,28 @@ CUB_NAMESPACE_BEGIN
 /**
  * Parameterizable tuning policy type for AgentRle
  *
- * @tparam _BLOCK_THREADS 
+ * @tparam _BLOCK_THREADS
  *   Threads per thread block
  *
- * @tparam _ITEMS_PER_THREAD 
+ * @tparam _ITEMS_PER_THREAD
  *   Items per thread (per tile of input)
  *
- * @tparam _LOAD_ALGORITHM 
+ * @tparam _LOAD_ALGORITHM
  *   The BlockLoad algorithm to use
  *
- * @tparam _LOAD_MODIFIER 
+ * @tparam _LOAD_MODIFIER
  *   Cache load modifier for reading input elements
  *
- * @tparam _STORE_WARP_TIME_SLICING 
- *   Whether or not only one warp's worth of shared memory should be allocated and time-sliced among 
- *   block-warps during any store-related data transpositions 
+ * @tparam _STORE_WARP_TIME_SLICING
+ *   Whether or not only one warp's worth of shared memory should be allocated and time-sliced among
+ *   block-warps during any store-related data transpositions
  *   (versus each warp having its own storage)
  *
- * @tparam _SCAN_ALGORITHM 
+ * @tparam _SCAN_ALGORITHM
  *   The BlockScan algorithm to use
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int _BLOCK_THREADS,
@@ -100,7 +103,7 @@ struct AgentRlePolicy
     static constexpr CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
     static constexpr BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
 
-    struct detail 
+    struct detail
     {
         using delay_constructor_t = DelayConstructorT;
     };
@@ -115,7 +118,7 @@ struct AgentRlePolicy
  ******************************************************************************/
 
 /**
- * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode
  */
 template <
     typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
@@ -413,12 +416,12 @@ struct AgentRle
 
         // `thread_exclusive_in_warp.key`:
         //      number of non-trivial runs starts in previous threads
-        // `thread_exclusive_in_warp.val`: 
+        // `thread_exclusive_in_warp.val`:
         //      number of items in the last non-trivial run in previous threads
 
         // `thread_aggregate.key`:
         //      number of non-trivial runs starts in this thread
-        // `thread_aggregate.val`: 
+        // `thread_aggregate.val`:
         //      number of items in the last non-trivial run in this thread
         LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
         WarpScanPairs(temp_storage.aliasable.scan_storage.warp_scan[warp_id]).Scan(
@@ -430,7 +433,7 @@ struct AgentRle
 
         // `thread_inclusive.key`:
         //      number of non-trivial runs starts in this and previous warp threads
-        // `thread_inclusive.val`: 
+        // `thread_inclusive.val`:
         //      number of items in the last non-trivial run in this or previous warp threads
 
         // Last lane in each warp shares its warp-aggregate
@@ -733,7 +736,7 @@ struct AgentRle
             if (thread_exclusive_in_warp.key == 0)
             {
                 // If there are no non-trivial runs starts in the previous warp threads, then
-                // `thread_exclusive_in_warp.val` denotes the number of items in the last 
+                // `thread_exclusive_in_warp.val` denotes the number of items in the last
                 // non-trivial run of the previous CTA threads, so the better name for it is
                 // `thread_exclusive_in_tile`.
                 thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
@@ -830,7 +833,7 @@ struct AgentRle
             if (thread_exclusive_in_warp.key == 0)
             {
                 // If there are no non-trivial runs starts in the previous warp threads, then
-                // `thread_exclusive_in_warp.val` denotes the number of items in the last 
+                // `thread_exclusive_in_warp.val` denotes the number of items in the last
                 // non-trivial run of the previous grid threads, so the better name for it is
                 // `thread_exclusive_in_grid`.
                 thread_exclusive_in_warp.value += thread_exclusive.value;
diff --git a/cub/cub/agent/agent_scan.cuh b/cub/cub/agent/agent_scan.cuh
index c076fa42106..5b201a8d0c0 100644
--- a/cub/cub/agent/agent_scan.cuh
+++ b/cub/cub/agent/agent_scan.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,19 +27,22 @@
  ******************************************************************************/
 
 /**
- * @file cub::AgentScan implements a stateful abstraction of CUDA thread blocks 
+ * @file cub::AgentScan implements a stateful abstraction of CUDA thread blocks
  *       for participating in device-wide prefix scan .
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/grid/grid_queue.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 
@@ -73,8 +76,8 @@ CUB_NAMESPACE_BEGIN
  * @tparam _SCAN_ALGORITHM
  *   The BlockScan algorithm to use
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int NOMINAL_BLOCK_THREADS_4B,
@@ -95,7 +98,7 @@ struct AgentScanPolicy : ScalingType
   static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
   static constexpr BlockScanAlgorithm SCAN_ALGORITHM   = _SCAN_ALGORITHM;
 
-  struct detail 
+  struct detail
   {
     using delay_constructor_t = DelayConstructorT;
   };
diff --git a/cub/cub/agent/agent_scan_by_key.cuh b/cub/cub/agent/agent_scan_by_key.cuh
index dffe38f8648..16229220a80 100644
--- a/cub/cub/agent/agent_scan_by_key.cuh
+++ b/cub/cub/agent/agent_scan_by_key.cuh
@@ -12,9 +12,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -26,18 +26,21 @@
  ******************************************************************************/
 
 /**
- * @file AgentScanByKey implements a stateful abstraction of CUDA thread blocks 
+ * @file AgentScanByKey implements a stateful abstraction of CUDA thread blocks
  *       for participating in device-wide prefix scan by key.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_discontinuity.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
@@ -52,8 +55,8 @@ CUB_NAMESPACE_BEGIN
 /**
  * Parameterizable tuning policy type for AgentScanByKey
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int _BLOCK_THREADS,
@@ -73,7 +76,7 @@ struct AgentScanByKeyPolicy
   static constexpr BlockScanAlgorithm SCAN_ALGORITHM   = _SCAN_ALGORITHM;
   static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
 
-  struct detail 
+  struct detail
   {
     using delay_constructor_t = DelayConstructorT;
   };
diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh
index a0802fd6032..5be44b0fa4a 100644
--- a/cub/cub/agent/agent_segment_fixup.cuh
+++ b/cub/cub/agent/agent_segment_fixup.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include "single_pass_scan_operators.cuh"
@@ -40,7 +44,6 @@
 #include "../block/block_store.cuh"
 #include "../block/block_scan.cuh"
 #include "../block/block_discontinuity.cuh"
-#include "../config.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
 #include "../iterator/constant_input_iterator.cuh"
 
@@ -240,7 +243,7 @@ struct AgentSegmentFixup
         else
             BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
 
-        // RLE 
+        // RLE
         #pragma unroll
         for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
         {
diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh
index a629771120e..f791e2b609e 100644
--- a/cub/cub/agent/agent_segmented_radix_sort.cuh
+++ b/cub/cub/agent/agent_segmented_radix_sort.cuh
@@ -27,10 +27,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_radix_sort_downsweep.cuh>
 #include <cub/agent/agent_radix_sort_upsweep.cuh>
 #include <cub/block/block_radix_sort.cuh>
-#include <cub/config.cuh>
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>
 
@@ -152,7 +155,7 @@ struct AgentSegmentedRadixSort
     // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b
     // LOWEST   -> -nan          = 11...11b -> TwiddleIn ->  0 = 00...00b
 
-    bit_ordered_type default_key_bits = IS_DESCENDING 
+    bit_ordered_type default_key_bits = IS_DESCENDING
                                       ? traits::min_raw_binary_key(decomposer)
                                       : traits::max_raw_binary_key(decomposer);
     KeyT oob_default = reinterpret_cast<KeyT &>(default_key_bits);
diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh
index fcccffe7f6a..6faa04cbed0 100644
--- a/cub/cub/agent/agent_select_if.cuh
+++ b/cub/cub/agent/agent_select_if.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include "single_pass_scan_operators.cuh"
@@ -41,7 +45,6 @@
 #include "../block/block_scan.cuh"
 #include "../block/block_exchange.cuh"
 #include "../block/block_discontinuity.cuh"
-#include "../config.cuh"
 #include "../grid/grid_queue.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
 
@@ -55,23 +58,23 @@ CUB_NAMESPACE_BEGIN
 /**
  * Parameterizable tuning policy type for AgentSelectIf
  *
- * @tparam _BLOCK_THREADS 
+ * @tparam _BLOCK_THREADS
  *   Threads per thread block
  *
- * @tparam _ITEMS_PER_THREAD 
+ * @tparam _ITEMS_PER_THREAD
  *   Items per thread (per tile of input)
  *
- * @tparam _LOAD_ALGORITHM 
+ * @tparam _LOAD_ALGORITHM
  *   The BlockLoad algorithm to use
  *
- * @tparam _LOAD_MODIFIER 
+ * @tparam _LOAD_MODIFIER
  *   Cache load modifier for reading input elements
  *
- * @tparam _SCAN_ALGORITHM 
+ * @tparam _SCAN_ALGORITHM
  *   The BlockScan algorithm to use
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int _BLOCK_THREADS,
@@ -92,7 +95,7 @@ struct AgentSelectIfPolicy
     static constexpr CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
     static constexpr BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
 
-    struct detail 
+    struct detail
     {
         using delay_constructor_t = DelayConstructorT;
     };
diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh
index 4409436f9d0..bca147a5530 100644
--- a/cub/cub/agent/agent_spmv_orig.cuh
+++ b/cub/cub/agent/agent_spmv_orig.cuh
@@ -33,13 +33,16 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
 #include "../util_type.cuh"
 #include "../block/block_reduce.cuh"
 #include "../block/block_scan.cuh"
 #include "../block/block_exchange.cuh"
-#include "../config.cuh"
 #include "../thread/thread_search.cuh"
 #include "../thread/thread_operators.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
index 21ca2e5030c..80c8c7caa95 100644
--- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh
+++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -27,8 +27,11 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/block/radix_rank_sort_operations.cuh>
-#include <cub/config.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/warp_load.cuh>
 #include <cub/warp/warp_merge_sort.cuh>
@@ -159,21 +162,21 @@ class AgentSubWarpSort
     return lhs == rhs;
   }
 
-  __device__ static bool get_oob_default(Int2Type<true> /* is bool */) 
+  __device__ static bool get_oob_default(Int2Type<true> /* is bool */)
   {
     // Traits<KeyT>::MAX_KEY for `bool` is 0xFF which is different from `true` and makes
     // comparison with oob unreliable.
     return !IS_DESCENDING;
   }
 
-  __device__ static KeyT get_oob_default(Int2Type<false> /* is bool */) 
+  __device__ static KeyT get_oob_default(Int2Type<false> /* is bool */)
   {
     // For FP64 the difference is:
     // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b
     // LOWEST   -> -nan          = 11...11b -> TwiddleIn ->  0 = 00...00b
 
     // Segmented sort doesn't support custom types at the moment.
-    bit_ordered_type default_key_bits = IS_DESCENDING 
+    bit_ordered_type default_key_bits = IS_DESCENDING
                                       ? traits::min_raw_binary_key(detail::identity_decomposer_t{})
                                       : traits::max_raw_binary_key(detail::identity_decomposer_t{});
     return reinterpret_cast<KeyT &>(default_key_bits);
@@ -253,7 +256,7 @@ public:
       KeyT keys[PolicyT::ITEMS_PER_THREAD];
       ValueT values[PolicyT::ITEMS_PER_THREAD];
 
-      KeyT oob_default = 
+      KeyT oob_default =
         AgentSubWarpSort::get_oob_default(Int2Type<std::is_same<bool, KeyT>::value>{});
 
       WarpLoadKeysT(storage.load_keys)
diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh
index 93cc8d3966e..4186f61895c 100644
--- a/cub/cub/agent/agent_three_way_partition.cuh
+++ b/cub/cub/agent/agent_three_way_partition.cuh
@@ -27,6 +27,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <type_traits>
 
@@ -36,7 +40,6 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 
 
diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh
index bf943a00b09..4d406f653a6 100644
--- a/cub/cub/agent/agent_unique_by_key.cuh
+++ b/cub/cub/agent/agent_unique_by_key.cuh
@@ -32,6 +32,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <type_traits>
 
@@ -51,8 +55,8 @@ CUB_NAMESPACE_BEGIN
 /**
  * Parameterizable tuning policy type for AgentUniqueByKey
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int                     _BLOCK_THREADS,
@@ -72,7 +76,7 @@ struct AgentUniqueByKeyPolicy
     static constexpr cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
     static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
 
-    struct detail 
+    struct detail
     {
         using delay_constructor_t = DelayConstructorT;
     };
@@ -211,7 +215,7 @@ struct AgentUniqueByKey
         ValueOutputIteratorT         d_values_out_,
         EqualityOpT                  equality_op_,
         OffsetT                      num_items_)
-    : 
+    :
         temp_storage(temp_storage_.Alias()),
         d_keys_in(d_keys_in_),
         d_values_in(d_values_in_),
@@ -567,10 +571,10 @@ struct AgentUniqueByKey
         {
             int  num_remaining  = static_cast<int>(num_items - tile_offset);
             OffsetT num_selections = ConsumeTile<true>(num_remaining,
-                                                       tile_idx,                                    
+                                                       tile_idx,
                                                        tile_offset,
                                                        tile_state);
-            if (threadIdx.x == 0)                                                               
+            if (threadIdx.x == 0)
             {
                 *d_num_selected_out = num_selections;
             }
diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 342e859246c..c222718601e 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
-#include <cub/config.cuh>
 #include <cub/detail/strong_load.cuh>
 #include <cub/detail/strong_store.cuh>
 #include <cub/detail/uninitialized_copy.cuh>
@@ -110,22 +113,22 @@ enum ScanTileStatus
     SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
 };
 
-namespace detail 
+namespace detail
 {
 
 template <int Delay, unsigned int GridThreshold = 500>
 __device__ __forceinline__ void delay()
 {
   NV_IF_TARGET(NV_PROVIDES_SM_70,
-               (if (Delay > 0) 
+               (if (Delay > 0)
                 {
-                  if (gridDim.x < GridThreshold) 
+                  if (gridDim.x < GridThreshold)
                   {
                     __threadfence_block();
                   }
-                  else 
+                  else
                   {
-                    __nanosleep(Delay); 
+                    __nanosleep(Delay);
                   }
                 }));
 }
@@ -134,15 +137,15 @@ template <unsigned int GridThreshold = 500>
 __device__ __forceinline__ void delay(int ns)
 {
   NV_IF_TARGET(NV_PROVIDES_SM_70,
-               (if (ns > 0) 
+               (if (ns > 0)
                 {
-                  if (gridDim.x < GridThreshold) 
+                  if (gridDim.x < GridThreshold)
                   {
                     __threadfence_block();
                   }
-                  else 
+                  else
                   {
-                    __nanosleep(ns); 
+                    __nanosleep(ns);
                   }
                 }));
 }
@@ -194,7 +197,7 @@ struct no_delay_constructor_t
 {
   struct delay_t
   {
-    __device__ __forceinline__ void operator()() 
+    __device__ __forceinline__ void operator()()
     {
       NV_IF_TARGET(NV_PROVIDES_SM_70,
                   (),
@@ -215,7 +218,7 @@ struct reduce_by_key_delay_constructor_t
 {
   struct delay_t
   {
-    __device__ __forceinline__ void operator()() 
+    __device__ __forceinline__ void operator()()
     {
       NV_DISPATCH_TARGET(
         NV_IS_EXACTLY_SM_80, (delay<Delay, GridThreshold>();),
@@ -262,7 +265,7 @@ struct exponential_backoff_constructor_t
     }
   };
 
-  __device__ __forceinline__ exponential_backoff_constructor_t(unsigned int /* seed */) 
+  __device__ __forceinline__ exponential_backoff_constructor_t(unsigned int /* seed */)
   {
     always_delay<L2WriteLatency>();
   }
@@ -437,7 +440,7 @@ struct exponential_backon_constructor_t
 
   unsigned int max_delay = InitialDelay;
 
-  __device__ __forceinline__ exponential_backon_constructor_t(unsigned int /* seed */) 
+  __device__ __forceinline__ exponential_backon_constructor_t(unsigned int /* seed */)
   {
     always_delay<L2WriteLatency>();
   }
@@ -613,7 +616,7 @@ struct ScanTileState<T, true>
     /**
      * Wait for the corresponding tile to become non-invalid
      */
-    template <class DelayT = detail::default_delay_t<T>> 
+    template <class DelayT = detail::default_delay_t<T>>
     __device__ __forceinline__ void WaitForValid(
         int             tile_idx,
         StatusWord      &status,
@@ -628,7 +631,7 @@ struct ScanTileState<T, true>
         }
 
         while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff))
-        {   
+        {
           delay_or_prevent_hoisting();
           TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
           tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
@@ -642,11 +645,11 @@ struct ScanTileState<T, true>
      * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
      * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
      */
-     __device__ __forceinline__ T LoadValid(int tile_idx)                        
-    {                                                                           
+     __device__ __forceinline__ T LoadValid(int tile_idx)
+    {
         TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx];
         TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        return tile_descriptor.value;                                           
+        return tile_descriptor.value;
     }
 };
 
@@ -704,7 +707,7 @@ struct ScanTileState<T, false>
             error = CubDebug(
               AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
 
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
               break;
             }
@@ -784,7 +787,7 @@ struct ScanTileState<T, false>
     /**
      * Wait for the corresponding tile to become non-invalid
      */
-    template <class DelayT = detail::default_no_delay_t> 
+    template <class DelayT = detail::default_no_delay_t>
     __device__ __forceinline__ void WaitForValid(
         int             tile_idx,
         StatusWord      &status,
@@ -798,7 +801,7 @@ struct ScanTileState<T, false>
           __threadfence();
         } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff));
 
-        if (status == StatusWord(SCAN_TILE_PARTIAL)) 
+        if (status == StatusWord(SCAN_TILE_PARTIAL))
         {
           value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
         }
@@ -812,9 +815,9 @@ struct ScanTileState<T, false>
      * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
      * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
      */
-    __device__ __forceinline__ T LoadValid(int tile_idx)                        
-    {                                                                           
-        return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx];                                          
+    __device__ __forceinline__ T LoadValid(int tile_idx)
+    {
+        return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx];
     }
 };
 
@@ -1008,7 +1011,7 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
     /**
      * Wait for the corresponding tile to become non-invalid
      */
-    template <class DelayT = detail::fixed_delay_constructor_t<350, 450>::delay_t> 
+    template <class DelayT = detail::fixed_delay_constructor_t<350, 450>::delay_t>
     __device__ __forceinline__ void WaitForValid(
         int                     tile_idx,
         StatusWord              &status,
@@ -1058,8 +1061,8 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
  * the current tile by using the call-back warp to wait on on
  * aggregates/prefixes from predecessor tiles to become available.
  *
- * @tparam DelayConstructorT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DelayConstructorT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <
@@ -1096,7 +1099,7 @@ struct TilePrefixCallbackOp
     T                           exclusive_prefix;   ///< Exclusive prefix for the tile
     T                           inclusive_prefix;   ///< Inclusive prefix for the tile
 
-    // Constructs prefix functor for a given tile index. 
+    // Constructs prefix functor for a given tile index.
     // Precondition: thread blocks processing all of the predecessor tiles were scheduled.
     __device__ __forceinline__ TilePrefixCallbackOp(ScanTileStateT &tile_status,
                                                     TempStorage &temp_storage,
@@ -1117,7 +1120,7 @@ struct TilePrefixCallbackOp
     {}
 
     // Block until all predecessors within the warp-wide window have non-invalid status
-    template <class DelayT = detail::default_delay_t<T>> 
+    template <class DelayT = detail::default_delay_t<T>>
     __device__ __forceinline__
     void ProcessWindow(
         int         predecessor_idx,        ///< Preceding tile index to inspect
diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh
index 524ffbebfaa..33f2c321a55 100644
--- a/cub/cub/block/block_adjacent_difference.cuh
+++ b/cub/cub/block/block_adjacent_difference.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -36,6 +36,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_type.cuh"
 #include "../util_ptx.cuh"
 
@@ -489,7 +492,7 @@ public:
     }
 
     /**
-     * @brief Subtracts the left element of each adjacent pair of elements 
+     * @brief Subtracts the left element of each adjacent pair of elements
      *        partitioned across a CUDA thread block.
      *
      * @par
@@ -497,7 +500,7 @@ public:
      * - \smemreuse
      *
      * @par Snippet
-     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference
      * to compute the left difference between adjacent elements.
      *
      * @par
@@ -516,7 +519,7 @@ public:
      *
      * __global__ void ExampleKernel(...)
      * {
-     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // Specialize BlockAdjacentDifference for a 1D block of
      *   // 128 threads of type int
      *   using BlockAdjacentDifferenceT =
      *      cub::BlockAdjacentDifference<int, 128>;
@@ -607,7 +610,7 @@ public:
     }
 
     /**
-     * @brief Subtracts the left element of each adjacent pair of elements 
+     * @brief Subtracts the left element of each adjacent pair of elements
      *        partitioned across a CUDA thread block.
      *
      * @par
@@ -615,7 +618,7 @@ public:
      * - \smemreuse
      *
      * @par Snippet
-     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference
      * to compute the left difference between adjacent elements.
      *
      * @par
@@ -634,7 +637,7 @@ public:
      *
      * __global__ void ExampleKernel(...)
      * {
-     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // Specialize BlockAdjacentDifference for a 1D block of
      *   // 128 threads of type int
      *   using BlockAdjacentDifferenceT =
      *      cub::BlockAdjacentDifference<int, 128>;
@@ -725,9 +728,9 @@ public:
       {
         output[0] = input[0];
       }
-      else if (linear_tid == 0) 
+      else if (linear_tid == 0)
       {
-        output[0] = difference_op(input[0], 
+        output[0] = difference_op(input[0],
                                   tile_predecessor_item);
       }
       else
diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh
index a3bf17f3190..98801ef3c88 100644
--- a/cub/cub/block/block_discontinuity.cuh
+++ b/cub/cub/block/block_discontinuity.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_type.cuh"
 #include "../util_ptx.cuh"
 
diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh
index 5f4bc2e84a8..ae96f8abfe3 100644
--- a/cub/cub/block/block_exchange.cuh
+++ b/cub/cub/block/block_exchange.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/uninitialized_copy.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh
index b9ab759607d..ee0c851e892 100644
--- a/cub/cub/block/block_histogram.cuh
+++ b/cub/cub/block/block_histogram.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "specializations/block_histogram_sort.cuh"
 #include "specializations/block_histogram_atomic.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh
index b419ab6e726..bb6081b5b57 100644
--- a/cub/cub/block/block_load.cuh
+++ b/cub/cub/block/block_load.cuh
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <type_traits>
 
 #include "../block/block_exchange.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
index dc07ef6c294..49693540fba 100644
--- a/cub/cub/block/block_merge_sort.cuh
+++ b/cub/cub/block/block_merge_sort.cuh
@@ -27,6 +27,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/thread/thread_sort.cuh>
 #include <cub/util_math.cuh>
 #include <cub/util_namespace.cuh>
diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh
index 18655af4f0b..64fcd7979c7 100644
--- a/cub/cub/block/block_radix_rank.cuh
+++ b/cub/cub/block/block_radix_rank.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,13 +33,16 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdint.h>
 
 #include "../thread/thread_reduce.cuh"
 #include "../thread/thread_scan.cuh"
 #include "../block/block_scan.cuh"
 #include "../block/radix_rank_sort_operations.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
@@ -102,7 +105,7 @@ struct warp_in_block_matcher_t
 {
   static __device__ std::uint32_t match_any(std::uint32_t label, std::uint32_t warp_id)
   {
-    if (warp_id == static_cast<std::uint32_t>(PartialWarpId)) 
+    if (warp_id == static_cast<std::uint32_t>(PartialWarpId))
     {
       return MatchAny<Bits, PartialWarpThreads>(label);
     }
@@ -155,7 +158,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
  *   constexpr int block_threads = 2;
  *   constexpr int radix_bits = 5;
  *
- *   // Specialize BlockRadixRank for a 1D block of 2 threads 
+ *   // Specialize BlockRadixRank for a 1D block of 2 threads
  *   using block_radix_rank = cub::BlockRadixRank<block_threads, radix_bits>;
  *   using storage_t = typename block_radix_rank::TempStorage;
  *
@@ -172,7 +175,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
  *
  *   ...
  * \endcode
- * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`.  
+ * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`.
  * The corresponding output `ranks` in those threads will be `{ [3,1], [0,2] }`.
  *
  * \par Re-using dynamically allocating shared memory
@@ -758,8 +761,8 @@ public:
             // Mask of peers who have same digit as me
             uint32_t peer_mask =
               detail::warp_in_block_matcher_t<
-                RADIX_BITS, 
-                PARTIAL_WARP_THREADS, 
+                RADIX_BITS,
+                PARTIAL_WARP_THREADS,
                 WARPS - 1>::match_any(digit, warp_id);
 
             // Pointer to smem digit counter for this key
@@ -918,7 +921,7 @@ struct BlockRadixRankMatchEarlyCounts
     // types
     typedef cub::BlockScan<int, BLOCK_THREADS, INNER_SCAN_ALGORITHM> BlockScan;
 
-    
+
 
     // temporary storage
     struct TempStorage
@@ -981,7 +984,7 @@ struct BlockRadixRankMatchEarlyCounts
                 for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS)
                 {
                     match_masks[bin] = 0;
-                }                    
+                }
             }
             WARP_SYNC(WARP_MASK);
 
@@ -992,7 +995,7 @@ struct BlockRadixRankMatchEarlyCounts
             {
                 atomicAdd(&warp_histograms[Digit(keys[u])][part], 1);
             }
-            
+
             // sum different parts;
             // no extra work is necessary if NUM_PARTS == 1
             if (NUM_PARTS > 1)
@@ -1025,7 +1028,7 @@ struct BlockRadixRankMatchEarlyCounts
         {
             // sum up warp-private histograms
             #pragma unroll
-            for (int u = 0; u < BINS_PER_THREAD; ++u) 
+            for (int u = 0; u < BINS_PER_THREAD; ++u)
             {
                 bins[u] = 0;
                 int bin = ThreadBin(u);
@@ -1127,12 +1130,12 @@ struct BlockRadixRankMatchEarlyCounts
             int (&exclusive_digit_prefix)[BINS_PER_THREAD])
         {
             ComputeHistogramsWarp(keys);
-            
+
             CTA_SYNC();
             int bins[BINS_PER_THREAD];
             ComputeOffsetsWarpUpsweep(bins);
             callback(bins);
-            
+
             BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix);
 
             ComputeOffsetsWarpDownsweep(exclusive_digit_prefix);
@@ -1164,7 +1167,7 @@ struct BlockRadixRankMatchEarlyCounts
     {
         BlockRadixRankMatchInternal<UnsignedBits, KEYS_PER_THREAD, DigitExtractorT, CountsCallback>
             internal(temp_storage, digit_extractor, callback);
-        internal.RankKeys(keys, ranks, exclusive_digit_prefix);        
+        internal.RankKeys(keys, ranks, exclusive_digit_prefix);
     }
 
     template <typename UnsignedBits, int KEYS_PER_THREAD, typename DigitExtractorT>
@@ -1193,13 +1196,13 @@ struct BlockRadixRankMatchEarlyCounts
 
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-namespace detail 
+namespace detail
 {
 
-// `BlockRadixRank` doesn't conform to the typical pattern, not exposing the algorithm 
-// template parameter. Other algorithms don't provide the same template parameters, not allowing 
-// multi-dimensional thread block specializations. 
-// 
+// `BlockRadixRank` doesn't conform to the typical pattern, not exposing the algorithm
+// template parameter. Other algorithms don't provide the same template parameters, not allowing
+// multi-dimensional thread block specializations.
+//
 // TODO(senior-zero) for 3.0:
 // - Put existing implementations into the detail namespace
 // - Support multi-dimensional thread blocks in the rest of implementations
diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh
index e275d6f611e..0a7dc571c1b 100644
--- a/cub/cub/block/block_radix_sort.cuh
+++ b/cub/cub/block/block_radix_sort.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,18 +34,21 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "block_exchange.cuh"
 #include "block_radix_rank.cuh"
 #include "radix_rank_sort_operations.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
 CUB_NAMESPACE_BEGIN
 
 //! @rst
-//! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting 
-//! items partitioned across a CUDA thread block using a radix sorting method.  
+//! BlockRadixSort class provides :ref:`collective <collective-primitives>` methods for sorting
+//! items partitioned across a CUDA thread block using a radix sorting method.
 //!
 //! .. image:: ../img/sorting_logo.png
 //!     :align: center
@@ -156,7 +159,7 @@ CUB_NAMESPACE_BEGIN
 //!         ...
 //!
 //! Suppose the set of input ``thread_keys`` across the block of threads is
-//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.  
+//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``.
 //! The corresponding output ``thread_keys`` in those threads will be
 //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``.
 //!
@@ -173,40 +176,40 @@ CUB_NAMESPACE_BEGIN
 //!
 //! @ingroup BlockModule
 //!
-//! @tparam KeyT                 
+//! @tparam KeyT
 //!   KeyT type
 //!
-//! @tparam BLOCK_DIM_X          
+//! @tparam BLOCK_DIM_X
 //!   The thread block length in threads along the X dimension
 //!
-//! @tparam ITEMS_PER_THREAD     
+//! @tparam ITEMS_PER_THREAD
 //!   The number of items per thread
 //!
-//! @tparam ValueT               
+//! @tparam ValueT
 //!   **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
 //!
-//! @tparam RADIX_BITS           
+//! @tparam RADIX_BITS
 //!   **[optional]** The number of radix bits per digit place (default: 4 bits)
 //!
-//! @tparam MEMOIZE_OUTER_SCAN   
-//!  **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory 
-//!  reads at the expense of higher register pressure (default: true for architectures SM35 and 
+//! @tparam MEMOIZE_OUTER_SCAN
+//!  **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
+//!  reads at the expense of higher register pressure (default: true for architectures SM35 and
 //!  newer, false otherwise).
 //!
-//! @tparam INNER_SCAN_ALGORITHM 
-//!   **[optional]** The cub::BlockScanAlgorithm algorithm to use 
+//! @tparam INNER_SCAN_ALGORITHM
+//!   **[optional]** The cub::BlockScanAlgorithm algorithm to use
 //!   (default: cub::BLOCK_SCAN_WARP_SCANS)
 //!
-//! @tparam SMEM_CONFIG          
+//! @tparam SMEM_CONFIG
 //!   **[optional]*8 Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
 //!
-//! @tparam BLOCK_DIM_Y          
+//! @tparam BLOCK_DIM_Y
 //!   **[optional]** The thread block length in threads along the Y dimension (default: 1)
 //!
-//! @tparam BLOCK_DIM_Z          
+//! @tparam BLOCK_DIM_Z
 //!   **[optional]** The thread block length in threads along the Z dimension (default: 1)
 //!
-//! @tparam LEGACY_PTX_ARCH      
+//! @tparam LEGACY_PTX_ARCH
 //!   **[optional]** Unused
 template <
     typename                KeyT,
@@ -575,7 +578,7 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
+    //! Performs an ascending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
     //!
     //! * @granularity
@@ -608,27 +611,27 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -648,7 +651,7 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
+    //! Performs an ascending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
     //!
     //! * @granularity
@@ -681,19 +684,19 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -758,7 +761,7 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
+    //! Performs an ascending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
     //!
     //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
@@ -796,13 +799,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -810,16 +813,16 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -841,7 +844,7 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
+    //! Performs an ascending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
     //!
     //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
@@ -879,13 +882,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -893,8 +896,8 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -959,7 +962,7 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
+    //! Performs a descending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
     //!
     //! * @granularity
@@ -992,27 +995,27 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1035,7 +1038,7 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
+    //! Performs a descending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
     //!
     //! * @granularity
@@ -1068,19 +1071,19 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1153,7 +1156,7 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
+    //! Performs a descending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
     //!
     //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
@@ -1191,13 +1194,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -1205,16 +1208,16 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1236,7 +1239,7 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
+    //! Performs a descending block-wide radix sort over a
     //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values.
     //!
     //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along"
@@ -1274,13 +1277,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -1288,8 +1291,8 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1364,8 +1367,8 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a 
+    //! Performs an ascending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1377,7 +1380,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1398,27 +1401,27 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1441,8 +1444,8 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a 
+    //! Performs an ascending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1454,7 +1457,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1475,19 +1478,19 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1560,8 +1563,8 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a 
+    //! Performs an ascending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1573,7 +1576,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1594,13 +1597,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -1608,16 +1611,16 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1639,8 +1642,8 @@ public:
     }
 
     //! @rst
-    //! Performs an ascending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a 
+    //! Performs an ascending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1652,7 +1655,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1673,13 +1676,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -1687,8 +1690,8 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1756,8 +1759,8 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a 
+    //! Performs a descending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1769,7 +1772,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1790,27 +1793,27 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1833,8 +1836,8 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a 
+    //! Performs a descending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1846,7 +1849,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1867,19 +1870,19 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -1952,8 +1955,8 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a 
+    //! Performs a descending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -1965,7 +1968,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -1986,13 +1989,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -2000,16 +2003,16 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     //!
-    //! @param[in] begin_bit 
-    //!   The least-significant bit index (inclusive) needed for 
+    //! @param[in] begin_bit
+    //!   The least-significant bit index (inclusive) needed for
     //!   key comparison
     //!
-    //! @param[in] end_bit 
-    //!   The most-significant bit index (exclusive) needed for key 
+    //! @param[in] end_bit
+    //!   The most-significant bit index (exclusive) needed for key
     //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
     template <class DecomposerT>
     __device__ __forceinline__         //
@@ -2031,8 +2034,8 @@ public:
     }
 
     //! @rst
-    //! Performs a descending block-wide radix sort over a 
-    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a 
+    //! Performs a descending block-wide radix sort over a
+    //! :ref:`blocked arrangement <flexible-data-arrangement>` of keys and values, leaving them in a
     //! :ref:`striped arrangement <flexible-data-arrangement>`.
     //!
     //! * @granularity
@@ -2044,7 +2047,7 @@ public:
     //! Let's consider a user-defined ``custom_t`` type below. To sort an array of
     //! ``custom_t`` objects, we have to tell CUB about relevant members of the
     //! ``custom_t`` type. We do this by providing a decomposer that returns a
-    //! tuple of references to relevant members of the key. 
+    //! tuple of references to relevant members of the key.
     //!
     //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu
     //!     :language: c++
@@ -2065,13 +2068,13 @@ public:
     //! @endrst
     //!
     //! @tparam DecomposerT
-    //!   **[inferred]** Type of a callable object responsible for decomposing a 
+    //!   **[inferred]** Type of a callable object responsible for decomposing a
     //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-    //!   The leftmost element of the tuple is considered the most significant. 
+    //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+    //!   The leftmost element of the tuple is considered the most significant.
     //!   The call operator must not modify members of the key.
     //!
-    //! @param[in,out] keys 
+    //! @param[in,out] keys
     //!   Keys to sort
     //!
     //! @param[in,out] values
@@ -2079,8 +2082,8 @@ public:
     //!
     //! @param decomposer
     //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-    //!   references to its constituent arithmetic types. The leftmost element of 
-    //!   the tuple is considered the most significant. The call operator must not 
+    //!   references to its constituent arithmetic types. The leftmost element of
+    //!   the tuple is considered the most significant. The call operator must not
     //!   modify members of the key.
     template <class DecomposerT>
     __device__ __forceinline__         //
diff --git a/cub/cub/block/block_raking_layout.cuh b/cub/cub/block/block_raking_layout.cuh
index 4d49f54f2c8..573252bae44 100644
--- a/cub/cub/block/block_raking_layout.cuh
+++ b/cub/cub/block/block_raking_layout.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -35,6 +35,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_type.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh
index 5a9db703db9..eae6257009f 100644
--- a/cub/cub/block/block_reduce.cuh
+++ b/cub/cub/block/block_reduce.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "specializations/block_reduce_raking.cuh"
 #include "specializations/block_reduce_raking_commutative_only.cuh"
 #include "specializations/block_reduce_warp_reductions.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 #include "../thread/thread_operators.cuh"
@@ -349,7 +352,7 @@ public:
     template <typename ReductionOp>
     __device__ __forceinline__ T Reduce(
         T               input,                      ///< [in] Calling thread's input
-        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
+        ReductionOp     reduction_op)               ///< [in] Binary reduction functor
     {
         return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
     }
@@ -396,7 +399,7 @@ public:
         typename ReductionOp>
     __device__ __forceinline__ T Reduce(
         T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
-        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
     {
         // Reduce partials
         T partial = internal::ThreadReduce(inputs, reduction_op);
@@ -441,7 +444,7 @@ public:
     template <typename ReductionOp>
     __device__ __forceinline__ T Reduce(
         T                   input,                  ///< [in] Calling thread's input
-        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
+        ReductionOp         reduction_op,           ///< [in] Binary reduction functor
         int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
     {
         // Determine if we skip bounds checking
diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh
index 41a3ab22b15..dd0340a25ee 100644
--- a/cub/cub/block/block_run_length_decode.cuh
+++ b/cub/cub/block/block_run_length_decode.cuh
@@ -28,6 +28,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../thread/thread_search.cuh"
 #include "../util_math.cuh"
 #include "../util_namespace.cuh"
diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index 544c15f0dbb..22689972cd5 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "specializations/block_scan_raking.cuh"
 #include "specializations/block_scan_warp_scans.cuh"
-#include "../config.cuh"
 #include "../util_type.cuh"
 #include "../util_ptx.cuh"
 
@@ -735,7 +738,7 @@ public:
         T               input,                          ///< [in] Calling thread's input item
         T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
         T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
     {
         InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
     }
@@ -785,7 +788,7 @@ public:
         T               input,              ///< [in] Calling thread's input items
         T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
         T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op,            ///< [in] Binary scan functor 
+        ScanOp          scan_op,            ///< [in] Binary scan functor
         T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
     {
         InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
@@ -873,7 +876,7 @@ public:
     __device__ __forceinline__ void ExclusiveScan(
         T                       input,                          ///< [in] Calling thread's input item
         T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
         BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
     {
         InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
@@ -1717,7 +1720,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T               input,                          ///< [in] Calling thread's input item
         T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
     {
         InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
     }
@@ -1766,7 +1769,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T               input,                          ///< [in] Calling thread's input item
         T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
         T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
     {
         InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
@@ -1854,7 +1857,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T                       input,                          ///< [in] Calling thread's input item
         T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
         BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
     {
         InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
@@ -1914,7 +1917,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
         T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
     {
         if (ITEMS_PER_THREAD == 1)
         {
@@ -1984,7 +1987,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
         T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
         T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
     {
         if (ITEMS_PER_THREAD == 1)
@@ -2098,7 +2101,7 @@ public:
     __device__ __forceinline__ void InclusiveScan(
         T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
         T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
         BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
     {
         if (ITEMS_PER_THREAD == 1)
diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh
index 58938301c11..c96b8066ae6 100644
--- a/cub/cub/block/block_shuffle.cuh
+++ b/cub/cub/block/block_shuffle.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh
index 2cb6bee4337..adbb5506ebe 100644
--- a/cub/cub/block/block_store.cuh
+++ b/cub/cub/block/block_store.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,11 +33,14 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <type_traits>
 
 #include "block_exchange.cuh"
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
@@ -878,7 +881,7 @@ private:
             StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
         }
     };
- 
+
 
     /******************************************************************************
      * Type definitions
diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh
index 679dfd4230e..31c0225488d 100644
--- a/cub/cub/block/radix_rank_sort_operations.cuh
+++ b/cub/cub/block/radix_rank_sort_operations.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <thrust/type_traits/integer_sequence.h>
 
 #include <cuda/std/tuple>
@@ -40,7 +44,6 @@
 
 #include <type_traits>
 
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 #include "cub/detail/cpp_compatibility.cuh"
@@ -96,14 +99,14 @@ struct BaseDigitExtractor<KeyT, FLOATING_POINT>
  * key from a digit. */
 template <typename KeyT>
 struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
-{   
+{
     using typename BaseDigitExtractor<KeyT>::UnsignedBits;
 
-    std::uint32_t bit_start; 
+    std::uint32_t bit_start;
     std::uint32_t num_bits;
 
     explicit __device__ __forceinline__ BFEDigitExtractor(
-        std::uint32_t bit_start = 0, 
+        std::uint32_t bit_start = 0,
         std::uint32_t num_bits = 0)
       : bit_start(bit_start)
       , num_bits(num_bits)
@@ -139,14 +142,14 @@ struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
 
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-namespace detail 
+namespace detail
 {
 
 template <bool... Bs>
 struct logic_helper_t;
 
 template <bool>
-struct true_t 
+struct true_t
 {
   static constexpr bool value = true;
 };
@@ -181,9 +184,9 @@ for_each_member_impl(F f, const ::cuda::std::tuple<Ts&...>& tpl)
   static_assert(sizeof...(Ts), "Empty aggregates are not supported");
 
   // Most radix operations are indifferent to the order of operations.
-  // Conversely, the digit extractor traverses fields from the least significant 
-  // to the most significant to imitate bitset printing where higher bits are on 
-  // the left. It also maps to intuition, where something coming first is more 
+  // Conversely, the digit extractor traverses fields from the least significant
+  // to the most significant to imitate bitset printing where higher bits are on
+  // the left. It also maps to intuition, where something coming first is more
   // important. Therefore, we traverse fields on the opposite order.
   for_each_member_impl_helper(f, tpl, THRUST_NS_QUALIFIER::make_reversed_index_sequence<sizeof...(Ts)>{});
 }
@@ -578,7 +581,7 @@ struct traits_t<T, false /* is_fundamental */>
 } // namespace detail
 #endif // DOXYGEN_SHOULD_SKIP_THIS
 
-//! Twiddling keys for radix sort 
+//! Twiddling keys for radix sort
 template <bool IS_DESCENDING, typename KeyT>
 struct RadixSortTwiddle
 {
diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh
index 93299fa7192..1e777ee4a7f 100644
--- a/cub/cub/block/specializations/block_histogram_atomic.cuh
+++ b/cub/cub/block/specializations/block_histogram_atomic.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -35,6 +35,8 @@
 
 #include "../../config.cuh"
 
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 CUB_NAMESPACE_BEGIN
 
 
@@ -57,7 +59,7 @@ struct BlockHistogramAtomic
     /// Composite data onto an existing histogram
     template <
         typename            T,
-        typename            CounterT,     
+        typename            CounterT,
         int                 ITEMS_PER_THREAD>
     __device__ __forceinline__ void Composite(
         T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh
index 79659ae106d..c23cdb0f6e1 100644
--- a/cub/cub/block/specializations/block_histogram_sort.cuh
+++ b/cub/cub/block/specializations/block_histogram_sort.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../block/block_radix_sort.cuh"
 #include "../../block/block_discontinuity.cuh"
-#include "../../config.cuh"
 #include "../../util_ptx.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh
index 423f18f8679..7790a9de7c1 100644
--- a/cub/cub/block/specializations/block_reduce_raking.cuh
+++ b/cub/cub/block/specializations/block_reduce_raking.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../block/block_raking_layout.cuh"
 #include "../../warp/warp_reduce.cuh"
 #include "../../thread/thread_reduce.cuh"
-#include "../../config.cuh"
 #include "../../util_ptx.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
index 1fc1caab15a..f0d119a01ea 100644
--- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "block_reduce_raking.cuh"
 #include "../../warp/warp_reduce.cuh"
 #include "../../thread/thread_reduce.cuh"
-#include "../../config.cuh"
 #include "../../util_ptx.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
index 1e51a9fcfe1..d7db7b8f1d2 100644
--- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/uninitialized_copy.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/warp/warp_reduce.cuh>
diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh
index 4891dad141d..b35e71d45c4 100644
--- a/cub/cub/block/specializations/block_scan_raking.cuh
+++ b/cub/cub/block/specializations/block_scan_raking.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,7 +34,9 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
 #include <cub/block/block_raking_layout.cuh>
 #include <cub/detail/uninitialized_copy.cuh>
@@ -156,7 +158,7 @@ struct BlockScanRaking
         CopySegment(out, in, Int2Type<ITERATION + 1>());
     }
 
- 
+
     /// Templated copy (base case)
     __device__ __forceinline__ void CopySegment(
         T*                  /*out*/,            ///< [out] Out array
diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh
index f76131a7856..e050a9b88b4 100644
--- a/cub/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/uninitialized_copy.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/warp/warp_scan.cuh>
diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh
index b909bbf7237..88aa182d02f 100644
--- a/cub/cub/config.cuh
+++ b/cub/cub/config.cuh
@@ -32,6 +32,11 @@
 
 #pragma once
 
+// For `_CCCL_IMPLICIT_SYSTEM_HEADER`
+#include <cuda/std/detail/__config>
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "util_arch.cuh"
 #include "util_compiler.cuh"
 #include "util_cpp_dialect.cuh"
diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh
index c5cac22cf14..81c3d38a98d 100644
--- a/cub/cub/cub.cuh
+++ b/cub/cub/cub.cuh
@@ -36,6 +36,8 @@
 // Static configuration
 #include "config.cuh"
 
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 // Block
 #include "block/block_adjacent_difference.cuh"
 #include "block/block_discontinuity.cuh"
@@ -101,10 +103,8 @@
 
 // Util
 #include "util_allocator.cuh"
-#include "util_arch.cuh"
 #include "util_debug.cuh"
 #include "util_device.cuh"
-#include "util_macro.cuh"
 #include "util_ptx.cuh"
 #include "util_temporary_storage.cuh"
 #include "util_type.cuh"
diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh
index 4154123973a..b3ea6bcf27f 100644
--- a/cub/cub/detail/choose_offset.cuh
+++ b/cub/cub/detail/choose_offset.cuh
@@ -27,7 +27,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 
 #include <cuda/std/iterator>
 #include <cuda/std/type_traits>
diff --git a/cub/cub/detail/cpp_compatibility.cuh b/cub/cub/detail/cpp_compatibility.cuh
index c4fbe649692..c0770598141 100644
--- a/cub/cub/detail/cpp_compatibility.cuh
+++ b/cub/cub/detail/cpp_compatibility.cuh
@@ -17,7 +17,9 @@
 
 #pragma once
 
-#include <cub/util_cpp_dialect.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
 #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr
 #  define CUB_IF_CONSTEXPR if constexpr
diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh
index b8e776db748..a7b025e4cdc 100644
--- a/cub/cub/detail/detect_cuda_runtime.cuh
+++ b/cub/cub/detail/detect_cuda_runtime.cuh
@@ -33,6 +33,11 @@
 
 #pragma once
 
+// We cannot use `cub/config.cuh` here due to circular dependencies
+#include <cuda/std/detail/__config>
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cuda_runtime_api.h>
 
 #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:
diff --git a/cub/cub/detail/device_double_buffer.cuh b/cub/cub/detail/device_double_buffer.cuh
index c427dcb438b..ee7670a06ba 100644
--- a/cub/cub/detail/device_double_buffer.cuh
+++ b/cub/cub/detail/device_double_buffer.cuh
@@ -16,6 +16,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_namespace.cuh>
 
 
diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh
index 9da0a361aff..273d03fd374 100644
--- a/cub/cub/detail/device_synchronize.cuh
+++ b/cub/cub/detail/device_synchronize.cuh
@@ -16,10 +16,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/detect_cuda_runtime.cuh>
 #include <cub/detail/exec_check_disable.cuh>
-#include <cub/util_arch.cuh>
-#include <cub/util_namespace.cuh>
 
 #include <nv/target>
 
diff --git a/cub/cub/detail/exec_check_disable.cuh b/cub/cub/detail/exec_check_disable.cuh
index c5f4b4572c7..8a4fe75fad8 100644
--- a/cub/cub/detail/exec_check_disable.cuh
+++ b/cub/cub/detail/exec_check_disable.cuh
@@ -16,7 +16,9 @@
 
 #pragma once
 
-#include <cub/util_compiler.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
 /**
  * @def CUB_EXEC_CHECK_DISABLE
diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh
index 12e6672b9eb..5dcf6d1f8ea 100644
--- a/cub/cub/detail/strong_load.cuh
+++ b/cub/cub/detail/strong_load.cuh
@@ -32,7 +32,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh
index fd293519a38..ab9805218e1 100644
--- a/cub/cub/detail/strong_store.cuh
+++ b/cub/cub/detail/strong_store.cuh
@@ -32,7 +32,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/cpp_compatibility.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh
index 51cb3cc855c..9881b0950bd 100644
--- a/cub/cub/detail/temporary_storage.cuh
+++ b/cub/cub/detail/temporary_storage.cuh
@@ -16,6 +16,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_namespace.cuh>
 #include <cub/util_temporary_storage.cuh>
 
diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh
index 77903306250..c07ee7a5aad 100644
--- a/cub/cub/detail/type_traits.cuh
+++ b/cub/cub/detail/type_traits.cuh
@@ -32,6 +32,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_cpp_dialect.cuh>
 #include <cub/util_namespace.cuh>
 
@@ -51,7 +55,7 @@ using invoke_result_t =
 
 /// The type of intermediate accumulator (according to P2322R6)
 template <typename Invokable, typename InitT, typename InputT>
-using accumulator_t = 
+using accumulator_t =
   typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
 
 } // namespace detail
diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh
index 2b3e4b1da26..807a458011e 100644
--- a/cub/cub/detail/uninitialized_copy.cuh
+++ b/cub/cub/detail/uninitialized_copy.cuh
@@ -12,9 +12,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,7 +27,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 
 #include <cuda/std/type_traits>
 
@@ -48,7 +51,7 @@ __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
 template <typename T,
           typename U,
           typename ::cuda::std::enable_if<
-            ::cuda::std::is_trivially_copyable<T>::value, 
+            ::cuda::std::is_trivially_copyable<T>::value,
             int
           >::type = 0>
 __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
@@ -56,7 +59,7 @@ __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
   *ptr = ::cuda::std::forward<U>(val);
 }
 
-template <typename T, 
+template <typename T,
          typename U,
          typename ::cuda::std::enable_if<
            !::cuda::std::is_trivially_copyable<T>::value,
diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh
index 0f614d1876d..aedac4f69f9 100644
--- a/cub/cub/device/device_adjacent_difference.cuh
+++ b/cub/cub/device/device_adjacent_difference.cuh
@@ -27,7 +27,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/choose_offset.cuh>
 #include <cub/detail/type_traits.cuh>
 #include <cub/device/dispatch/dispatch_adjacent_difference.cuh>
diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh
index 445f5d862bc..54ac962f273 100644
--- a/cub/cub/device/device_copy.cuh
+++ b/cub/cub/device/device_copy.cuh
@@ -32,7 +32,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh
index 4e71c04ed82..a4cddfd105d 100644
--- a/cub/cub/device/device_histogram.cuh
+++ b/cub/cub/device/device_histogram.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,18 +27,21 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceHistogram provides device-wide parallel operations for 
- *       constructing histogram(s) from a sequence of samples data residing 
+ * @file cub::DeviceHistogram provides device-wide parallel operations for
+ *       constructing histogram(s) from a sequence of data samples residing
  *       within device-accessible memory.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdio.h>
 #include <iterator>
 #include <limits>
 
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_histogram.cuh>
 #include <cub/util_deprecated.cuh>
 
@@ -46,8 +49,8 @@ CUB_NAMESPACE_BEGIN
 
 
 /**
- * @brief DeviceHistogram provides device-wide parallel operations for 
- *        constructing histogram(s) from a sequence of samples data residing 
+ * @brief DeviceHistogram provides device-wide parallel operations for
+ *        constructing histogram(s) from a sequence of data samples residing
  *        within device-accessible memory. ![](histogram_logo.png)
  * @ingroup SingleModule
  *
@@ -67,7 +70,7 @@ struct DeviceHistogram
   //@{
 
   /**
-   * @brief Computes an intensity histogram from a sequence of data samples 
+   * @brief Computes an intensity histogram from a sequence of data samples
    *        using equal-width bins.
    *
    * @par
@@ -97,7 +100,7 @@ struct DeviceHistogram
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input samples and output histogram
    * int      num_samples;    // e.g., 10
    * float*   d_samples;      // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5]
@@ -112,7 +115,7 @@ struct DeviceHistogram
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceHistogram::HistogramEven(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, 
+   *   d_samples, d_histogram, num_levels,
    *   lower_level, upper_level, num_samples);
    *
    * // Allocate temporary storage
@@ -121,56 +124,56 @@ struct DeviceHistogram
    * // Compute histograms
    * cub::DeviceHistogram::HistogramEven(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, 
+   *   d_samples, d_histogram, num_levels,
    *   lower_level, upper_level, num_samples);
    *
    * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
    * @endcode
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   samples \iterator
    *
-   * @tparam CounterT                 
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
    *
-   * @tparam LevelT                   
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
    *
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc.  \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
+   * @param[in] d_samples
    *   The pointer to the input sequence of data samples.
    *
-   * @param[out] d_histogram 
-   *   The pointer to the histogram counter output array of length 
+   * @param[out] d_histogram
+   *   The pointer to the histogram counter output array of length
    *   `num_levels - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples.  
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples.
    *   Implies that the number of bins is `num_levels - 1`.
    *
-   * @param[in] lower_level 
+   * @param[in] lower_level
    *   The lower sample value bound (inclusive) for the lowest histogram bin.
    *
-   * @param[in] upper_level 
+   * @param[in] upper_level
    *   The upper sample value bound (exclusive) for the highest histogram bin.
    *
-   * @param[in] num_samples 
+   * @param[in] num_samples
    *   The number of input samples (i.e., the length of `d_samples`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename SampleIteratorT,
@@ -240,12 +243,12 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes an intensity histogram from a sequence of data samples 
+   * @brief Computes an intensity histogram from a sequence of data samples
    *        using equal-width bins.
    *
    * @par
-   * - A two-dimensional *region of interest* within `d_samples` can be 
-   *   specified using the `num_row_samples`, `num_rows`, and 
+   * - A two-dimensional *region of interest* within `d_samples` can be
+   *   specified using the `num_row_samples`, `num_rows`, and
    *   `row_stride_bytes` parameters.
    * - The row stride must be a whole multiple of the sample data type
    *   size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
@@ -259,10 +262,10 @@ struct DeviceHistogram
    *   `uint64_t`, the cuda error `cudaErrorInvalidValue` is returned. If the common type is 128
    *   bits wide, bin computation will use 128-bit arithmetic and `cudaErrorInvalidValue` will only
    *   be returned if bin computation would overflow for 128-bit arithmetic.
-   * - For a given row `r` in `[0, num_rows)`, let 
-   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and 
+   * - For a given row `r` in `[0, num_rows)`, let
+   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and
    *   `row_end = row_begin + num_row_samples`. The ranges
-   *   `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)` 
+   *   `[row_begin, row_end)` and `[d_histogram, d_histogram + num_levels - 1)`
    *   shall not overlap in any way.
    * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
    *   and SampleT must be valid arithmetic types. The common type must be
@@ -277,7 +280,7 @@ struct DeviceHistogram
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input samples and output histogram
    * int      num_row_samples;    // e.g., 5
    * int      num_rows;           // e.g., 2;
@@ -310,57 +313,57 @@ struct DeviceHistogram
    * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
    * @endcode
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading
    *   input samples. \iterator
    *
-   * @tparam CounterT                 
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
    *
-   * @tparam LevelT                   
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
    *
-   * @tparam OffsetT                  
+   * @tparam OffsetT
    *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
 
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
+   * @param[in] d_samples
    *   The pointer to the input sequence of data samples.
    *
-   * @param[out] d_histogram 
-   *   The pointer to the histogram counter output array of 
+   * @param[out] d_histogram
+   *   The pointer to the histogram counter output array of
    *   length `num_levels - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples.  
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples.
    *   Implies that the number of bins is `num_levels - 1`.
    *
-   * @param[in] lower_level 
+   * @param[in] lower_level
    *   The lower sample value bound (inclusive) for the lowest histogram bin.
    *
-   * @param[in] upper_level 
+   * @param[in] upper_level
    *   The upper sample value bound (exclusive) for the highest histogram bin.
    *
-   * @param[in] num_row_samples 
+   * @param[in] num_row_samples
    *   The number of data samples per row in the region of interest
    *
-   * @param[in] num_rows 
+   * @param[in] num_rows
    *   The number of rows in the region of interest
    *
-   * @param[in] row_stride_bytes 
-   *   The number of bytes between starts of consecutive rows in 
+   * @param[in] row_stride_bytes
+   *   The number of bytes between starts of consecutive rows in
    *   the region of interest
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename SampleIteratorT,
@@ -433,20 +436,20 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes per-channel intensity histograms from a sequence of 
+   * @brief Computes per-channel intensity histograms from a sequence of
    *        multi-channel "pixel" data samples using equal-width bins.
    *
    * @par
    * - The input is a sequence of *pixel* structures, where each pixel comprises
-   *   a record of `NUM_CHANNELS` consecutive data samples 
+   *   a record of `NUM_CHANNELS` consecutive data samples
    *   (e.g., an *RGBA* pixel).
-   * - Of the `NUM_CHANNELS` specified, the function will only compute 
-   *   histograms for the first `NUM_ACTIVE_CHANNELS` 
+   * - Of the `NUM_CHANNELS` specified, the function will only compute
+   *   histograms for the first `NUM_ACTIVE_CHANNELS`
    *   (e.g., only *RGB* histograms from *RGBA* pixel samples).
-   * - The number of histogram bins for channel<sub><em>i</em></sub> is 
+   * - The number of histogram bins for channel<sub><em>i</em></sub> is
    *   `num_levels[i] - 1`.
    * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-   *   have the same width: 
+   *   have the same width:
    *   `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`
    * - If the common type of sample and level is of integral type, the bin for a sample is
    *   computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] -
@@ -473,7 +476,7 @@ struct DeviceHistogram
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input samples and output histograms
    * int              num_pixels;         // e.g., 5
    * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
@@ -490,7 +493,7 @@ struct DeviceHistogram
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceHistogram::MultiHistogramEven<4, 3>(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, 
+   *   d_samples, d_histogram, num_levels,
    *   lower_level, upper_level, num_pixels);
    *
    * // Allocate temporary storage
@@ -499,7 +502,7 @@ struct DeviceHistogram
    * // Compute histograms
    * cub::DeviceHistogram::MultiHistogramEven<4, 3>(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, 
+   *   d_samples, d_histogram, num_levels,
    *   lower_level, upper_level, num_pixels);
    *
    * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
@@ -507,65 +510,65 @@ struct DeviceHistogram
    * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
    * @endcode
    *
-   * @tparam NUM_CHANNELS             
-   *   Number of channels interleaved in the input data (may be greater than 
+   * @tparam NUM_CHANNELS
+   *   Number of channels interleaved in the input data (may be greater than
    *   the number of channels being actively histogrammed)
    *
-   * @tparam NUM_ACTIVE_CHANNELS      
+   * @tparam NUM_ACTIVE_CHANNELS
    *   **[inferred]** Number of channels actively being histogrammed
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading
    *   input samples. \iterator
    *
-   * @tparam CounterT                 
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
    *
-   * @tparam LevelT                   
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
    *
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
-   *   The pointer to the multi-channel input sequence of data samples. 
-   *   The samples from different channels are assumed to be interleaved 
-   *   (e.g., an array of 32-bit pixels where each pixel consists of four 
+   * @param[in] d_samples
+   *   The pointer to the multi-channel input sequence of data samples.
+   *   The samples from different channels are assumed to be interleaved
+   *   (e.g., an array of 32-bit pixels where each pixel consists of four
    *   *RGBA* 8-bit samples).
    *
    * @param[out] d_histogram
-   *   The pointers to the histogram counter output arrays, one for each active 
-   *   channel. For channel<sub><em>i</em></sub>, the allocation length of 
+   *   The pointers to the histogram counter output arrays, one for each active
+   *   channel. For channel<sub><em>i</em></sub>, the allocation length of
    *   `d_histogram[i]` should be `num_levels[i] - 1`.
    *
    * @param[in] num_levels
-   *   The number of boundaries (levels) for delineating histogram samples in 
-   *   each active channel. Implies that the number of bins for 
+   *   The number of boundaries (levels) for delineating histogram samples in
+   *   each active channel. Implies that the number of bins for
    *   channel<sub><em>i</em></sub> is `num_levels[i] - 1`.
    *
    * @param[in] lower_level
-   *   The lower sample value bound (inclusive) for the lowest histogram bin in 
+   *   The lower sample value bound (inclusive) for the lowest histogram bin in
    *   each active channel.
    *
    * @param[in] upper_level
-   *   The upper sample value bound (exclusive) for the highest histogram bin 
+   *   The upper sample value bound (exclusive) for the highest histogram bin
    *   in each active channel.
    *
-   * @param[in] num_pixels 
-   *   The number of multi-channel pixels 
+   * @param[in] num_pixels
+   *   The number of multi-channel pixels
    *   (i.e., the length of `d_samples / NUM_CHANNELS`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <int NUM_CHANNELS,
@@ -635,25 +638,25 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes per-channel intensity histograms from a sequence of 
+   * @brief Computes per-channel intensity histograms from a sequence of
    *        multi-channel "pixel" data samples using equal-width bins.
    *
    * @par
-   * - The input is a sequence of *pixel* structures, where each pixel 
-   *   comprises a record of `NUM_CHANNELS` consecutive data samples 
+   * - The input is a sequence of *pixel* structures, where each pixel
+   *   comprises a record of `NUM_CHANNELS` consecutive data samples
    *   (e.g., an *RGBA* pixel).
-   * - Of the `NUM_CHANNELS` specified, the function will only compute 
-   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only *RGB* 
+   * - Of the `NUM_CHANNELS` specified, the function will only compute
+   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., only *RGB*
    *   histograms from *RGBA* pixel samples).
-   * - A two-dimensional *region of interest* within `d_samples` can be 
-   *   specified using the `num_row_samples`, `num_rows`, and 
+   * - A two-dimensional *region of interest* within `d_samples` can be
+   *   specified using the `num_row_pixels`, `num_rows`, and
    *   `row_stride_bytes` parameters.
    * - The row stride must be a whole multiple of the sample data type
    *   size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
-   * - The number of histogram bins for channel<sub><em>i</em></sub> is 
+   * - The number of histogram bins for channel<sub><em>i</em></sub> is
    *   `num_levels[i] - 1`.
-   * - For channel<sub><em>i</em></sub>, the range of values for all histogram 
-   *   bins have the same width: 
+   * - For channel<sub><em>i</em></sub>, the range of values for all histogram
+   *   bins have the same width:
    *   `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`
    * - If the common type of sample and level is of integral type, the bin for a sample is
    *   computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] -
@@ -663,14 +666,14 @@ struct DeviceHistogram
    *   `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation
    *   will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin
    *   computation would overflow for 128-bit arithmetic.
-   * - For a given row `r` in `[0, num_rows)`, and sample `s` in 
-   *   `[0, num_row_pixels)`, let 
-   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, 
+   * - For a given row `r` in `[0, num_rows)`, and sample `s` in
+   *   `[0, num_row_pixels)`, let
+   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`,
    *   `sample_begin = row_begin + s * NUM_CHANNELS`, and
    *   `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel
-   *    `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges 
-   *   `[sample_begin, sample_end)` and 
-   *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap 
+   *    `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges
+   *   `[sample_begin, sample_end)` and
+   *   `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap
    *   in any way.
    * - `cuda::std::common_type<LevelT, SampleT>` must be valid, and both LevelT
    *   and SampleT must be valid arithmetic types. The common type must be
@@ -678,15 +681,15 @@ struct DeviceHistogram
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the computation of three 256-bin 
-   * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 
+   * The code snippet below illustrates the computation of three 256-bin
+   * *RGB* histograms from a 2x3 region of interest within a flattened 2x4
    * array of quad-channel *RGBA* pixels (8 bits per channel per pixel).
    *
    * @par
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for input 
+   * // Declare, allocate, and initialize device-accessible pointers for input
    * // samples and output histograms
    * int              num_row_pixels;     // e.g., 3
    * int              num_rows;           // e.g., 2
@@ -722,71 +725,71 @@ struct DeviceHistogram
    * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
    * @endcode
    *
-   * @tparam NUM_CHANNELS             
-   *   Number of channels interleaved in the input data (may be greater than 
+   * @tparam NUM_CHANNELS
+   *   Number of channels interleaved in the input data (may be greater than
    *   the number of channels being actively histogrammed)
    *
-   * @tparam NUM_ACTIVE_CHANNELS      
+   * @tparam NUM_ACTIVE_CHANNELS
    *   **[inferred]** Number of channels actively being histogrammed
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   samples. \iterator
    *
-   * @tparam CounterT                 
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
    *
-   * @tparam LevelT                   
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
    *
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
-   *   The pointer to the multi-channel input sequence of data samples. The 
-   *   samples from different channels are assumed to be interleaved (e.g., 
-   *   an array of 32-bit pixels where each pixel consists of four 
+   * @param[in] d_samples
+   *   The pointer to the multi-channel input sequence of data samples. The
+   *   samples from different channels are assumed to be interleaved (e.g.,
+   *   an array of 32-bit pixels where each pixel consists of four
    *   *RGBA* 8-bit samples).
    *
-   * @param[out] d_histogram 
-   *   The pointers to the histogram counter output arrays, one for each 
-   *   active channel. For channel<sub><em>i</em></sub>, the allocation length 
+   * @param[out] d_histogram
+   *   The pointers to the histogram counter output arrays, one for each
+   *   active channel. For channel<sub><em>i</em></sub>, the allocation length
    *   of `d_histogram[i]` should be `num_levels[i] - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples in 
-   *   each active channel. Implies that the number of bins for 
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples in
+   *   each active channel. Implies that the number of bins for
    *   channel<sub><em>i</em></sub> is `num_levels[i] - 1`.
    *
-   * @param[in] lower_level 
-   *   The lower sample value bound (inclusive) for the lowest histogram bin in 
+   * @param[in] lower_level
+   *   The lower sample value bound (inclusive) for the lowest histogram bin in
    *   each active channel.
    *
-   * @param[in] upper_level 
-   *   The upper sample value bound (exclusive) for the highest histogram bin 
+   * @param[in] upper_level
+   *   The upper sample value bound (exclusive) for the highest histogram bin
    *   in each active channel.
    *
-   * @param[in] num_row_pixels 
+   * @param[in] num_row_pixels
    *   The number of multi-channel pixels per row in the region of interest
    *
-   * @param[in] num_rows 
+   * @param[in] num_rows
    *   The number of rows in the region of interest
    *
-   * @param[in] row_stride_bytes 
-   *   The number of bytes between starts of consecutive rows in the region of 
+   * @param[in] row_stride_bytes
+   *   The number of bytes between starts of consecutive rows in the region of
    *   interest
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <int NUM_CHANNELS,
@@ -900,16 +903,16 @@ struct DeviceHistogram
   //@{
 
   /**
-   * @brief Computes an intensity histogram from a sequence of data samples 
+   * @brief Computes an intensity histogram from a sequence of data samples
    *        using the specified bin boundary levels.
    *
    * @par
    * - The number of histogram bins is (`num_levels - 1`)
    * - The value range for bin<sub><em>i</em></sub> is `[level[i], level[i+1])`
-   * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not 
-   *   overlap `[d_samples, d_samples + num_samples)` nor 
-   *   `[d_levels, d_levels + num_levels)` in any way. The ranges 
-   *   `[d_levels, d_levels + num_levels)` and 
+   * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not
+   *   overlap `[d_samples, d_samples + num_samples)` nor
+   *   `[d_levels, d_levels + num_levels)` in any way. The ranges
+   *   `[d_levels, d_levels + num_levels)` and
    *   `[d_samples, d_samples + num_samples)` may overlap.
    * - @devicestorage
    *
@@ -921,7 +924,7 @@ struct DeviceHistogram
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for input 
+   * // Declare, allocate, and initialize device-accessible pointers for input
    * // samples and output histogram
    * int      num_samples;    // e.g., 10
    * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
@@ -949,49 +952,49 @@ struct DeviceHistogram
    *
    * @endcode
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading
    *   input samples.\iterator
    *
-   * @tparam CounterT                 
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
    *
-   * @tparam LevelT                   
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
    *
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
+   * @param[in] d_samples
    *   The pointer to the input sequence of data samples.
    *
-   * @param[out] d_histogram 
-   *   The pointer to the histogram counter output array of length 
+   * @param[out] d_histogram
+   *   The pointer to the histogram counter output array of length
    *   `num_levels - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples.  
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples.
    *   Implies that the number of bins is `num_levels - 1`.
    *
-   * @param[in] d_levels 
-   *   The pointer to the array of boundaries (levels). Bin ranges are defined 
-   *   by consecutive boundary pairings: lower sample value boundaries are 
+   * @param[in] d_levels
+   *   The pointer to the array of boundaries (levels). Bin ranges are defined
+   *   by consecutive boundary pairings: lower sample value boundaries are
    *   inclusive and upper sample value boundaries are exclusive.
    *
-   * @param[in] num_samples 
+   * @param[in] num_samples
    *   The number of data samples per row in the region of interest
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename SampleIteratorT,
@@ -1056,19 +1059,19 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes an intensity histogram from a sequence of data samples 
+   * @brief Computes an intensity histogram from a sequence of data samples
    *        using the specified bin boundary levels.
    *
    * @par
-   * - A two-dimensional *region of interest* within `d_samples` can be 
-   *   specified using the `num_row_samples`, `num_rows`, and 
+   * - A two-dimensional *region of interest* within `d_samples` can be
+   *   specified using the `num_row_samples`, `num_rows`, and
    *   `row_stride_bytes` parameters.
    * - The row stride must be a whole multiple of the sample data type
    *   size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
    * - The number of histogram bins is (`num_levels - 1`)
    * - The value range for bin<sub><em>i</em></sub> is `[level[i], level[i+1])`
-   * - For a given row `r` in `[0, num_rows)`, let 
-   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and 
+   * - For a given row `r` in `[0, num_rows)`, let
+   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and
    *   `row_end = row_begin + num_row_samples`. The range
    *   `[d_histogram, d_histogram + num_levels - 1)` shall not overlap
    *   `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`.
@@ -1116,55 +1119,55 @@ struct DeviceHistogram
    * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
    * @endcode
    *
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading 
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading
    *   input samples. \iterator
-   * 
-   * @tparam CounterT                 
+   *
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
-   * 
-   * @tparam LevelT                   
+   *
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
-   * 
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   *
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
+   * @param[in] d_samples
    *   The pointer to the input sequence of data samples.
    *
-   * @param[out] d_histogram 
-   *   The pointer to the histogram counter output array of length 
+   * @param[out] d_histogram
+   *   The pointer to the histogram counter output array of length
    *   `num_levels - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples.  
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples.
    *   Implies that the number of bins is `num_levels - 1`.
    *
-   * @param[in] d_levels 
-   *   The pointer to the array of boundaries (levels). Bin ranges are defined 
-   *   by consecutive boundary pairings: lower sample value boundaries are 
+   * @param[in] d_levels
+   *   The pointer to the array of boundaries (levels). Bin ranges are defined
+   *   by consecutive boundary pairings: lower sample value boundaries are
    *   inclusive and upper sample value boundaries are exclusive.
    *
-   * @param[in] num_row_samples 
+   * @param[in] num_row_samples
    *   The number of data samples per row in the region of interest
    *
-   * @param[in] num_rows 
+   * @param[in] num_rows
    *   The number of rows in the region of interest
    *
-   * @param[in] row_stride_bytes 
-   *   The number of bytes between starts of consecutive rows in the region 
+   * @param[in] row_stride_bytes
+   *   The number of bytes between starts of consecutive rows in the region
    *   of interest
    *
-   * @param[in] stream 
+   * @param[in] stream
    *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
@@ -1233,24 +1236,24 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes per-channel intensity histograms from a sequence of 
-   *        multi-channel "pixel" data samples using the specified bin 
+   * @brief Computes per-channel intensity histograms from a sequence of
+   *        multi-channel "pixel" data samples using the specified bin
    *        boundary levels.
    *
    * @par
-   * - The input is a sequence of *pixel* structures, where each pixel 
-   *   comprises a record of `NUM_CHANNELS` consecutive data samples 
+   * - The input is a sequence of *pixel* structures, where each pixel
+   *   comprises a record of `NUM_CHANNELS` consecutive data samples
    *   (e.g., an *RGBA* pixel).
-   * - Of the `NUM_CHANNELS` specified, the function will only compute 
-   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms 
+   * - Of the `NUM_CHANNELS` specified, the function will only compute
+   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms
    *   from *RGBA* pixel samples).
-   * - The number of histogram bins for channel<sub><em>i</em></sub> is 
+   * - The number of histogram bins for channel<sub><em>i</em></sub> is
    *   `num_levels[i] - 1`.
-   * - For channel<sub><em>i</em></sub>, the range of values for all histogram 
-   *   bins have the same width: 
+   * - For channel<sub><em>i</em></sub>, the range of values for all histogram
+   *   bins have the same width:
    *   `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`
-   * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the 
-   *   range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall 
+   * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the
+   *   range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall
    *   not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor
    *   `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way.
    *   The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and
@@ -1258,15 +1261,15 @@ struct DeviceHistogram
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the computation of three 4-bin *RGB* 
-   * histograms from a quad-channel sequence of *RGBA* pixels 
+   * The code snippet below illustrates the computation of three 4-bin *RGB*
+   * histograms from a quad-channel sequence of *RGBA* pixels
    * (8 bits per channel per pixel)
    *
    * @par
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input samples and output histograms
    * int            num_pixels;       // e.g., 5
    * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
@@ -1299,63 +1302,63 @@ struct DeviceHistogram
    *
    * @endcode
    *
-   * @tparam NUM_CHANNELS             
-   *   Number of channels interleaved in the input data (may be greater than 
+   * @tparam NUM_CHANNELS
+   *   Number of channels interleaved in the input data (may be greater than
    *   the number of channels being actively histogrammed)
-   * 
-   * @tparam NUM_ACTIVE_CHANNELS      
+   *
+   * @tparam NUM_ACTIVE_CHANNELS
    *   **[inferred]** Number of channels actively being histogrammed
-   * 
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading 
+   *
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading
    *   input samples. \iterator
-   * 
-   * @tparam CounterT                 
+   *
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
-   * 
-   * @tparam LevelT                   
+   *
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
-   * 
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   *
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc. \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
-   *   The pointer to the multi-channel input sequence of data samples. 
-   *   The samples from different channels are assumed to be interleaved (e.g., 
-   *   an array of 32-bit pixels where each pixel consists of four *RGBA* 
+   * @param[in] d_samples
+   *   The pointer to the multi-channel input sequence of data samples.
+   *   The samples from different channels are assumed to be interleaved (e.g.,
+   *   an array of 32-bit pixels where each pixel consists of four *RGBA*
    *   8-bit samples).
    *
-   * @param[out] d_histogram 
-   *   The pointers to the histogram counter output arrays, one for each active 
-   *   channel. For channel<sub><em>i</em></sub>, the allocation length of 
+   * @param[out] d_histogram
+   *   The pointers to the histogram counter output arrays, one for each active
+   *   channel. For channel<sub><em>i</em></sub>, the allocation length of
    *   `d_histogram[i]` should be `num_levels[i] - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples in 
-   *   each active channel. Implies that the number of bins for 
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples in
+   *   each active channel. Implies that the number of bins for
    *   channel<sub><em>i</em></sub> is `num_levels[i] - 1`.
    *
-   * @param[in] d_levels 
-   *   The pointers to the arrays of boundaries (levels), one for each active 
-   *   channel. Bin ranges are defined by consecutive boundary pairings: lower 
-   *   sample value boundaries are inclusive and upper sample value boundaries 
+   * @param[in] d_levels
+   *   The pointers to the arrays of boundaries (levels), one for each active
+   *   channel. Bin ranges are defined by consecutive boundary pairings: lower
+   *   sample value boundaries are inclusive and upper sample value boundaries
    *   are exclusive.
    *
-   * @param[in] num_pixels 
-   *   The number of multi-channel pixels 
+   * @param[in] num_pixels
+   *   The number of multi-channel pixels
    *   (i.e., the length of `d_samples / NUM_CHANNELS`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <int NUM_CHANNELS,
@@ -1421,50 +1424,50 @@ struct DeviceHistogram
   }
 
   /**
-   * @brief Computes per-channel intensity histograms from a sequence of 
-   *        multi-channel "pixel" data samples using the specified bin boundary 
+   * @brief Computes per-channel intensity histograms from a sequence of
+   *        multi-channel "pixel" data samples using the specified bin boundary
    *        levels.
    *
    * @par
    * - The input is a sequence of *pixel* structures, where each pixel comprises
-   *   a record of `NUM_CHANNELS` consecutive data samples 
+   *   a record of `NUM_CHANNELS` consecutive data samples
    *   (e.g., an *RGBA* pixel).
-   * - Of the `NUM_CHANNELS` specified, the function will only compute 
-   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms 
+   * - Of the `NUM_CHANNELS` specified, the function will only compute
+   *   histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms
    *   from *RGBA* pixel samples).
-   * - A two-dimensional *region of interest* within `d_samples` can be 
-   *   specified using the `num_row_samples`, `num_rows`, and `row_stride_bytes` 
+   * - A two-dimensional *region of interest* within `d_samples` can be
+   *   specified using the `num_row_pixels`, `num_rows`, and `row_stride_bytes`
    *   parameters.
    * - The row stride must be a whole multiple of the sample data type
    *   size, i.e., `(row_stride_bytes % sizeof(SampleT)) == 0`.
-   * - The number of histogram bins for channel<sub><em>i</em></sub> is 
+   * - The number of histogram bins for channel<sub><em>i</em></sub> is
    *   `num_levels[i] - 1`.
-   * - For channel<sub><em>i</em></sub>, the range of values for all histogram 
-   *   bins have the same width: 
+   * - For channel<sub><em>i</em></sub>, the range of values for all histogram
+   *   bins have the same width:
    *   `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`
-   * - For a given row `r` in `[0, num_rows)`, and sample `s` in 
-   *   `[0, num_row_pixels)`, let 
-   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, 
+   * - For a given row `r` in `[0, num_rows)`, and sample `s` in
+   *   `[0, num_row_pixels)`, let
+   *   `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`,
    *   `sample_begin = row_begin + s * NUM_CHANNELS`, and
    *   `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels
    *    `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range
-   *   `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not 
+   *   `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not
    *   overlap `[sample_begin, sample_end)` nor
    *   `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges
-   *   `[d_levels[c2], d_levels[c2] + num_levels[c2])` and 
+   *   `[d_levels[c2], d_levels[c2] + num_levels[c2])` and
    *   `[sample_begin, sample_end)` may overlap.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the computation of three 4-bin *RGB* 
-   * histograms from a 2x3 region of interest of within a flattened 2x4 array 
+   * The code snippet below illustrates the computation of three 4-bin *RGB*
+   * histograms from a 2x3 region of interest within a flattened 2x4 array
    * of quad-channel *RGBA* pixels (8 bits per channel per pixel).
    *
    * @par
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for input 
+   * // Declare, allocate, and initialize device-accessible pointers for input
    * // samples and output histograms
    * int              num_row_pixels;     // e.g., 3
    * int              num_rows;           // e.g., 2
@@ -1483,7 +1486,7 @@ struct DeviceHistogram
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceHistogram::MultiHistogramRange<4, 3>(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, d_levels, 
+   *   d_samples, d_histogram, num_levels, d_levels,
    *   num_row_pixels, num_rows, row_stride_bytes);
    *
    * // Allocate temporary storage
@@ -1492,7 +1495,7 @@ struct DeviceHistogram
    * // Compute histograms
    * cub::DeviceHistogram::MultiHistogramRange<4, 3>(
    *   d_temp_storage, temp_storage_bytes,
-   *   d_samples, d_histogram, num_levels, 
+   *   d_samples, d_histogram, num_levels,
    *   d_levels, num_row_pixels, num_rows, row_stride_bytes);
    *
    * // d_histogram   <-- [ [2, 3, 0, 1],
@@ -1501,68 +1504,68 @@ struct DeviceHistogram
    *
    * @endcode
    *
-   * @tparam NUM_CHANNELS             
-   *   Number of channels interleaved in the input data (may be greater than 
+   * @tparam NUM_CHANNELS
+   *   Number of channels interleaved in the input data (may be greater than
    *   the number of channels being actively histogrammed)
-   * 
-   * @tparam NUM_ACTIVE_CHANNELS      
+   *
+   * @tparam NUM_ACTIVE_CHANNELS
    *   **[inferred]** Number of channels actively being histogrammed
-   * 
-   * @tparam SampleIteratorT          
-   *   **[inferred]** Random-access input iterator type for reading input 
+   *
+   * @tparam SampleIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   samples. \iterator
-   * 
-   * @tparam CounterT                 
+   *
+   * @tparam CounterT
    *   **[inferred]** Integer type for histogram bin counters
-   * 
-   * @tparam LevelT                   
+   *
+   * @tparam LevelT
    *   **[inferred]** Type for specifying boundaries (levels)
-   * 
-   * @tparam OffsetT                  
-   *   **[inferred]** Signed integer type for sequence offsets, list lengths, 
+   *
+   * @tparam OffsetT
+   *   **[inferred]** Signed integer type for sequence offsets, list lengths,
    *   pointer differences, etc.  \offset_size1
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
    *   required allocation size is written to \p temp_storage_bytes and no work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_samples 
-   *   The pointer to the multi-channel input sequence of data samples. The 
-   *   samples from different channels are assumed to be interleaved (e.g., an 
-   *   array of 32-bit pixels where each pixel consists of four 
+   * @param[in] d_samples
+   *   The pointer to the multi-channel input sequence of data samples. The
+   *   samples from different channels are assumed to be interleaved (e.g., an
+   *   array of 32-bit pixels where each pixel consists of four
    *   *RGBA* 8-bit samples).
    *
-   * @param[out] d_histogram 
-   *   The pointers to the histogram counter output arrays, one for each active 
-   *   channel. For channel<sub><em>i</em></sub>, the allocation length of 
+   * @param[out] d_histogram
+   *   The pointers to the histogram counter output arrays, one for each active
+   *   channel. For channel<sub><em>i</em></sub>, the allocation length of
    *   `d_histogram[i]` should be `num_levels[i] - 1`.
    *
-   * @param[in] num_levels 
-   *   The number of boundaries (levels) for delineating histogram samples in 
-   *   each active channel. Implies that the number of bins for 
+   * @param[in] num_levels
+   *   The number of boundaries (levels) for delineating histogram samples in
+   *   each active channel. Implies that the number of bins for
    *   channel<sub><em>i</em></sub> is `num_levels[i] - 1`.
    *
-   * @param[in] d_levels 
-   *   The pointers to the arrays of boundaries (levels), one for each active 
-   *   channel. Bin ranges are defined by consecutive boundary pairings: lower 
-   *   sample value boundaries are inclusive and upper sample value boundaries 
+   * @param[in] d_levels
+   *   The pointers to the arrays of boundaries (levels), one for each active
+   *   channel. Bin ranges are defined by consecutive boundary pairings: lower
+   *   sample value boundaries are inclusive and upper sample value boundaries
    *   are exclusive.
    *
-   * @param[in] num_row_pixels 
+   * @param[in] num_row_pixels
    *   The number of multi-channel pixels per row in the region of interest
    *
-   * @param[in] num_rows 
+   * @param[in] num_rows
    *   The number of rows in the region of interest
    *
-   * @param[in] row_stride_bytes 
-   *   The number of bytes between starts of consecutive rows in the 
+   * @param[in] row_stride_bytes
+   *   The number of bytes between starts of consecutive rows in the
    *   region of interest
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <int NUM_CHANNELS,
diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh
index a0f8fe5415f..9171a67e2bc 100644
--- a/cub/cub/device/device_memcpy.cuh
+++ b/cub/cub/device/device_memcpy.cuh
@@ -32,7 +32,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
 
 #include <cstdint>
diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh
index 9b17ac39125..7806921ab12 100644
--- a/cub/cub/device/device_merge_sort.cuh
+++ b/cub/cub/device/device_merge_sort.cuh
@@ -27,7 +27,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_merge_sort.cuh>
 #include <cub/util_deprecated.cuh>
 #include <cub/util_namespace.cuh>
diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh
index 6692a489b20..491e8b2e56b 100644
--- a/cub/cub/device/device_partition.cuh
+++ b/cub/cub/device/device_partition.cuh
@@ -34,10 +34,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdio.h>
 #include <iterator>
 
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_select_if.cuh>
 #include <cub/device/dispatch/dispatch_three_way_partition.cuh>
 #include <cub/util_deprecated.cuh>
@@ -85,9 +88,9 @@ struct DevicePartition
      * - Copies of the selected items are compacted into @p d_out and maintain
      *   their original relative ordering, however copies of the unselected
      *   items are compacted into the rear of @p d_out in reverse order.
-     * - The range `[d_out, d_out + num_items)` shall not overlap 
-     *   `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any 
-     *   way. The range `[d_in, d_in + num_items)` may overlap 
+     * - The range `[d_out, d_out + num_items)` shall not overlap
+     *   `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any
+     *   way. The range `[d_in, d_in + num_items)` may overlap
      *  `[d_flags, d_flags + num_items)`.
      * - \devicestorage
      *
@@ -251,8 +254,8 @@ struct DevicePartition
      * - Copies of the selected items are compacted into @p d_out and maintain
      *   their original relative ordering, however copies of the unselected
      *   items are compacted into the rear of @p d_out in reverse order.
-     * - The range `[d_out, d_out + num_items)` shall not overlap 
-     *   `[d_in, d_in + num_items)` in any way. 
+     * - The range `[d_out, d_out + num_items)` shall not overlap
+     *   `[d_in, d_in + num_items)` in any way.
      * - \devicestorage
      *
      * @par Performance
@@ -451,10 +454,10 @@ struct DevicePartition
      * - Copies of the unselected items are compacted into the
      *   @p d_unselected_out in reverse order.
      * - The ranges `[d_out, d_out + num_items)`,
-     *   `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, 
-     *   `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, 
-     *   `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, 
-     *   shall not overlap in any way. 
+     *   `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`,
+     *   `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`,
+     *   `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`,
+     *   shall not overlap in any way.
      *
      * @par Snippet
      * The code snippet below illustrates how this algorithm can partition an
diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh
index e9d1b4d1d91..6644adff2a7 100644
--- a/cub/cub/device/device_radix_sort.cuh
+++ b/cub/cub/device/device_radix_sort.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -26,13 +26,16 @@
  *
  ******************************************************************************/
 
-//! @file cub::DeviceRadixSort provides device-wide, parallel operations for 
-//!       computing a radix sort across a sequence of data items residing within 
+//! @file cub::DeviceRadixSort provides device-wide, parallel operations for
+//!       computing a radix sort across a sequence of data items residing within
 //!       device-accessible memory.
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_radix_sort.cuh>
 #include <cub/util_deprecated.cuh>
@@ -43,33 +46,33 @@
 CUB_NAMESPACE_BEGIN
 
 
-//! @brief DeviceRadixSort provides device-wide, parallel operations for 
-//!        computing a radix sort across a sequence of data items residing 
+//! @brief DeviceRadixSort provides device-wide, parallel operations for
+//!        computing a radix sort across a sequence of data items residing
 //!        within device-accessible memory. ![](sorting_logo.png)
 //! @ingroup SingleModule
-//! 
+//!
 //! @par Overview
-//! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) 
-//! arranges items into ascending (or descending) order. The algorithm relies 
-//! upon a positional representation for keys, i.e., each key is comprised of an 
-//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from 
-//! least-significant to most-significant. For a given input sequence of keys 
-//! and a set of rules specifying a total ordering of the symbolic alphabet, the 
+//! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort)
+//! arranges items into ascending (or descending) order. The algorithm relies
+//! upon a positional representation for keys, i.e., each key is comprised of an
+//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
+//! least-significant to most-significant. For a given input sequence of keys
+//! and a set of rules specifying a total ordering of the symbolic alphabet, the
 //! radix sorting method produces a lexicographic ordering of those keys.
-//! 
+//!
 //! @par Supported Types
 //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
 //! (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half`
-//! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are 
+//! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are
 //! supported as long as decomposer object is provided.
-//! 
+//!
 //! @par Floating-Point Special Cases
-//! 
+//!
 //! - Positive and negative zeros are considered equivalent, and will be treated
 //!   as such in the output.
 //! - No special handling is implemented for NaN values; these are sorted
 //!   according to their bit representations after any transformations.
-//! 
+//!
 //! @par Transformations
 //! Although the direct radix sorting method can only be applied to unsigned
 //! integral types, DeviceRadixSort is able to sort signed and floating-point
@@ -78,41 +81,41 @@ CUB_NAMESPACE_BEGIN
 //! transformations must be considered when restricting the
 //! `[begin_bit, end_bit)` range, as the bitwise transformations will occur
 //! before the bit-range truncation.
-//! 
+//!
 //! Any transformations applied to the keys prior to sorting are reversed
 //! while writing to the final output buffer.
-//! 
+//!
 //! @par Type Specific Bitwise Transformations
 //! To convert the input values into a radix-sortable bitwise representation,
 //! the following transformations take place prior to sorting:
-//! 
+//!
 //! - For unsigned integral values, the keys are used directly.
 //! - For signed integral values, the sign bit is inverted.
 //! - For positive floating point values, the sign bit is inverted.
 //! - For negative floating point values, the full key is inverted.
-//! 
+//!
 //! For floating point types, positive and negative zero are a special case and
 //! will be considered equivalent during sorting.
-//! 
+//!
 //! @par Descending Sort Bitwise Transformations
 //! If descending sort is used, the keys are inverted after performing any
 //! type-specific transformations, and the resulting keys are sorted in ascending
 //! order.
-//! 
+//!
 //! @par Stability
 //! DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are
 //! considered equal and appear in the result in the same order as they appear in
 //! the input.
-//! 
+//!
 //! @par Usage Considerations
 //! @cdp_class{DeviceRadixSort}
-//! 
+//!
 //! @par Performance
-//! @linear_performance{radix sort} The following chart illustrates 
-//! DeviceRadixSort::SortKeys performance across different CUDA architectures 
+//! @linear_performance{radix sort} The following chart illustrates
+//! DeviceRadixSort::SortKeys performance across different CUDA architectures
 //! for uniform-random `uint32` keys.
 //! @plots_below
-//! 
+//!
 //! @image html lsb_radix_sort_int32_keys.png
 struct DeviceRadixSort
 {
@@ -203,7 +206,7 @@ public:
   //! @name KeyT-value pairs
   //@{
 
-  //! @brief Sorts key-value pairs into ascending order. 
+  //! @brief Sorts key-value pairs into ascending order.
   //!        (`~2N` auxiliary storage required)
   //!
   //! @par
@@ -216,15 +219,15 @@ public:
   //!   - `[d_keys_out,   d_keys_out   + num_items)`
   //!   - `[d_values_in,  d_values_in  + num_items)`
   //!   - `[d_values_out, d_values_out + num_items)`
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+  //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! - @devicestorage
   //!
   //! @par Performance
-  //! The following charts illustrate saturated sorting performance across 
+  //! The following charts illustrate saturated sorting performance across
   //! different CUDA architectures for uniform-random `uint32, uint32` and
   //! `uint64, uint64` pairs, respectively.
   //!
@@ -236,10 +239,10 @@ public:
   //! keys with associated vector of `int` values.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -265,49 +268,49 @@ public:
   //! // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., sizeof(unsigned int) * 8)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -380,9 +383,9 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -394,10 +397,10 @@ public:
   //!   * ``[d_values_in,  d_values_in  + num_items)``
   //!   * ``[d_values_out, d_values_out + num_items)``
   //!
-  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify 
-  //!   differentiating key bits. This can reduce overall sorting overhead and 
+  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
+  //!   differentiating key bits. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -417,7 +420,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairs``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -426,62 +429,62 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -530,9 +533,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -544,7 +547,7 @@ public:
   //!   * ``[d_values_in,  d_values_in  + num_items)``
   //!   * ``[d_values_out, d_values_out + num_items)``
   //!
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -564,7 +567,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairs``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -573,54 +576,54 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -665,7 +668,7 @@ public:
                                                              stream);
   }
 
-  //! @brief Sorts key-value pairs into ascending order. 
+  //! @brief Sorts key-value pairs into ascending order.
   //!        (`~N` auxiliary storage required)
   //!
   //! @par
@@ -673,7 +676,7 @@ public:
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! - The contents of both buffers within each pair may be altered by the 
+  //! - The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! - In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -681,18 +684,18 @@ public:
   //!   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!   - `[d_values.Current(),   d_values.Current()   + num_items)`
   //!   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
   //!
   //! @par Performance
-  //! The following charts illustrate saturated sorting performance across 
+  //! The following charts illustrate saturated sorting performance across
   //! different CUDA architectures for uniform-random `uint32, uint32` and
   //! `uint64, uint64` pairs, respectively.
   //!
@@ -700,14 +703,14 @@ public:
   //! @image html lsb_radix_sort_int64_pairs.png
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of `int` 
+  //! The code snippet below illustrates the sorting of a device vector of `int`
   //! keys with associated vector of `int` values.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers for 
+  //! // Declare, allocate, and initialize device-accessible pointers for
   //! // sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -738,45 +741,45 @@ public:
   //!
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
   //!   required allocation size is written to @p temp_storage_bytes and no work is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -833,14 +836,14 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers and a corresponding
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! * The contents of both buffers within each pair may be altered by the 
+  //! * The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! * In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -850,9 +853,9 @@ public:
   //!   - ``[d_values.Current(),   d_values.Current()   + num_items)``
   //!   - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
   //!
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! - @devicestorageP
   //! - @devicestorage
@@ -873,7 +876,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairs``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -882,51 +885,51 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -963,14 +966,14 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers and a corresponding
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! * The contents of both buffers within each pair may be altered by the 
+  //! * The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! * In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -980,12 +983,12 @@ public:
   //!   - ``[d_values.Current(),   d_values.Current()   + num_items)``
   //!   - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
   //!
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
@@ -1006,7 +1009,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairs``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -1015,59 +1018,59 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -1108,7 +1111,7 @@ public:
                                                              stream);
   }
 
-  //! @brief Sorts key-value pairs into descending order. 
+  //! @brief Sorts key-value pairs into descending order.
   //!        (`~2N` auxiliary storage required).
   //!
   //! @par
@@ -1121,10 +1124,10 @@ public:
   //!   - `[d_keys_out,   d_keys_out   + num_items)`
   //!   - `[d_values_in,  d_values_in  + num_items)`
   //!   - `[d_values_out, d_values_out + num_items)`
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! - @devicestorageNP  For sorting using only `O(P)` temporary storage, see 
+  //! - @devicestorageNP  For sorting using only `O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! - @devicestorage
   //!
@@ -1132,14 +1135,14 @@ public:
   //! Performance is similar to DeviceRadixSort::SortPairs.
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of `int` 
+  //! The code snippet below illustrates the sorting of a device vector of `int`
   //! keys with associated vector of `int` values.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -1167,49 +1170,49 @@ public:
   //! // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of @p d_temp_storage allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -1279,9 +1282,9 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -1293,10 +1296,10 @@ public:
   //!   * ``[d_values_in,  d_values_in  + num_items)``
   //!   * ``[d_values_out, d_values_out + num_items)``
   //!
-  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify 
-  //!   differentiating key bits. This can reduce overall sorting overhead and 
+  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
+  //!   differentiating key bits. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -1316,7 +1319,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairsDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -1325,62 +1328,62 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -1429,9 +1432,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -1443,7 +1446,7 @@ public:
   //!   * ``[d_values_in,  d_values_in  + num_items)``
   //!   * ``[d_values_out, d_values_out + num_items)``
   //!
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -1463,7 +1466,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairsDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -1472,54 +1475,54 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] d_values_in 
+  //! @param[in] d_values_in
   //!   Pointer to the corresponding input sequence of associated value items
   //!
-  //! @param[out] d_values_out 
-  //!   Pointer to the correspondingly-reordered output sequence of associated 
+  //! @param[out] d_values_out
+  //!   Pointer to the correspondingly-reordered output sequence of associated
   //!   value items
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -1564,7 +1567,7 @@ public:
                                                              stream);
   }
 
-  //! @brief Sorts key-value pairs into descending order. 
+  //! @brief Sorts key-value pairs into descending order.
   //!        (`~N` auxiliary storage required).
   //!
   //! @par
@@ -1572,7 +1575,7 @@ public:
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! - The contents of both buffers within each pair may be altered by the 
+  //! - The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! - In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -1580,12 +1583,12 @@ public:
   //!   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!   - `[d_values.Current(),   d_values.Current()   + num_items)`
   //!   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the number 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the number
   //!   of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
@@ -1594,14 +1597,14 @@ public:
   //! Performance is similar to DeviceRadixSort::SortPairs.
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of `int` 
+  //! The code snippet below illustrates the sorting of a device vector of `int`
   //! keys with associated vector of `int` values.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -1631,46 +1634,46 @@ public:
   //! // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within.  
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -1727,14 +1730,14 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers and a corresponding
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! * The contents of both buffers within each pair may be altered by the 
+  //! * The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! * In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -1744,9 +1747,9 @@ public:
   //!   - ``[d_values.Current(),   d_values.Current()   + num_items)``
   //!   - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
   //!
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! - @devicestorageP
   //! - @devicestorage
@@ -1767,7 +1770,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairsDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -1776,51 +1779,51 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -1857,14 +1860,14 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers and a corresponding
   //!   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
   //!   structure that indicates which of the two buffers is "current" (and thus
   //!   contains the input data to be sorted).
-  //! * The contents of both buffers within each pair may be altered by the 
+  //! * The contents of both buffers within each pair may be altered by the
   //!   sorting operation.
   //! * In-place operations are not supported. There must be no overlap between
   //!   any of the provided ranges:
@@ -1874,12 +1877,12 @@ public:
   //!   - ``[d_values.Current(),   d_values.Current()   + num_items)``
   //!   - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
   //!
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within each DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within each DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
@@ -1900,7 +1903,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortPairsDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -1909,59 +1912,59 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam ValueT    
+  //! @tparam ValueT
   //!   **[inferred]** ValueT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in,out] d_values 
-  //!   Double-buffer of values whose "current" device-accessible buffer 
-  //!   contains the unsorted input values and, upon return, is updated to point 
+  //! @param[in,out] d_values
+  //!   Double-buffer of values whose "current" device-accessible buffer
+  //!   contains the unsorted input values and, upon return, is updated to point
   //!   to the sorted output values
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -2009,7 +2012,7 @@ public:
   //@{
 
 
-  //! @brief Sorts keys into ascending order. 
+  //! @brief Sorts keys into ascending order.
   //!        (`~2N` auxiliary storage required)
   //!
   //! @par
@@ -2020,30 +2023,30 @@ public:
   //!   any of the provided ranges:
   //!   - `[d_keys_in,    d_keys_in    + num_items)`
   //!   - `[d_keys_out,   d_keys_out   + num_items)`
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! - @devicestorageNP  For sorting using only `O(P)` temporary storage, see 
+  //! - @devicestorageNP  For sorting using only `O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! - @devicestorage
   //!
   //! @par Performance
-  //! The following charts illustrate saturated sorting performance across 
-  //! different CUDA architectures for uniform-random `uint32` and `uint64` 
+  //! The following charts illustrate saturated sorting performance across
+  //! different CUDA architectures for uniform-random `uint32` and `uint64`
   //! keys, respectively.
   //!
   //! @image html lsb_radix_sort_int32_keys.png
   //! @image html lsb_radix_sort_int64_keys.png
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of 
+  //! The code snippet below illustrates the sorting of a device vector of
   //! `int` keys.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -2066,42 +2069,42 @@ public:
   //! // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -2138,9 +2141,9 @@ public:
       stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -2150,10 +2153,10 @@ public:
   //!   * ``[d_keys_in,  d_keys_in  + num_items)``
   //!   * ``[d_keys_out, d_keys_out + num_items)``
   //!
-  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify 
-  //!   differentiating key bits. This can reduce overall sorting overhead and 
+  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
+  //!   differentiating key bits. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -2173,7 +2176,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeys``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -2182,52 +2185,52 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -2274,9 +2277,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -2286,10 +2289,10 @@ public:
   //!   * ``[d_keys_in,  d_keys_in  + num_items)``
   //!   * ``[d_keys_out, d_keys_out + num_items)``
   //!
-  //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -2309,7 +2312,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeys``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -2318,44 +2321,44 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -2437,33 +2440,33 @@ public:
   //!   any of the provided ranges:
   //!   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
   //!   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
   //!
   //! @par Performance
-  //! The following charts illustrate saturated sorting performance across 
-  //! different CUDA architectures for uniform-random `uint32` and `uint64` 
+  //! The following charts illustrate saturated sorting performance across
+  //! different CUDA architectures for uniform-random `uint32` and `uint64`
   //! keys, respectively.
   //!
   //! @image html lsb_radix_sort_int32_keys.png
   //! @image html lsb_radix_sort_int64_keys.png
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of 
+  //! The code snippet below illustrates the sorting of a device vector of
   //! `int` keys.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -2489,38 +2492,38 @@ public:
   //! // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -2577,9 +2580,9 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers managed by a
   //!   DoubleBuffer structure that indicates which of the two buffers is
   //!   "current" (and thus contains the input data to be sorted).
@@ -2590,9 +2593,9 @@ public:
   //!   * ``[d_keys.Current(),     d_keys.Current()     + num_items)``
   //!   * ``[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!
-  //! * Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! * Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! * @devicestorageP
   //! * @devicestorage
@@ -2613,7 +2616,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeys``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -2622,43 +2625,43 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -2695,9 +2698,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers managed by a
   //!   DoubleBuffer structure that indicates which of the two buffers is
   //!   "current" (and thus contains the input data to be sorted).
@@ -2708,12 +2711,12 @@ public:
   //!   * ``[d_keys.Current(),     d_keys.Current()     + num_items)``
   //!   * ``[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!
-  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify 
-  //!   differentiating key bits. This can reduce overall sorting overhead and 
+  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
+  //!   differentiating key bits. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! * Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! * @devicestorageP
   //! * @devicestorage
@@ -2734,7 +2737,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeys``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -2743,51 +2746,51 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -2827,8 +2830,8 @@ public:
                                                              end_bit,
                                                              stream);
   }
-  
-  //! @brief Sorts keys into descending order. 
+
+  //! @brief Sorts keys into descending order.
   //!        (`~2N` auxiliary storage required).
   //!
   //! @par
@@ -2839,10 +2842,10 @@ public:
   //!   any of the provided ranges:
   //!   - `[d_keys_in,    d_keys_in    + num_items)`
   //!   - `[d_keys_out,   d_keys_out   + num_items)`
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+  //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! - @devicestorage
   //!
@@ -2850,14 +2853,14 @@ public:
   //! Performance is similar to DeviceRadixSort::SortKeys.
   //!
   //! @par Snippet
-  //! The code snippet below illustrates the sorting of a device vector of 
+  //! The code snippet below illustrates the sorting of a device vector of
   //! `int` keys.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -2870,7 +2873,7 @@ public:
   //! // Determine temporary device storage requirements
   //! void     *d_temp_storage = NULL;
   //! size_t   temp_storage_bytes = 0;
-  //! cub::DeviceRadixSort::SortKeysDescending( 
+  //! cub::DeviceRadixSort::SortKeysDescending(
   //!   d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
   //!
   //! // Allocate temporary storage
@@ -2884,39 +2887,39 @@ public:
   //!
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within.  
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -2979,9 +2982,9 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -2991,10 +2994,10 @@ public:
   //!   * ``[d_keys_in,  d_keys_in  + num_items)``
   //!   * ``[d_keys_out, d_keys_out + num_items)``
   //!
-  //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -3014,7 +3017,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeysDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -3023,52 +3026,52 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -3115,9 +3118,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
-  //! 
+  //!
   //! * The contents of the input data are not altered by the sorting operation.
   //! * Pointers to contiguous memory must be used; iterators are not currently
   //!   supported.
@@ -3127,7 +3130,7 @@ public:
   //!   * ``[d_keys_in,  d_keys_in  + num_items)``
   //!   * ``[d_keys_out, d_keys_out + num_items)``
   //!
-  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see 
+  //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
   //!   the sorting interface using DoubleBuffer wrappers below.
   //! * @devicestorage
   //!
@@ -3147,7 +3150,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeysDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -3156,44 +3159,44 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in] d_keys_in 
+  //! @param[in] d_keys_in
   //!   Pointer to the input data of key data to sort
   //!
-  //! @param[out] d_keys_out 
+  //! @param[out] d_keys_out
   //!   Pointer to the sorted output sequence of key data
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -3236,7 +3239,7 @@ public:
                                                              stream);
   }
 
-  //! @brief Sorts keys into descending order. 
+  //! @brief Sorts keys into descending order.
   //!        (`~N` auxiliary storage required).
   //!
   //! @par
@@ -3248,12 +3251,12 @@ public:
   //!   any of the provided ranges:
   //!   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
   //!   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-  //! - Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! - Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
-  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-  //!   bits can be specified. This can reduce overall sorting overhead and 
+  //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+  //!   bits can be specified. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
   //! - @devicestorageP
   //! - @devicestorage
@@ -3265,10 +3268,10 @@ public:
   //! The code snippet below illustrates the sorting of a device vector of @p int keys.
   //! @par
   //! @code
-  //! #include <cub/cub.cuh>   
+  //! #include <cub/cub.cuh>
   //! // or equivalently <cub/device/device_radix_sort.cuh>
   //!
-  //! // Declare, allocate, and initialize device-accessible pointers 
+  //! // Declare, allocate, and initialize device-accessible pointers
   //! // for sorting data
   //! int  num_items;          // e.g., 7
   //! int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -3294,38 +3297,38 @@ public:
   //! // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
   //! @endcode
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `sizeof(unsigned int) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t
@@ -3382,9 +3385,9 @@ public:
   }
   #endif
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers managed by a
   //!   DoubleBuffer structure that indicates which of the two buffers is
   //!   "current" (and thus contains the input data to be sorted).
@@ -3395,9 +3398,9 @@ public:
   //!   * ``[d_keys.Current(),     d_keys.Current()     + num_items)``
   //!   * ``[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!
-  //! * Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! * Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! * @devicestorageP
   //! * @devicestorage
@@ -3418,7 +3421,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeysDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -3427,43 +3430,43 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
@@ -3500,9 +3503,9 @@ public:
                                                              stream);
   }
 
-  //! @rst 
+  //! @rst
   //! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
-  //! 
+  //!
   //! * The sorting operation is given a pair of key buffers managed by a
   //!   DoubleBuffer structure that indicates which of the two buffers is
   //!   "current" (and thus contains the input data to be sorted).
@@ -3513,12 +3516,12 @@ public:
   //!   * ``[d_keys.Current(),     d_keys.Current()     + num_items)``
   //!   * ``[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
   //!
-  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify 
-  //!   differentiating key bits. This can reduce overall sorting overhead and 
+  //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
+  //!   differentiating key bits. This can reduce overall sorting overhead and
   //!   yield a corresponding performance improvement.
-  //! * Upon completion, the sorting operation will update the "current" 
-  //!   indicator within the DoubleBuffer wrapper to reference which of the two 
-  //!   buffers now contains the sorted output sequence (a function of the 
+  //! * Upon completion, the sorting operation will update the "current"
+  //!   indicator within the DoubleBuffer wrapper to reference which of the two
+  //!   buffers now contains the sorted output sequence (a function of the
   //!   number of key bits specified and the targeted device architecture).
   //! * @devicestorageP
   //! * @devicestorage
@@ -3539,7 +3542,7 @@ public:
   //!
   //! The following snippet shows how to sort an array of ``custom_t`` objects
   //! using ``cub::DeviceRadixSort::SortKeysDescending``:
-  //! 
+  //!
   //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu
   //!     :language: c++
   //!     :dedent:
@@ -3548,51 +3551,51 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam KeyT      
+  //! @tparam KeyT
   //!   **[inferred]** KeyT type
   //!
-  //! @tparam NumItemsT 
+  //! @tparam NumItemsT
   //!   **[inferred]** Type of num_items
   //!
   //! @tparam DecomposerT
-  //!   **[inferred]** Type of a callable object responsible for decomposing a 
+  //!   **[inferred]** Type of a callable object responsible for decomposing a
   //!   ``KeyT`` into a tuple of references to its constituent arithmetic types:
-  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``. 
-  //!   The leftmost element of the tuple is considered the most significant. 
+  //!   ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
+  //!   The leftmost element of the tuple is considered the most significant.
   //!   The call operator must not modify members of the key.
   //!
-  //! @param[in] d_temp_storage 
-  //!   Device-accessible allocation of temporary storage. When `nullptr`, the 
-  //!   required allocation size is written to `temp_storage_bytes` and no work 
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
+  //!   required allocation size is written to `temp_storage_bytes` and no work
   //!   is done.
   //!
-  //! @param[in,out] temp_storage_bytes 
+  //! @param[in,out] temp_storage_bytes
   //!   Reference to size in bytes of `d_temp_storage` allocation
   //!
-  //! @param[in,out] d_keys 
-  //!   Reference to the double-buffer of keys whose "current" device-accessible 
-  //!   buffer contains the unsorted input keys and, upon return, is updated to 
+  //! @param[in,out] d_keys
+  //!   Reference to the double-buffer of keys whose "current" device-accessible
+  //!   buffer contains the unsorted input keys and, upon return, is updated to
   //!   point to the sorted output keys
   //!
-  //! @param[in] num_items 
+  //! @param[in] num_items
   //!   Number of items to sort
   //!
   //! @param decomposer
   //!   Callable object responsible for decomposing a ``KeyT`` into a tuple of
-  //!   references to its constituent arithmetic types. The leftmost element of 
-  //!   the tuple is considered the most significant. The call operator must not 
+  //!   references to its constituent arithmetic types. The leftmost element of
+  //!   the tuple is considered the most significant. The call operator must not
   //!   modify members of the key.
   //!
-  //! @param[in] begin_bit 
-  //!   **[optional]** The least-significant bit index (inclusive) needed for 
+  //! @param[in] begin_bit
+  //!   **[optional]** The least-significant bit index (inclusive) needed for
   //!   key comparison
   //!
-  //! @param[in] end_bit 
-  //!   **[optional]** The most-significant bit index (exclusive) needed for key 
+  //! @param[in] end_bit
+  //!   **[optional]** The most-significant bit index (exclusive) needed for key
   //!   comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`)
   //!
-  //! @param[in] stream 
-  //!   **[optional]** CUDA stream to launch kernels within. 
+  //! @param[in] stream
+  //!   **[optional]** CUDA stream to launch kernels within.
   //!   Default is stream<sub>0</sub>.
   template <typename KeyT, typename NumItemsT, typename DecomposerT>
   CUB_RUNTIME_FUNCTION static                                //
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index f2c4090f8b5..1606042f184 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,17 +27,20 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceReduce provides device-wide, parallel operations for 
- *       computing a reduction across a sequence of data items residing within 
+ * @file cub::DeviceReduce provides device-wide, parallel operations for
+ *       computing a reduction across a sequence of data items residing within
  *       device-accessible memory.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <limits>
 
-#include <cub/config.cuh>
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_reduce.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
@@ -50,9 +53,9 @@ CUB_NAMESPACE_BEGIN
 //! @ingroup SingleModule
 //!
 //! @rst
-//! DeviceReduce provides device-wide, parallel operations for computing 
-//! a reduction across a sequence of data items residing within 
-//! device-accessible memory. 
+//! DeviceReduce provides device-wide, parallel operations for computing
+//! a reduction across a sequence of data items residing within
+//! device-accessible memory.
 //!
 //! .. image:: ../img/reduce_logo.png
 //!     :align: center
@@ -60,7 +63,7 @@ CUB_NAMESPACE_BEGIN
 //! Overview
 //! ====================================
 //! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
-//! (or *fold*) uses a binary combining operator to compute a single aggregate 
+//! (or *fold*) uses a binary combining operator to compute a single aggregate
 //! from a sequence of input elements.
 //!
 //! Usage Considerations
@@ -79,8 +82,8 @@ CUB_NAMESPACE_BEGIN
 //!
 //! @par
 //! The following chart illustrates DeviceReduce::ReduceByKey (summation)
-//! performance across different CUDA architectures for `fp32` values. Segments 
-//! are identified by `int32` keys, and have lengths uniformly sampled 
+//! performance across different CUDA architectures for `fp32` values. Segments
+//! are identified by `int32` keys, and have lengths uniformly sampled
 //! from `[1, 1000]`.
 //!
 //! .. image:: ../img/reduce_by_key_fp32_len_500.png
@@ -90,7 +93,7 @@ CUB_NAMESPACE_BEGIN
 struct DeviceReduce
 {
   /**
-   * @brief Computes a device-wide reduction using the specified binary 
+   * @brief Computes a device-wide reduction using the specified binary
    *        `reduction_op` functor and initial value `init`.
    *
    * @par
@@ -104,11 +107,11 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates a user-defined min-reduction of a 
+   * The code snippet below illustrates a user-defined min-reduction of a
    * device vector of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
    * // CustomMin functor
@@ -121,7 +124,7 @@ struct DeviceReduce
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;  // e.g., 7
    * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -134,7 +137,7 @@ struct DeviceReduce
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceReduce::Reduce(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items, min_op, init);
    *
    * // Allocate temporary storage
@@ -142,55 +145,55 @@ struct DeviceReduce
    *
    * // Run reduction
    * cub::DeviceReduce::Reduce(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items, min_op, init);
    *
    * // d_out <-- [0]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
-   * @tparam ReductionOpT         
-   *   **[inferred]** Binary reduction functor type having member 
+   * @tparam ReductionOpT
+   *   **[inferred]** Binary reduction functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @tparam T                    
-   *   **[inferred]** Data element type that is convertible to the `value` type 
+   * @tparam T
+   *   **[inferred]** Data element type that is convertible to the `value` type
    *   of `InputIteratorT`
    *
    * @tparam NumItemsT **[inferred]** Type of num_items
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param d_in[in] 
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param d_out[out] 
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param num_items[in] 
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param reduction_op[in] 
+   * @param[in] reduction_op
    *   Binary reduction functor
    *
-   * @param[in] init  
+   * @param[in] init
    *   Initial value of the reduction
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
@@ -267,21 +270,21 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Performance
-   * The following charts illustrate saturated sum-reduction performance across 
+   * The following charts illustrate saturated sum-reduction performance across
    * different CUDA architectures for `int32` and `int64` items, respectively.
    *
    * @image html reduce_int32.png
    * @image html reduce_int64.png
    *
    * @par Snippet
-   * The code snippet below illustrates the sum-reduction of a device vector 
+   * The code snippet below illustrates the sum-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int  num_items;      // e.g., 7
    * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -304,39 +307,39 @@ struct DeviceReduce
    * // d_out <-- [38]
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
    * @tparam NumItemsT **[inferred]** Type of num_items
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
-  template <typename InputIteratorT, 
-            typename OutputIteratorT, 
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
             typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
                                               size_t &temp_storage_bytes,
@@ -353,12 +356,12 @@ struct DeviceReduce
       cub::detail::non_void_value_t<OutputIteratorT,
                                     cub::detail::value_t<InputIteratorT>>;
 
-    using InitT = OutputT; 
+    using InitT = OutputT;
 
-    return DispatchReduce<InputIteratorT, 
-                          OutputIteratorT,  
-                          OffsetT, 
-                          cub::Sum, 
+    return DispatchReduce<InputIteratorT,
+                          OutputIteratorT,
+                          OffsetT,
+                          cub::Sum,
                           InitT>::Dispatch(d_temp_storage,
                                            temp_storage_bytes,
                                            d_in,
@@ -404,14 +407,14 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the min-reduction of a device vector of 
+   * The code snippet below illustrates the min-reduction of a device vector of
    * `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int  num_items;      // e.g., 7
    * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -434,35 +437,35 @@ struct DeviceReduce
    * // d_out <-- [0]
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
    * @tparam NumItemsT **[inferred]** Type of num_items
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
@@ -483,9 +486,9 @@ struct DeviceReduce
 
     using InitT = InputT;
 
-    return DispatchReduce<InputIteratorT,   
-                          OutputIteratorT,  
-                          OffsetT, 
+    return DispatchReduce<InputIteratorT,
+                          OutputIteratorT,
+                          OffsetT,
                           cub::Min,
                           InitT>::Dispatch(d_temp_storage,
                                            temp_storage_bytes,
@@ -493,10 +496,10 @@ struct DeviceReduce
                                            d_out,
                                            static_cast<OffsetT>(num_items),
                                            cub::Min(),
-                                           // replace with 
+                                           // replace with
                                            // std::numeric_limits<T>::max() when
                                            // C++11 support is more prevalent
-                                           Traits<InitT>::Max(), 
+                                           Traits<InitT>::Max(),
                                            stream);
   }
 
@@ -521,15 +524,15 @@ struct DeviceReduce
   }
 
   /**
-   * @brief Finds the first device-wide minimum using the less-than ('<') 
+   * @brief Finds the first device-wide minimum using the less-than ('<')
    *        operator, also returning the index of that item.
    *
    * @par
-   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>` 
+   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
    *   (assuming the value type of `d_in` is `T`)
-   *   - The minimum is written to `d_out.value` and its offset in the input 
+   *   - The minimum is written to `d_out.value` and its offset in the input
    *     array is written to `d_out.key`.
-   *   - The `{1, std::numeric_limits<T>::max()}` tuple is produced for 
+   *   - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
    *     zero-length inputs
    * - Does not support `<` operators that are non-commutative.
    * - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -541,14 +544,14 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the argmin-reduction of a device vector 
+   * The code snippet below illustrates the argmin-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int                      num_items;      // e.g., 7
    * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -572,35 +575,35 @@ struct DeviceReduce
    *
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input items 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
    *   (of some type `T`) \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Output iterator type for recording the reduced aggregate 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
    *   (having value type `cub::KeyValuePair<int, T>`) \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
    *   required allocation size is written to \p temp_storage_bytes and no work is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
-  template <typename InputIteratorT, 
+  template <typename InputIteratorT,
             typename OutputIteratorT>
   CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
                                                  size_t &temp_storage_bytes,
@@ -620,7 +623,7 @@ struct DeviceReduce
       cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
 
     using AccumT = OutputTupleT;
-    
+
     using InitT = detail::reduce::empty_problem_init_t<AccumT>;
 
     // The output value type
@@ -634,7 +637,7 @@ struct DeviceReduce
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())}; 
+    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
 
     return DispatchReduce<ArgIndexInputIteratorT,
                           OutputIteratorT,
@@ -675,7 +678,7 @@ struct DeviceReduce
    * @brief Computes a device-wide maximum using the greater-than ('>') operator.
    *
    * @par
-   * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the 
+   * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
    *   reduction.
    * - Does not support `>` operators that are non-commutative.
    * - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -687,14 +690,14 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the max-reduction of a device vector of 
+   * The code snippet below illustrates the max-reduction of a device vector of
    * `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int  num_items;      // e.g., 7
    * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -717,38 +720,38 @@ struct DeviceReduce
    * // d_out <-- [9]
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Output iterator type for recording the reduced 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced
    *   aggregate \iterator
    *
    * @tparam NumItemsT **[inferred]** Type of num_items
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
-  template <typename InputIteratorT, 
+  template <typename InputIteratorT,
             typename OutputIteratorT,
             typename NumItemsT>
   CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
@@ -766,9 +769,9 @@ struct DeviceReduce
 
     using InitT = InputT;
 
-    return DispatchReduce<InputIteratorT,   
-                          OutputIteratorT,  
-                          OffsetT, 
+    return DispatchReduce<InputIteratorT,
+                          OutputIteratorT,
+                          OffsetT,
                           cub::Max,
                           InitT>::Dispatch(d_temp_storage,
                                            temp_storage_bytes,
@@ -776,11 +779,11 @@ struct DeviceReduce
                                            d_out,
                                            static_cast<OffsetT>(num_items),
                                            cub::Max(),
-                                           // replace with 
+                                           // replace with
                                            // std::numeric_limits<T>::lowest()
-                                           // when C++11 support is more 
+                                           // when C++11 support is more
                                            // prevalent
-                                           Traits<InitT>::Lowest(), 
+                                           Traits<InitT>::Lowest(),
                                            stream);
   }
 
@@ -805,15 +808,15 @@ struct DeviceReduce
   }
 
   /**
-   * @brief Finds the first device-wide maximum using the greater-than ('>') 
+   * @brief Finds the first device-wide maximum using the greater-than ('>')
    *        operator, also returning the index of that item
    *
    * @par
-   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>` 
+   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
    *   (assuming the value type of `d_in` is `T`)
-   *   - The maximum is written to `d_out.value` and its offset in the input 
+   *   - The maximum is written to `d_out.value` and its offset in the input
    *     array is written to `d_out.key`.
-   *   - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for 
+   *   - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
    *     zero-length inputs
    * - Does not support `>` operators that are non-commutative.
    * - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -825,14 +828,14 @@ struct DeviceReduce
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the argmax-reduction of a device vector 
+   * The code snippet below illustrates the argmax-reduction of a device vector
    * of `int` data elements.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_reduce.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int                      num_items;      // e.g., 7
    * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -856,36 +859,36 @@ struct DeviceReduce
    *
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input items 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
    *   (of some type \p T) \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Output iterator type for recording the reduced aggregate 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
    *   (having value type `cub::KeyValuePair<int, T>`) \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output aggregate
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
-  template <typename InputIteratorT, 
+  template <typename InputIteratorT,
             typename OutputIteratorT>
   CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
                                                  size_t &temp_storage_bytes,
@@ -920,7 +923,7 @@ struct DeviceReduce
 
     // Initial value
     // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())}; 
+    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
 
     return DispatchReduce<ArgIndexInputIteratorT,
                           OutputIteratorT,
@@ -958,57 +961,57 @@ struct DeviceReduce
   }
 
   /**
-   * @brief Reduces segments of values, where segments are demarcated by 
+   * @brief Reduces segments of values, where segments are demarcated by
    *        corresponding runs of identical keys.
    *
    * @par
    * This operation computes segmented reductions within `d_values_in` using
-   * the specified binary `reduction_op` functor. The segments are identified 
-   * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal 
-   * ranges of consecutive, identical keys. For the *i*<sup>th</sup> run 
-   * encountered, the first key of the run and the corresponding value 
-   * aggregate of that run are written to `d_unique_out[i] and 
-   * `d_aggregates_out[i]`, respectively. The total number of runs encountered 
+   * the specified binary `reduction_op` functor. The segments are identified
+   * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
+   * ranges of consecutive, identical keys. For the *i*<sup>th</sup> run
+   * encountered, the first key of the run and the corresponding value
+   * aggregate of that run are written to `d_unique_out[i]` and
+   * `d_aggregates_out[i]`, respectively. The total number of runs encountered
    * is written to `d_num_runs_out`.
    *
    * @par
-   * - The `==` equality operator is used to determine whether keys are 
+   * - The `==` equality operator is used to determine whether keys are
    *   equivalent
    * - Provides "run-to-run" determinism for pseudo-associative reduction
    *   (e.g., addition of floating point types) on the same GPU device.
    *   However, results for pseudo-associative reduction may be inconsistent
    *   from one device to a another device of a different compute-capability
    *   because CUB can employ different tile-sizing for different architectures.
-   * - Let `out` be any of 
+   * - Let `out` be any of
    *   `[d_unique_out, d_unique_out + *d_num_runs_out)`
    *   `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)`
-   *   `d_num_runs_out`. The ranges represented by `out` shall not overlap 
+   *   `d_num_runs_out`. The ranges represented by `out` shall not overlap
    *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_values_in, d_values_in + num_items)` nor `out` in any way.
    * - @devicestorage
    *
    * @par Performance
    * The following chart illustrates reduction-by-key (sum) performance across
-   * different CUDA architectures for `fp32` and `fp64` values, respectively.  
-   * Segments are identified by `int32` keys, and have lengths uniformly 
+   * different CUDA architectures for `fp32` and `fp64` values, respectively.
+   * Segments are identified by `int32` keys, and have lengths uniformly
    * sampled from `[1, 1000]`.
    *
    * @image html reduce_by_key_fp32_len_500.png
    * @image html reduce_by_key_fp64_len_500.png
    *
    * @par
-   * The following charts are similar, but with segment lengths uniformly 
+   * The following charts are similar, but with segment lengths uniformly
    * sampled from [1,10]:
    *
    * @image html reduce_by_key_fp32_len_5.png
    * @image html reduce_by_key_fp64_len_5.png
    *
    * @par Snippet
-   * The code snippet below illustrates the segmented reduction of `int` values 
+   * The code snippet below illustrates the segmented reduction of `int` values
    * grouped by runs of associated `int` keys.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_reduce.cuh>
    *
    * // CustomMin functor
@@ -1021,7 +1024,7 @@ struct DeviceReduce
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int          num_items;          // e.g., 8
    * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
@@ -1036,8 +1039,8 @@ struct DeviceReduce
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceReduce::ReduceByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_unique_out, d_values_in, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_unique_out, d_values_in,
    *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
    *
    * // Allocate temporary storage
@@ -1045,8 +1048,8 @@ struct DeviceReduce
    *
    * // Run reduce-by-key
    * cub::DeviceReduce::ReduceByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_unique_out, d_values_in, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_unique_out, d_values_in,
    *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
    *
    * // d_unique_out      <-- [0, 2, 9, 5, 8]
@@ -1054,66 +1057,66 @@ struct DeviceReduce
    * // d_num_runs_out    <-- [5]
    * @endcode
    *
-   * @tparam KeysInputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam KeysInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   keys \iterator
    *
-   * @tparam UniqueOutputIteratorT    
-   *   **[inferred]** Random-access output iterator type for writing unique 
+   * @tparam UniqueOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing unique
    *   output keys \iterator
    *
-   * @tparam ValuesInputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam ValuesInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   values \iterator
    *
-   * @tparam AggregatesOutputIterator 
-   *   **[inferred]** Random-access output iterator type for writing output 
+   * @tparam AggregatesOutputIterator
+   *   **[inferred]** Random-access output iterator type for writing output
    *   value aggregates \iterator
    *
-   * @tparam NumRunsOutputIteratorT   
-   *   **[inferred]** Output iterator type for recording the number of runs 
+   * @tparam NumRunsOutputIteratorT
+   *   **[inferred]** Output iterator type for recording the number of runs
    *   encountered \iterator
    *
-   * @tparam ReductionOpT              
-   *   **[inferred]*8 Binary reduction functor type having member 
+   * @tparam ReductionOpT
+   *   **[inferred]** Binary reduction functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
    * @tparam NumItemsT **[inferred]** Type of num_items
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in  
+   * @param[in] d_keys_in
    *   Pointer to the input sequence of keys
    *
-   * @param[out] d_unique_out  
+   * @param[out] d_unique_out
    *   Pointer to the output sequence of unique keys (one key per run)
    *
-   * @param[in] d_values_in  
+   * @param[in] d_values_in
    *   Pointer to the input sequence of corresponding values
    *
-   * @param[out] d_aggregates_out  
-   *   Pointer to the output sequence of value aggregates 
+   * @param[out] d_aggregates_out
+   *   Pointer to the output sequence of value aggregates
    *   (one aggregate per run)
    *
-   * @param[out] d_num_runs_out  
-   *   Pointer to total number of runs encountered 
+   * @param[out] d_num_runs_out
+   *   Pointer to total number of runs encountered
    *   (i.e., the length of `d_unique_out`)
    *
-   * @param[in] reduction_op  
+   * @param[in] reduction_op
    *   Binary reduction functor
    *
-   * @param[in] num_items  
-   *   Total number of associated key+value pairs 
+   * @param[in] num_items
+   *   Total number of associated key+value pairs
    *   (i.e., the length of `d_in_keys` and `d_in_values`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeysInputIteratorT,
diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh
index a299d24fa01..8a5630a0cc9 100644
--- a/cub/cub/device/device_run_length_encode.cuh
+++ b/cub/cub/device/device_run_length_encode.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
 #include <cub/device/dispatch/dispatch_rle.cuh>
 #include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh
index 20cb8ba872f..1f8442fb1d5 100644
--- a/cub/cub/device/device_scan.cuh
+++ b/cub/cub/device/device_scan.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,14 +27,17 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceScan provides device-wide, parallel operations for 
- *       computing a prefix scan across a sequence of data items residing 
+ * @file cub::DeviceScan provides device-wide, parallel operations for
+ *       computing a prefix scan across a sequence of data items residing
  *       within device-accessible memory.
  */
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/dispatch_scan_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -44,32 +47,32 @@ CUB_NAMESPACE_BEGIN
 
 
 /**
- * @brief DeviceScan provides device-wide, parallel operations for computing a 
- *   prefix scan across a sequence of data items residing within 
+ * @brief DeviceScan provides device-wide, parallel operations for computing a
+ *   prefix scan across a sequence of data items residing within
  *   device-accessible memory. ![](device_scan.png)
  *
  * @ingroup SingleModule
  *
  * @par Overview
- * Given a sequence of input elements and a binary reduction operator, a 
- * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output 
- * sequence where each element is computed to be the reduction of the elements 
- * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan 
- * with the addition operator. The term *inclusive* indicates that the 
+ * Given a sequence of input elements and a binary reduction operator, a
+ * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output
+ * sequence where each element is computed to be the reduction of the elements
+ * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
+ * with the addition operator. The term *inclusive* indicates that the
  * *i*<sup>th</sup> output reduction incorporates the *i*<sup>th</sup> input.
- * The term *exclusive* indicates the *i*<sup>th</sup> input is not 
- * incorporated into the *i*<sup>th</sup> output reduction. When the input and 
+ * The term *exclusive* indicates the *i*<sup>th</sup> input is not
+ * incorporated into the *i*<sup>th</sup> output reduction. When the input and
  * output sequences are the same, the scan is performed in-place.
  *
  * @par
- * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our 
- * *"decoupled look-back"* algorithm for performing global prefix scan with 
- * only a single pass through the input data, as described in our 2016 technical 
- * report [1]. The central idea is to leverage a small, constant factor of 
- * redundant work in order to overlap the latencies of global prefix 
+ * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
+ * *"decoupled look-back"* algorithm for performing global prefix scan with
+ * only a single pass through the input data, as described in our 2016 technical
+ * report [1]. The central idea is to leverage a small, constant factor of
+ * redundant work in order to overlap the latencies of global prefix
  * propagation with local computation. As such, our algorithm requires only
- * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and 
- * typically proceeds at "memcpy" speeds. Our algorithm supports inplace 
+ * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and
+ * typically proceeds at "memcpy" speeds. Our algorithm supports inplace
  * operations.
  *
  * @par
@@ -82,7 +85,7 @@ CUB_NAMESPACE_BEGIN
  * @linear_performance{prefix scan}
  *
  * @par
- * The following chart illustrates DeviceScan::ExclusiveSum performance across 
+ * The following chart illustrates DeviceScan::ExclusiveSum performance across
  * different CUDA architectures for `int32` keys.
  * @plots_below
  *
@@ -97,7 +100,7 @@ struct DeviceScan
   //@{
 
   /**
-   * @brief Computes a device-wide exclusive prefix sum. The value of `0` is 
+   * @brief Computes a device-wide exclusive prefix sum. The value of `0` is
    *        applied as the initial value, and is assigned to `*d_out`.
    *
    * @par
@@ -106,13 +109,13 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The 
-   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` 
+   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The
+   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)`
    *   shall not overlap in any other way.
    * - @devicestorage
    *
    * @par Performance
-   * The following charts illustrate saturated exclusive sum performance across 
+   * The following charts illustrate saturated exclusive sum performance across
    * different CUDA architectures for `int32` and `int64` items, respectively.
    *
    * @image html scan_int32.png
@@ -125,7 +128,7 @@ struct DeviceScan
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int  num_items;      // e.g., 7
    * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -136,7 +139,7 @@ struct DeviceScan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items);
    *
    * // Allocate temporary storage
@@ -144,24 +147,24 @@ struct DeviceScan
    *
    * // Run exclusive prefix sum
    * cub::DeviceScan::ExclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items);
    *
    * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
    *
    * @endcode
    *
-   * @tparam InputIteratorT 
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   outputs \iterator
    *
    * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -177,7 +180,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of `d_in`)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. 
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -227,7 +230,7 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix sum in-place. The value of 
+   * @brief Computes a device-wide exclusive prefix sum in-place. The value of
    *        `0` is applied as the initial value, and is assigned to `*d_data`.
    *
    * @par
@@ -239,7 +242,7 @@ struct DeviceScan
    * - @devicestorage
    *
    * @par Performance
-   * The following charts illustrate saturated exclusive sum performance across 
+   * The following charts illustrate saturated exclusive sum performance across
    * different CUDA architectures for `int32` and `int64` items, respectively.
    *
    * @image html scan_int32.png
@@ -252,7 +255,7 @@ struct DeviceScan
    * @code
    * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int  num_items;      // e.g., 7
    * int  *d_data;        // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -262,7 +265,7 @@ struct DeviceScan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, num_items);
    *
    * // Allocate temporary storage
@@ -270,20 +273,20 @@ struct DeviceScan
    *
    * // Run exclusive prefix sum
    * cub::DeviceScan::ExclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, num_items);
    *
    * // d_data <-- [0, 8, 14, 21, 26, 29, 29]
    *
    * @endcode
    *
-   * @tparam IteratorT 
-   *   **[inferred]** Random-access iterator type for reading scan 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access iterator type for reading scan
    *   inputs and wrigin scan outputs
    *
    * @param[in] d_temp_storage
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -296,7 +299,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of `d_in`)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. 
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -337,8 +340,8 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix scan using the specified 
-   *        binary `scan_op` functor. The `init_value` value is applied as 
+   * @brief Computes a device-wide exclusive prefix scan using the specified
+   *        binary `scan_op` functor. The `init_value` value is applied as
    *        the initial value, and is assigned to `*d_out`.
    *
    * @par
@@ -347,13 +350,13 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The 
-   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` 
+   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The
+   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)`
    *   shall not overlap in any other way.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix min-scan of an 
+   * The code snippet below illustrates the exclusive prefix min-scan of an
    * `int` device vector
    * @par
    * @code
@@ -370,7 +373,7 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -378,12 +381,12 @@ struct DeviceScan
    * CustomMin    min_op;
    * ...
    *
-   * // Determine temporary device storage requirements for exclusive 
+   * // Determine temporary device storage requirements for exclusive
    * // prefix scan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, (int) INT_MAX, num_items);
    *
    * // Allocate temporary storage for exclusive prefix scan
@@ -391,32 +394,32 @@ struct DeviceScan
    *
    * // Run exclusive prefix min-scan
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, (int) INT_MAX, num_items);
    *
    * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
    *
    * @endcode
    *
-   * @tparam InputIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs \iterator
    *
-   * @tparam OutputIteratorT  
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   outputs \iterator
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
-   * 
-   * @tparam InitValueT       
-   *  **[inferred]** Type of the `init_value` used Binary scan functor type 
+   *
+   * @tparam InitValueT
+   *  **[inferred]** Type of the `init_value` used Binary scan functor type
    *   having member `T operator()(const T &a, const T &b)`
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -438,7 +441,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of \p d_in)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. Default is 
+   *   **[optional]** CUDA stream to launch kernels within. Default is
    *   stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -505,8 +508,8 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix scan using the specified 
-   *        binary `scan_op` functor. The `init_value` value is applied as 
+   * @brief Computes a device-wide exclusive prefix scan using the specified
+   *        binary `scan_op` functor. The `init_value` value is applied as
    *        the initial value, and is assigned to `*d_data`.
    *
    * @par
@@ -518,7 +521,7 @@ struct DeviceScan
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix min-scan of an 
+   * The code snippet below illustrates the exclusive prefix min-scan of an
    * `int` device vector
    * @par
    * @code
@@ -535,19 +538,19 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_data;        // e.g., [8, 6, 7, 5, 3, 0, 9]
    * CustomMin    min_op;
    * ...
    *
-   * // Determine temporary device storage requirements for exclusive 
+   * // Determine temporary device storage requirements for exclusive
    * // prefix scan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, min_op, (int) INT_MAX, num_items);
    *
    * // Allocate temporary storage for exclusive prefix scan
@@ -555,28 +558,28 @@ struct DeviceScan
    *
    * // Run exclusive prefix min-scan
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, min_op, (int) INT_MAX, num_items);
    *
    * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
    *
    * @endcode
    *
-   * @tparam IteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs and writing scan outputs
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
-   * 
-   * @tparam InitValueT       
-   *  **[inferred]** Type of the `init_value` used Binary scan functor type 
+   *
+   * @tparam InitValueT
+   *  **[inferred]** Type of the `init_value` used Binary scan functor type
    *   having member `T operator()(const T &a, const T &b)`
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -595,7 +598,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of \p d_in)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. Default is 
+   *   **[optional]** CUDA stream to launch kernels within. Default is
    *   stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -648,8 +651,8 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix scan using the specified 
-   *        binary `scan_op` functor. The `init_value` value is provided as 
+   * @brief Computes a device-wide exclusive prefix scan using the specified
+   *        binary `scan_op` functor. The `init_value` value is provided as
    *        a future value.
    *
    * @par
@@ -658,13 +661,13 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The 
-   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` 
+   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The
+   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)`
    *   shall not overlap in any other way.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix min-scan of an 
+   * The code snippet below illustrates the exclusive prefix min-scan of an
    * `int` device vector
    * @par
    * @code
@@ -681,7 +684,7 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -689,17 +692,17 @@ struct DeviceScan
    * int          *d_init_iter;   // e.g., INT_MAX
    * CustomMin    min_op;
    *
-   * auto future_init_value = 
+   * auto future_init_value =
    *   cub::FutureValue<InitialValueT, IterT>(d_init_iter);
    *
    * ...
    *
-   * // Determine temporary device storage requirements for exclusive 
+   * // Determine temporary device storage requirements for exclusive
    * // prefix scan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, future_init_value, num_items);
    *
    * // Allocate temporary storage for exclusive prefix scan
@@ -707,54 +710,54 @@ struct DeviceScan
    *
    * // Run exclusive prefix min-scan
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, future_init_value, num_items);
    *
    * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
    *
    * @endcode
    *
-   * @tparam InputIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs \iterator
    *
-   * @tparam OutputIteratorT  
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   outputs \iterator
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
-   * 
-   * @tparam InitValueT       
-   *  **[inferred]** Type of the `init_value` used Binary scan functor type 
+   *
+   * @tparam InitValueT
+   *  **[inferred]** Type of the `init_value` used in Binary scan functor type
    *   having member `T operator()(const T &a, const T &b)`
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of \p d_temp_storage allocation
    *
-   * @param[in] d_in 
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out 
+   * @param[out] d_out
    *   Pointer to the output sequence of data items
    *
-   * @param[in] scan_op 
+   * @param[in] scan_op
    *   Binary scan functor
    *
-   * @param[in] init_value 
+   * @param[in] init_value
    *   Initial value to seed the exclusive scan (and is assigned to `*d_out`)
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   Total number of input items (i.e., the length of `d_in`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -826,8 +829,8 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix scan using the specified 
-   *        binary `scan_op` functor. The `init_value` value is provided as 
+   * @brief Computes a device-wide exclusive prefix scan using the specified
+   *        binary `scan_op` functor. The `init_value` value is provided as
    *        a future value.
    *
    * @par
@@ -839,7 +842,7 @@ struct DeviceScan
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix min-scan of an 
+   * The code snippet below illustrates the exclusive prefix min-scan of an
    * `int` device vector
    * @par
    * @code
@@ -856,24 +859,24 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_data;        // e.g., [8, 6, 7, 5, 3, 0, 9]
    * int          *d_init_iter;   // e.g., INT_MAX
    * CustomMin    min_op;
    *
-   * auto future_init_value = 
+   * auto future_init_value =
    *   cub::FutureValue<InitialValueT, IterT>(d_init_iter);
    *
    * ...
    *
-   * // Determine temporary device storage requirements for exclusive 
+   * // Determine temporary device storage requirements for exclusive
    * // prefix scan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, min_op, future_init_value, num_items);
    *
    * // Allocate temporary storage for exclusive prefix scan
@@ -881,47 +884,47 @@ struct DeviceScan
    *
    * // Run exclusive prefix min-scan
    * cub::DeviceScan::ExclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, min_op, future_init_value, num_items);
    *
    * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
    *
    * @endcode
    *
-   * @tparam IteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs and writing scan outputs
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
-   * 
-   * @tparam InitValueT       
-   *  **[inferred]** Type of the `init_value` used Binary scan functor type 
+   *
+   * @tparam InitValueT
+   *  **[inferred]** Type of the `init_value` used in Binary scan functor type
    *   having member `T operator()(const T &a, const T &b)`
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of \p d_temp_storage allocation
    *
    * @param[in,out] d_data
    *   Pointer to the sequence of data items
    *
-   * @param[in] scan_op 
+   * @param[in] scan_op
    *   Binary scan functor
    *
-   * @param[in] init_value 
+   * @param[in] init_value
-   *   Initial value to seed the exclusive scan (and is assigned to `*d_out`)
+   *   Initial value to seed the exclusive scan (and is assigned to `*d_data`)
    *
-   * @param[in] num_items 
+   * @param[in] num_items
-   *   Total number of input items (i.e., the length of `d_in`)
+   *   Total number of input items (i.e., the length of `d_data`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -992,8 +995,8 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The 
-   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` 
+   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The
+   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)`
    *   shall not overlap in any other way.
    * - @devicestorage
    *
@@ -1005,19 +1008,19 @@ struct DeviceScan
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int  num_items;      // e.g., 7
    * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
    * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
    * ...
    *
-   * // Determine temporary device storage requirements for inclusive 
+   * // Determine temporary device storage requirements for inclusive
    * // prefix sum
    * void     *d_temp_storage = nullptr;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items);
    *
    * // Allocate temporary storage for inclusive prefix sum
@@ -1025,40 +1028,40 @@ struct DeviceScan
    *
    * // Run inclusive prefix sum
    * cub::DeviceScan::InclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, num_items);
    *
    * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
    *
    * @endcode
    *
-   * @tparam InputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs \iterator
    *
-   * @tparam OutputIteratorT    
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   outputs \iterator
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Random-access iterator to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Random-access iterator to the output sequence of data items
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., the length of `d_in`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1129,18 +1132,18 @@ struct DeviceScan
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int  num_items;      // e.g., 7
    * int  *d_data;        // e.g., [8, 6, 7, 5, 3, 0, 9]
    * ...
    *
-   * // Determine temporary device storage requirements for inclusive 
+   * // Determine temporary device storage requirements for inclusive
    * // prefix sum
    * void     *d_temp_storage = nullptr;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, num_items);
    *
    * // Allocate temporary storage for inclusive prefix sum
@@ -1148,33 +1151,33 @@ struct DeviceScan
    *
    * // Run inclusive prefix sum
    * cub::DeviceScan::InclusiveSum(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, num_items);
    *
    * // d_data <-- [8, 14, 21, 26, 29, 29, 38]
    *
    * @endcode
    *
-   * @tparam IteratorT     
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs and writing scan outputs
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
    * @param[in,out] d_data
    *   Random-access iterator to the sequence of data items
    *
-   * @param[in] num_items  
+   * @param[in] num_items
-   *   Total number of input items (i.e., the length of `d_in`)
+   *   Total number of input items (i.e., the length of `d_data`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1215,7 +1218,7 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide inclusive prefix scan using the specified 
+   * @brief Computes a device-wide inclusive prefix scan using the specified
    *        binary `scan_op` functor.
    *
    * @par
@@ -1224,13 +1227,13 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The 
-   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` 
+   * - When `d_in` and `d_out` are equal, the scan is performed in-place. The
+   *   range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)`
    *   shall not overlap in any other way.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the inclusive prefix min-scan of an 
+   * The code snippet below illustrates the inclusive prefix min-scan of an
    * `int` device vector.
    *
    * @par
@@ -1248,7 +1251,7 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
@@ -1256,12 +1259,12 @@ struct DeviceScan
    * CustomMin    min_op;
    * ...
    *
-   * // Determine temporary device storage requirements for inclusive 
+   * // Determine temporary device storage requirements for inclusive
    * // prefix scan
    * void *d_temp_storage = nullptr;
    * size_t temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, num_items);
    *
    * // Allocate temporary storage for inclusive prefix scan
@@ -1269,28 +1272,28 @@ struct DeviceScan
    *
    * // Run inclusive prefix min-scan
    * cub::DeviceScan::InclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, min_op, num_items);
    *
    * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
    *
    * @endcode
    *
-   * @tparam InputIteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs \iterator
    *
-   * @tparam OutputIteratorT  
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   outputs \iterator
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @param[in]  
-   *   d_temp_storage Device-accessible allocation of temporary storage. 
-   *   When `nullptr`, the required allocation size is written to 
+   * @param[in]
+   *   d_temp_storage Device-accessible allocation of temporary storage.
+   *   When `nullptr`, the required allocation size is written to
    *   `temp_storage_bytes` and no work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -1309,7 +1312,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of `d_in`)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. 
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1366,7 +1369,7 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide inclusive prefix scan using the specified 
+   * @brief Computes a device-wide inclusive prefix scan using the specified
    *        binary `scan_op` functor.
    *
    * @par
@@ -1378,7 +1381,7 @@ struct DeviceScan
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the inclusive prefix min-scan of an 
+   * The code snippet below illustrates the inclusive prefix min-scan of an
    * `int` device vector.
    *
    * @par
@@ -1396,19 +1399,19 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_data;        // e.g., [8, 6, 7, 5, 3, 0, 9]
    * CustomMin    min_op;
    * ...
    *
-   * // Determine temporary device storage requirements for inclusive 
+   * // Determine temporary device storage requirements for inclusive
    * // prefix scan
    * void *d_temp_storage = nullptr;
    * size_t temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, min_op, num_items);
    *
    * // Allocate temporary storage for inclusive prefix scan
@@ -1416,24 +1419,24 @@ struct DeviceScan
    *
    * // Run inclusive prefix min-scan
    * cub::DeviceScan::InclusiveScan(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
-   *   d_in, d_out, min_op, num_items);
+   *   d_data, min_op, num_items);
    *
    * // d_data <-- [8, 6, 6, 5, 3, 0, 0]
    *
    * @endcode
    *
-   * @tparam IteratorT   
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   inputs and writing scan outputs
    *
-   * @tparam ScanOp           
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @param[in]  
-   *   d_temp_storage Device-accessible allocation of temporary storage. 
-   *   When `nullptr`, the required allocation size is written to 
+   * @param[in]
+   *   d_temp_storage Device-accessible allocation of temporary storage.
+   *   When `nullptr`, the required allocation size is written to
    *   `temp_storage_bytes` and no work is done.
    *
    * @param[in,out] temp_storage_bytes
@@ -1449,7 +1452,7 @@ struct DeviceScan
    *   Total number of input items (i.e., the length of `d_in`)
    *
    * @param[in] stream
-   *   **[optional]** CUDA stream to launch kernels within. 
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1495,8 +1498,8 @@ struct DeviceScan
 
   /**
    * @brief Computes a device-wide exclusive prefix sum-by-key with key equality
-   *        defined by `equality_op`. The value of `0` is applied as the initial 
-   *        value, and is assigned to the beginning of each segment in 
+   *        defined by `equality_op`. The value of `0` is applied as the initial
+   *        value, and is assigned to the beginning of each segment in
    *        `d_values_out`.
    *
    * @par
@@ -1505,22 +1508,22 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - `d_keys_in` may equal `d_values_out` but the range 
-   *   `[d_keys_in, d_keys_in + num_items)` and the range 
+   * - `d_keys_in` may equal `d_values_out` but the range
+   *   `[d_keys_in, d_keys_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
-   * - `d_values_in` may equal `d_values_out` but the range 
-   *   `[d_values_in, d_values_in + num_items)` and the range 
+   * - `d_values_in` may equal `d_values_out` but the range
+   *   `[d_values_in, d_values_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix sum-by-key of an 
+   * The code snippet below illustrates the exclusive prefix sum-by-key of an
    * `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int num_items;      // e.g., 7
    * int *d_keys_in;     // e.g., [0, 0, 1, 1, 1, 2, 2]
@@ -1532,7 +1535,7 @@ struct DeviceScan
    * void     *d_temp_storage = nullptr;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveSumByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, num_items);
    *
    * // Allocate temporary storage
@@ -1540,57 +1543,57 @@ struct DeviceScan
    *
    * // Run exclusive prefix sum
    * cub::DeviceScan::ExclusiveSumByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, num_items);
    *
    * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
    *
    * @endcode
    *
-   * @tparam KeysInputIteratorT      
-   *   **[inferred]** Random-access input iterator type for reading scan keys 
+   * @tparam KeysInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan keys
    *   inputs \iterator
-   * 
-   * @tparam ValuesInputIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   *
+   * @tparam ValuesInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   values inputs \iterator
    *
-   * @tparam ValuesOutputIteratorT   
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam ValuesOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   values outputs \iterator
    *
-   * @tparam EqualityOpT             
-   *   **[inferred]** Functor type having member 
-   *   `T operator()(const T &a, const T &b)` for binary operations that 
+   * @tparam EqualityOpT
+   *   **[inferred]** Functor type having member
+   *   `bool operator()(const T &a, const T &b)` for binary operations that
    *   defines the equality of keys
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no
    *   work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in 
+   * @param[in] d_keys_in
    *   Random-access input iterator to the input sequence of key items
    *
-   * @param[in] d_values_in 
+   * @param[in] d_values_in
    *   Random-access input iterator to the input sequence of value items
    *
-   * @param[out] d_values_out 
+   * @param[out] d_values_out
    *   Random-access output iterator to the output sequence of value items
    *
-   * @param[in] num_items 
-   *   Total number of input items (i.e., the length of `d_keys_in` and 
+   * @param[in] num_items
+   *   Total number of input items (i.e., the length of `d_keys_in` and
    *   `d_values_in`)
    *
-   * @param[in] equality_op 
-   *   Binary functor that defines the equality of keys. 
+   * @param[in] equality_op
+   *   Binary functor that defines the equality of keys.
    *   Default is cub::Equality().
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1614,7 +1617,7 @@ struct DeviceScan
     using InitT = cub::detail::value_t<ValuesInputIteratorT>;
 
     // Initial value
-    InitT init_value{}; 
+    InitT init_value{};
 
     return DispatchScanByKey<KeysInputIteratorT,
                              ValuesInputIteratorT,
@@ -1666,10 +1669,10 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide exclusive prefix scan-by-key using the 
-   *        specified binary `scan_op` functor. The key equality is defined by 
-   *        `equality_op`.  The `init_value` value is applied as the initial 
-   *        value, and is assigned to the beginning of each segment in 
+   * @brief Computes a device-wide exclusive prefix scan-by-key using the
+   *        specified binary `scan_op` functor. The key equality is defined by
+   *        `equality_op`.  The `init_value` value is applied as the initial
+   *        value, and is assigned to the beginning of each segment in
    *        `d_values_out`.
    *
    * @par
@@ -1678,16 +1681,16 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - `d_keys_in` may equal `d_values_out` but the range 
-   *   `[d_keys_in, d_keys_in + num_items)` and the range 
+   * - `d_keys_in` may equal `d_values_out` but the range
+   *   `[d_keys_in, d_keys_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
-   * - `d_values_in` may equal `d_values_out` but the range 
-   *   `[d_values_in, d_values_in + num_items)` and the range 
+   * - `d_values_in` may equal `d_values_out` but the range
+   *   `[d_values_in, d_values_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the exclusive prefix min-scan-by-key of 
+   * The code snippet below illustrates the exclusive prefix min-scan-by-key of
    * an `int` device vector
    * @par
    * @code
@@ -1714,7 +1717,7 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_keys_in;     // e.g., [0, 0, 1, 1, 1, 2, 2]
@@ -1724,13 +1727,13 @@ struct DeviceScan
    * CustomEqual  equality_op;
    * ...
    *
-   * // Determine temporary device storage requirements for exclusive 
+   * // Determine temporary device storage requirements for exclusive
    * // prefix scan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::ExclusiveScanByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_values_in, d_values_out, min_op, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_values_in, d_values_out, min_op,
    *   (int) INT_MAX, num_items, equality_op);
    *
    * // Allocate temporary storage for exclusive prefix scan
@@ -1738,73 +1741,73 @@ struct DeviceScan
    *
    * // Run exclusive prefix min-scan
    * cub::DeviceScan::ExclusiveScanByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_values_in, d_values_out, min_op, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_values_in, d_values_out, min_op,
    *   (int) INT_MAX, num_items, equality_op);
    *
    * // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
    *
    * @endcode
    *
-   * @tparam KeysInputIteratorT      
-   *   **[inferred]** Random-access input iterator type for reading scan keys 
+   * @tparam KeysInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan keys
    *   inputs \iterator
    *
-   * @tparam ValuesInputIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading scan values 
+   * @tparam ValuesInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan values
    *   inputs \iterator
    *
-   * @tparam ValuesOutputIteratorT   
-   *   **[inferred]** Random-access output iterator type for writing scan values 
+   * @tparam ValuesOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan values
    *   outputs \iterator
    *
-   * @tparam ScanOp                  
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @tparam InitValueT              
-   *   **[inferred]** Type of the `init_value` value used in Binary scan 
+   * @tparam InitValueT
+   *   **[inferred]** Type of the `init_value` value used in Binary scan
    *   functor type having member `T operator()(const T &a, const T &b)`
    *
-   * @tparam EqualityOpT             
-   *   **[inferred]** Functor type having member 
-   *   `T operator()(const T &a, const T &b)` for binary operations that 
+   * @tparam EqualityOpT
+   *   **[inferred]** Functor type having member
+   *   `bool operator()(const T &a, const T &b)` for binary operations that
    *   defines the equality of keys
    *
-   *  @param[in] d_temp_storage 
-   *    Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *    required allocation size is written to `temp_storage_bytes` and no work 
+   *  @param[in] d_temp_storage
+   *    Device-accessible allocation of temporary storage. When `nullptr`, the
+   *    required allocation size is written to `temp_storage_bytes` and no work
    *    is done.
    *
-   *  @param[in,out] temp_storage_bytes 
+   *  @param[in,out] temp_storage_bytes
    *    Reference to size in bytes of `d_temp_storage` allocation
    *
-   *  @param[in] d_keys_in 
+   *  @param[in] d_keys_in
    *    Random-access input iterator to the input sequence of key items
    *
-   *  @param[in] d_values_in 
+   *  @param[in] d_values_in
    *    Random-access input iterator to the input sequence of value items
    *
-   *  @param[out] d_values_out 
+   *  @param[out] d_values_out
    *    Random-access output iterator to the output sequence of value items
    *
-   *  @param[in] scan_op 
+   *  @param[in] scan_op
    *    Binary scan functor
    *
-   *  @param[in] init_value 
-   *    Initial value to seed the exclusive scan (and is assigned to the 
+   *  @param[in] init_value
+   *    Initial value to seed the exclusive scan (and is assigned to the
    *    beginning of each segment in `d_values_out`)
    *
-   *  @param[in] num_items 
-   *    Total number of input items (i.e., the length of `d_keys_in` and 
+   *  @param[in] num_items
+   *    Total number of input items (i.e., the length of `d_keys_in` and
    *    `d_values_in`)
    *
-   *  @param[in] equality_op 
-   *    Binary functor that defines the equality of keys. 
+   *  @param[in] equality_op
+   *    Binary functor that defines the equality of keys.
    *    Default is cub::Equality().
    *
-   *  @param[in] stream   
-   *    **[optional]** CUDA stream to launch kernels within.  
+   *  @param[in] stream
+   *    **[optional]** CUDA stream to launch kernels within.
    *    Default is stream<sub>0</sub>.
    *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
@@ -1888,7 +1891,7 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide inclusive prefix sum-by-key with key 
+   * @brief Computes a device-wide inclusive prefix sum-by-key with key
    *        equality defined by `equality_op`.
    *
    * @par
@@ -1897,22 +1900,22 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - `d_keys_in` may equal `d_values_out` but the range 
-   *   `[d_keys_in, d_keys_in + num_items)` and the range 
+   * - `d_keys_in` may equal `d_values_out` but the range
+   *   `[d_keys_in, d_keys_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
-   * - `d_values_in` may equal `d_values_out` but the range 
-   *   `[d_values_in, d_values_in + num_items)` and the range 
+   * - `d_values_in` may equal `d_values_out` but the range
+   *   `[d_values_in, d_values_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the inclusive prefix sum-by-key of an 
+   * The code snippet below illustrates the inclusive prefix sum-by-key of an
    * `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int num_items;      // e.g., 7
    * int *d_keys_in;     // e.g., [0, 0, 1, 1, 1, 2, 2]
@@ -1924,7 +1927,7 @@ struct DeviceScan
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveSumByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, num_items);
    *
    * // Allocate temporary storage for inclusive prefix sum
@@ -1932,59 +1935,59 @@ struct DeviceScan
    *
    * // Run inclusive prefix sum
    * cub::DeviceScan::InclusiveSumByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, num_items);
    *
    * // d_out <-- [8, 14, 7, 12, 15, 0, 9]
    *
    * @endcode
    *
-   * @tparam KeysInputIteratorT      
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam KeysInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   keys inputs \iterator
-   * 
-   * @tparam ValuesInputIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   *
+   * @tparam ValuesInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   values inputs \iterator
-   * 
-   * @tparam ValuesOutputIteratorT   
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   *
+   * @tparam ValuesOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   values outputs \iterator
-   * 
-   * @tparam EqualityOpT             
-   *   **[inferred]** Functor type having member 
-   *   `T operator()(const T &a, const T &b)` for binary operations that 
+   *
+   * @tparam EqualityOpT
+   *   **[inferred]** Functor type having member
+   *   `bool operator()(const T &a, const T &b)` for the binary operation that
    *   defines the equality of keys
    *
-   *  @param[in] d_temp_storage 
-   *    Device-accessible allocation of temporary storage.  
-   *    When `nullptr`, the required allocation size is written to 
+   *  @param[in] d_temp_storage
+   *    Device-accessible allocation of temporary storage.
+   *    When `nullptr`, the required allocation size is written to
    *    `temp_storage_bytes` and no work is done.
-   * 
-   *  @param[in,out] temp_storage_bytes 
+   *
+   *  @param[in,out] temp_storage_bytes
    *    Reference to size in bytes of `d_temp_storage` allocation
-   * 
-   *  @param[in] d_keys_in 
+   *
+   *  @param[in] d_keys_in
    *    Random-access input iterator to the input sequence of key items
-   * 
-   *  @param[in] d_values_in 
+   *
+   *  @param[in] d_values_in
    *    Random-access input iterator to the input sequence of value items
-   * 
-   *  @param[out] d_values_out 
+   *
+   *  @param[out] d_values_out
    *    Random-access output iterator to the output sequence of value items
-   * 
-   *  @param[in] num_items 
-   *    Total number of input items (i.e., the length of `d_keys_in` and 
+   *
+   *  @param[in] num_items
+   *    Total number of input items (i.e., the length of `d_keys_in` and
    *    `d_values_in`)
-   * 
-   *  @param[in] equality_op 
-   *    Binary functor that defines the equality of keys. 
+   *
+   *  @param[in] equality_op
+   *    Binary functor that defines the equality of keys.
    *    Default is cub::Equality().
-   * 
-   *  @param[in] stream 
-   *    **[optional]** CUDA stream to launch kernels within.  
+   *
+   *  @param[in] stream
+   *    **[optional]** CUDA stream to launch kernels within.
    *    Default is stream<sub>0</sub>.
-   * 
+   *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
    */
   template <typename KeysInputIteratorT,
@@ -2054,8 +2057,8 @@ struct DeviceScan
   }
 
   /**
-   * @brief Computes a device-wide inclusive prefix scan-by-key using the 
-   *        specified binary `scan_op` functor. The key equality is defined 
+   * @brief Computes a device-wide inclusive prefix scan-by-key using the
+   *        specified binary `scan_op` functor. The key equality is defined
    *        by `equality_op`.
    *
    * @par
@@ -2064,16 +2067,16 @@ struct DeviceScan
    *   addition of floating-point types). Results for pseudo-associative
    *   operators may vary from run to run. Additional details can be found in
    *   the [decoupled look-back] description.
-   * - `d_keys_in` may equal `d_values_out` but the range 
-   *   `[d_keys_in, d_keys_in + num_items)` and the range 
+   * - `d_keys_in` may equal `d_values_out` but the range
+   *   `[d_keys_in, d_keys_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
-   * - `d_values_in` may equal `d_values_out` but the range 
-   *   `[d_values_in, d_values_in + num_items)` and the range 
+   * - `d_values_in` may equal `d_values_out` but the range
+   *   `[d_values_in, d_values_in + num_items)` and the range
    *   `[d_values_out, d_values_out + num_items)` shall not overlap otherwise.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the inclusive prefix min-scan-by-key 
+   * The code snippet below illustrates the inclusive prefix min-scan-by-key
    * of an `int` device vector.
    * @par
    * @code
@@ -2100,7 +2103,7 @@ struct DeviceScan
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // input and output
    * int          num_items;      // e.g., 7
    * int          *d_keys_in;     // e.g., [0, 0, 1, 1, 1, 2, 2]
@@ -2114,7 +2117,7 @@ struct DeviceScan
    * void *d_temp_storage = NULL;
    * size_t temp_storage_bytes = 0;
    * cub::DeviceScan::InclusiveScanByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
    *
    * // Allocate temporary storage for inclusive prefix scan
@@ -2122,66 +2125,66 @@ struct DeviceScan
    *
    * // Run inclusive prefix min-scan
    * cub::DeviceScan::InclusiveScanByKey(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
    *
    * // d_out <-- [8, 6, 7, 5, 3, 0, 0]
    *
    * @endcode
    *
-   * @tparam KeysInputIteratorT      
-   *   **[inferred]** Random-access input iterator type for reading scan keys 
+   * @tparam KeysInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan keys
    *   inputs \iterator
    *
-   * @tparam ValuesInputIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading scan 
+   * @tparam ValuesInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading scan
    *   values inputs \iterator
    *
-   * @tparam ValuesOutputIteratorT   
-   *   **[inferred]** Random-access output iterator type for writing scan 
+   * @tparam ValuesOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing scan
    *   values outputs \iterator
    *
-   * @tparam ScanOp                  
-   *   **[inferred]** Binary scan functor type having member 
+   * @tparam ScanOp
+   *   **[inferred]** Binary scan functor type having member
    *   `T operator()(const T &a, const T &b)`
    *
-   * @tparam EqualityOpT             
-   *   **[inferred]** Functor type having member 
-   *   `T operator()(const T &a, const T &b)` for binary operations that 
+   * @tparam EqualityOpT
+   *   **[inferred]** Functor type having member
+   *   `bool operator()(const T &a, const T &b)` for the binary operation that
    *   defines the equality of keys
    *
-   *  @param[in] d_temp_storage 
-   *    Device-accessible allocation of temporary storage.  
-   *    When `nullptr`, the required allocation size is written to 
+   *  @param[in] d_temp_storage
+   *    Device-accessible allocation of temporary storage.
+   *    When `nullptr`, the required allocation size is written to
    *    `temp_storage_bytes` and no work is done.
-   * 
-   *  @param[in,out] temp_storage_bytes 
+   *
+   *  @param[in,out] temp_storage_bytes
    *    Reference to size in bytes of `d_temp_storage` allocation
-   * 
-   *  @param[in] d_keys_in 
+   *
+   *  @param[in] d_keys_in
    *    Random-access input iterator to the input sequence of key items
-   * 
-   *  @param[in] d_values_in 
+   *
+   *  @param[in] d_values_in
    *    Random-access input iterator to the input sequence of value items
-   * 
-   *  @param[out] d_values_out 
+   *
+   *  @param[out] d_values_out
    *    Random-access output iterator to the output sequence of value items
-   * 
-   *  @param[in] scan_op 
+   *
+   *  @param[in] scan_op
    *    Binary scan functor
-   * 
-   *  @param[in] num_items 
-   *    Total number of input items (i.e., the length of `d_keys_in` and 
+   *
+   *  @param[in] num_items
+   *    Total number of input items (i.e., the length of `d_keys_in` and
    *    `d_values_in`)
-   * 
-   *  @param[in] equality_op 
-   *    Binary functor that defines the equality of keys. 
+   *
+   *  @param[in] equality_op
+   *    Binary functor that defines the equality of keys.
    *    Default is cub::Equality().
-   * 
-   *  @param[in] stream 
-   *    **[optional]** CUDA stream to launch kernels within.  
+   *
+   *  @param[in] stream
+   *    **[optional]** CUDA stream to launch kernels within.
    *    Default is stream<sub>0</sub>.
-   * 
+   *
    * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
    */
   template <typename KeysInputIteratorT,
diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh
index c8f69223e94..425dece845f 100644
--- a/cub/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/cub/device/device_segmented_radix_sort.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,18 +27,21 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceSegmentedRadixSort provides device-wide, parallel 
- *       operations for computing a batched radix sort across multiple, 
- *       non-overlapping sequences of data items residing within 
+ * @file cub::DeviceSegmentedRadixSort provides device-wide, parallel
+ *       operations for computing a batched radix sort across multiple,
+ *       non-overlapping sequences of data items residing within
  *       device-accessible memory.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdio.h>
 #include <iterator>
 
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_radix_sort.cuh>
 #include <cub/util_deprecated.cuh>
 
@@ -46,27 +49,27 @@ CUB_NAMESPACE_BEGIN
 
 
 /**
- * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations 
- *        for computing a batched radix sort across multiple, non-overlapping 
- *        sequences of data items residing within device-accessible memory. 
+ * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations
+ *        for computing a batched radix sort across multiple, non-overlapping
+ *        sequences of data items residing within device-accessible memory.
  *        ![](segmented_sorting_logo.png)
  * @ingroup SegmentedModule
  *
  * @par Overview
- * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) 
- * arranges items into ascending (or descending) order. The algorithm relies 
- * upon a positional representation for keys, i.e., each key is comprised of an 
- * ordered sequence of symbols (e.g., digits, characters, etc.) specified from 
- * least-significant to most-significant.  For a given input sequence of keys 
- * and a set of rules specifying a total ordering of the symbolic alphabet, the 
+ * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort)
+ * arranges items into ascending (or descending) order. The algorithm relies
+ * upon a positional representation for keys, i.e., each key is comprised of an
+ * ordered sequence of symbols (e.g., digits, characters, etc.) specified from
+ * least-significant to most-significant.  For a given input sequence of keys
+ * and a set of rules specifying a total ordering of the symbolic alphabet, the
  * radix sorting method produces a lexicographic ordering of those keys.
  *
  * @par See Also
  * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See
  * that algorithm's documentation for more information.
  *
- * @par Segments are not required to be contiguous. Any element of input(s) or 
- * output(s) outside the specified segments will not be accessed nor modified.  
+ * @par Segments are not required to be contiguous. Any element of input(s) or
+ * output(s) outside the specified segments will not be accessed nor modified.
  *
  * @par Usage Considerations
  * @cdp_class{DeviceSegmentedRadixSort}
@@ -80,7 +83,7 @@ struct DeviceSegmentedRadixSort
   //@{
 
   /**
-   * @brief Sorts segments of key-value pairs into ascending order. 
+   * @brief Sorts segments of key-value pairs into ascending order.
    *        (`~2N` auxiliary storage required)
    *
    * @par
@@ -89,31 +92,31 @@ struct DeviceSegmentedRadixSort
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see
    *   the sorting interface using DoubleBuffer wrappers below.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
-   * (with one zero-length segment) of `int` keys with associated vector of 
+   * The code snippet below illustrates the batched sorting of three segments
+   * (with one zero-length segment) of `int` keys with associated vector of
    * `int` values.
    * @par
    * @code
-   * #include <cub/cub.cuh>  
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -145,72 +148,72 @@ struct DeviceSegmentedRadixSort
    * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
    * @endcode
    *
-   * @tparam KeyT                  
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam ValueT                
+   * @tparam ValueT
    *   **[inferred]** Value type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in 
+   * @param[in] d_keys_in
    *   Device-accessible pointer to the input data of key data to sort
    *
-   * @param[out] d_keys_out 
+   * @param[out] d_keys_out
    *   Device-accessible pointer to the sorted output sequence of key data
    *
-   * @param[in] d_values_in 
-   *   Device-accessible pointer to the corresponding input sequence of 
+   * @param[in] d_values_in
+   *   Device-accessible pointer to the corresponding input sequence of
    *   associated value items
    *
-   * @param[out] d_values_out 
-   *   Device-accessible pointer to the correspondingly-reordered output 
+   * @param[out] d_values_out
+   *   Device-accessible pointer to the correspondingly-reordered output
    *   sequence of associated value items
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments 
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets 
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets 
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. If 
-   *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. If
+   *   `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
    *   considered empty.
    *
-   * @param[in] begin_bit 
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit 
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream 
+   * @param[in] stream
    *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
@@ -299,7 +302,7 @@ struct DeviceSegmentedRadixSort
   }
 
   /**
-   * @brief Sorts segments of key-value pairs into ascending order. 
+   * @brief Sorts segments of key-value pairs into ascending order.
    *        (`~N` auxiliary storage required)
    *
    * @par
@@ -307,42 +310,42 @@ struct DeviceSegmentedRadixSort
    *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
    *   structure that indicates which of the two buffers is "current" (and thus
    *   contains the input data to be sorted).
-   * - The contents of both buffers within each pair may be altered by the 
+   * - The contents of both buffers within each pair may be altered by the
    *   sorting operation.
-   * - Upon completion, the sorting operation will update the "current" 
-   *   indicator within each DoubleBuffer wrapper to reference which of the two 
-   *   buffers now contains the sorted output sequence (a function of the number 
+   * - Upon completion, the sorting operation will update the "current"
+   *   indicator within each DoubleBuffer wrapper to reference which of the two
+   *   buffers now contains the sorted output sequence (a function of the number
    *   of key bits specified and the targeted device architecture).
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and yield 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and yield
    *   a corresponding performance improvement.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed nor modified.
    * - @devicestorageP
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
-   * (with one zero-length segment) of `int` keys with associated vector of 
+   * The code snippet below illustrates the batched sorting of three segments
+   * (with one zero-length segment) of `int` keys with associated vector of
    * `int` values.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -377,69 +380,69 @@ struct DeviceSegmentedRadixSort
    *
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam ValueT           
+   * @tparam ValueT
    *   **[inferred]** Value type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in,out] d_keys 
-   *   Reference to the double-buffer of keys whose "current" device-accessible 
-   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   * @param[in,out] d_keys
+   *   Reference to the double-buffer of keys whose "current" device-accessible
+   *   buffer contains the unsorted input keys and, upon return, is updated to
    *   point to the sorted output keys
    *
-   * @param[in,out] d_values 
-   *   Double-buffer of values whose "current" device-accessible buffer 
-   *   contains the unsorted input values and, upon return, is updated to point 
+   * @param[in,out] d_values
+   *   Double-buffer of values whose "current" device-accessible buffer
+   *   contains the unsorted input values and, upon return, is updated to point
    *   to the sorted output values
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments 
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets 
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets 
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup> segment is
    *   considered empty.
    *
-   * @param[in] begin_bit 
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit 
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -517,40 +520,40 @@ struct DeviceSegmentedRadixSort
   }
 
   /**
-   * @brief Sorts segments of key-value pairs into descending order. 
+   * @brief Sorts segments of key-value pairs into descending order.
    *        (`~2N` auxiliary storage required).
    *
    * @par
    * - The contents of the input data are not altered by the sorting operation
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see
    *   the sorting interface using DoubleBuffer wrappers below.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
-   * (with one zero-length segment) of `int` keys with associated vector of 
+   * The code snippet below illustrates the batched sorting of three segments
+   * (with one zero-length segment) of `int` keys with associated vector of
    * `int` values.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -582,73 +585,73 @@ struct DeviceSegmentedRadixSort
    * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam ValueT           
+   * @tparam ValueT
    *   **[inferred]** Value type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in 
+   * @param[in] d_keys_in
    *   Device-accessible pointer to the input data of key data to sort
    *
-   * @param[out] d_keys_out 
+   * @param[out] d_keys_out
    *   Device-accessible pointer to the sorted output sequence of key data
    *
-   * @param[in] d_values_in 
-   *   Device-accessible pointer to the corresponding input sequence of 
+   * @param[in] d_values_in
+   *   Device-accessible pointer to the corresponding input sequence of
    *   associated value items
    *
-   * @param[out] d_values_out 
-   *   Device-accessible pointer to the correspondingly-reordered output 
+   * @param[out] d_values_out
+   *   Device-accessible pointer to the correspondingly-reordered output
    *   sequence of associated value items
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments 
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets 
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets 
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup>
    *   is considered empty.
    *
-   * @param[in] begin_bit 
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit 
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -738,7 +741,7 @@ struct DeviceSegmentedRadixSort
   }
 
   /**
-   * @brief Sorts segments of key-value pairs into descending order. 
+   * @brief Sorts segments of key-value pairs into descending order.
    *        (`~N` auxiliary storage required).
    *
    * @par
@@ -746,43 +749,43 @@ struct DeviceSegmentedRadixSort
    *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
    *   structure that indicates which of the two buffers is "current" (and thus
    *   contains the input data to be sorted).
-   * - The contents of both buffers within each pair may be altered by the 
+   * - The contents of both buffers within each pair may be altered by the
    *   sorting operation.
-   * - Upon completion, the sorting operation will update the "current" 
-   *   indicator within each DoubleBuffer wrapper to reference which of the two 
-   *   buffers now contains the sorted output sequence (a function of the number 
+   * - Upon completion, the sorting operation will update the "current"
+   *   indicator within each DoubleBuffer wrapper to reference which of the two
+   *   buffers now contains the sorted output sequence (a function of the number
    *   of key bits specified and the targeted device architecture).
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
-   *   not to be modified. 
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed
+   *   nor modified.
    * - @devicestorageP
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
-   * (with one zero-length segment) of `int` keys with associated vector of 
+   * The code snippet below illustrates the batched sorting of three segments
+   * (with one zero-length segment) of `int` keys with associated vector of
    * `int` values.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -816,69 +819,69 @@ struct DeviceSegmentedRadixSort
    * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam ValueT           
+   * @tparam ValueT
    *   **[inferred]** Value type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage 
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in,out] d_keys 
-   *   Reference to the double-buffer of keys whose "current" device-accessible 
-   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   * @param[in,out] d_keys
+   *   Reference to the double-buffer of keys whose "current" device-accessible
+   *   buffer contains the unsorted input keys and, upon return, is updated to
    *   point to the sorted output keys
    *
-   * @param[in,out] d_values 
-   *   Double-buffer of values whose "current" device-accessible buffer 
-   *   contains the unsorted input values and, upon return, is updated to point 
+   * @param[in,out] d_values
+   *   Double-buffer of values whose "current" device-accessible buffer
+   *   contains the unsorted input values and, upon return, is updated to point
    *   to the sorted output values
    *
-   * @param[in] num_items 
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments 
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets 
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets 
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup>
    *   is considered empty.
    *
-   * @param[in] begin_bit 
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit 
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream 
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -965,38 +968,38 @@ struct DeviceSegmentedRadixSort
 
 
   /**
-   * @brief Sorts segments of keys into ascending order. 
+   * @brief Sorts segments of keys into ascending order.
    *        (`~2N` auxiliary storage required)
    *
    * @par
    * - The contents of the input data are not altered by the sorting operation
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see
    *   the sorting interface using DoubleBuffer wrappers below.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
+   * The code snippet below illustrates the batched sorting of three segments
    * (with one zero-length segment) of `int` keys.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -1008,7 +1011,7 @@ struct DeviceSegmentedRadixSort
    * // Determine temporary device storage requirements
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
-   * cub::DeviceSegmentedRadixSort::SortKeys( 
+   * cub::DeviceSegmentedRadixSort::SortKeys(
    *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
    *     num_items, num_segments, d_offsets, d_offsets + 1);
    *
@@ -1016,7 +1019,7 @@ struct DeviceSegmentedRadixSort
    * cudaMalloc(&d_temp_storage, temp_storage_bytes);
    *
    * // Run sorting operation
-   * cub::DeviceSegmentedRadixSort::SortKeys( 
+   * cub::DeviceSegmentedRadixSort::SortKeys(
    *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
    *     num_items, num_segments, d_offsets, d_offsets + 1);
    *
@@ -1024,60 +1027,60 @@ struct DeviceSegmentedRadixSort
    *
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage 
+   * @param[in] d_temp_storage
    *   Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    *
-   * @param[in,out] temp_storage_bytes 
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of \p d_temp_storage allocation
    *
-   * @param[in] d_keys_in  
+   * @param[in] d_keys_in
    *   Device-accessible pointer to the input data of key data to sort
    *
-   * @param[out] d_keys_out  
+   * @param[out] d_keys_out
    *   Device-accessible pointer to the sorted output sequence of key data
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] begin_bit  
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit  
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -1160,41 +1163,41 @@ struct DeviceSegmentedRadixSort
    * @brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
    *
    * @par
-   * - The sorting operation is given a pair of key buffers managed by a 
+   * - The sorting operation is given a pair of key buffers managed by a
    *   DoubleBuffer structure that indicates which of the two buffers is
    *   "current" (and thus contains the input data to be sorted).
    * - The contents of both buffers may be altered by the sorting operation.
-   * - Upon completion, the sorting operation will update the "current" 
-   *   indicator within the DoubleBuffer wrapper to reference which of the two 
-   *   buffers now contains the sorted output sequence (a function of the 
+   * - Upon completion, the sorting operation will update the "current"
+   *   indicator within the DoubleBuffer wrapper to reference which of the two
+   *   buffers now contains the sorted output sequence (a function of the
    *   number of key bits specified and the targeted device architecture).
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    * - @devicestorageP
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
+   * The code snippet below illustrates the batched sorting of three segments
    * (with one zero-length segment) of `int` keys.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // Declare, allocate, and initialize device-accessible pointers for
    * // sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -1225,61 +1228,61 @@ struct DeviceSegmentedRadixSort
    *
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in,out] d_keys  
-   *   Reference to the double-buffer of keys whose "current" device-accessible 
-   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   * @param[in,out] d_keys
+   *   Reference to the double-buffer of keys whose "current" device-accessible
+   *   buffer contains the unsorted input keys and, upon return, is updated to
    *   point to the sorted output keys
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
    *   If `d_end_offsets[i] - 1` <= d_begin_offsets[i]`, the *i*<sup>th</sup>
    *   is considered empty.
    *
-   * @param[in] begin_bit  
-   *   **[optional]** The least-significant bit index (inclusive)  
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive)
    *   needed for key comparison
    *
-   * @param[in] end_bit  
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -1355,38 +1358,38 @@ struct DeviceSegmentedRadixSort
   }
 
   /**
-   * @brief Sorts segments of keys into descending order. 
+   * @brief Sorts segments of keys into descending order.
    * (`~2N` auxiliary storage required).
    *
    * @par
    * - The contents of the input data are not altered by the sorting operation
    * - When input a contiguous sequence of segments, a single sequence
-   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
-   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see
    *   the sorting interface using DoubleBuffer wrappers below.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
+   * The code snippet below illustrates the batched sorting of three segments
    * (with one zero-length segment) of `int` keys.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -1417,62 +1420,62 @@ struct DeviceSegmentedRadixSort
    *
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in  
+   * @param[in] d_keys_in
    *   Device-accessible pointer to the input data of key data to sort
    *
-   * @param[out] d_keys_out  
+   * @param[out] d_keys_out
    *   Device-accessible pointer to the sorted output sequence of key data
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] begin_bit  
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit  
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., sizeof(unsigned int) * 8)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
@@ -1551,7 +1554,7 @@ struct DeviceSegmentedRadixSort
   }
 
   /**
-   * @brief Sorts segments of keys into descending order. 
+   * @brief Sorts segments of keys into descending order.
    * (`~N` auxiliary storage required).
    *
    * @par
@@ -1559,37 +1562,37 @@ struct DeviceSegmentedRadixSort
    *   DoubleBuffer structure that indicates which of the two buffers is
    *   "current" (and thus contains the input data to be sorted).
    * - The contents of both buffers may be altered by the sorting operation.
-   * - Upon completion, the sorting operation will update the "current" 
-   *   indicator within the DoubleBuffer wrapper to reference which of the two 
-   *   buffers now contains the sorted output sequence (a function of the 
+   * - Upon completion, the sorting operation will update the "current"
+   *   indicator within the DoubleBuffer wrapper to reference which of the two
+   *   buffers now contains the sorted output sequence (a function of the
    *   number of key bits specified and the targeted device architecture).
    * - When input a contiguous sequence of segments, a single sequence
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
-   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
-   *   bits can be specified. This can reduce overall sorting overhead and 
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key
+   *   bits can be specified. This can reduce overall sorting overhead and
    *   yield a corresponding performance improvement.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    * - @devicestorageP
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the batched sorting of three segments 
+   * The code snippet below illustrates the batched sorting of three segments
    * (with one zero-length segment) of `int` keys.
    * @par
    * @code
-   * #include <cub/cub.cuh>   
+   * #include <cub/cub.cuh>
    * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for sorting data
    * int  num_items;          // e.g., 7
    * int  num_segments;       // e.g., 3
@@ -1619,61 +1622,61 @@ struct DeviceSegmentedRadixSort
    * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
    * @endcode
    *
-   * @tparam KeyT             
+   * @tparam KeyT
    *   **[inferred]** Key type
    *
-   * @tparam BeginOffsetIteratorT  
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   beginning offsets \iterator
    *
-   * @tparam EndOffsetIteratorT    
-   *   **[inferred]** Random-access input iterator type for reading segment 
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
    *   ending offsets \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in,out] d_keys  
-   *   Reference to the double-buffer of keys whose "current" device-accessible 
-   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   * @param[in,out] d_keys
+   *   Reference to the double-buffer of keys whose "current" device-accessible
+   *   buffer contains the unsorted input keys and, upon return, is updated to
    *   point to the sorted output keys
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   The total number of items within the segmented array, including items not
    *   covered by segments. `num_items` should match the largest element within
    *   the range `[d_end_offsets, d_end_offsets + num_segments)`.
    *
-   * @param[in] num_segments  
+   * @param[in] num_segments
    *   The number of segments that comprise the sorting data
    *
-   * @param[in] d_begin_offsets  
-   *   Random-access input iterator to the sequence of beginning offsets of 
-   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
-   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and
    *   `d_values_*`
    *
-   * @param[in] d_end_offsets  
-   *   Random-access input iterator to the sequence of ending offsets of length 
-   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
-   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
-   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*<sup>th</sup> is 
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is
    *   considered empty.
    *
-   * @param[in] begin_bit  
-   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   * @param[in] begin_bit
+   *   **[optional]** The least-significant bit index (inclusive) needed for
    *   key comparison
    *
-   * @param[in] end_bit  
-   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   * @param[in] end_bit
+   *   **[optional]** The most-significant bit index (exclusive) needed for key
    *   comparison (e.g., `sizeof(unsigned int) * 8`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyT,
diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh
index a776ac0f9e7..64de08f5fc1 100644
--- a/cub/cub/device/device_segmented_reduce.cuh
+++ b/cub/cub/device/device_segmented_reduce.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_reduce.cuh>
 #include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh
index 2b86da95078..4d2aebd64f4 100644
--- a/cub/cub/device/device_segmented_sort.cuh
+++ b/cub/cub/device/device_segmented_sort.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/device/dispatch/dispatch_segmented_sort.cuh>
 #include <cub/util_deprecated.cuh>
 #include <cub/util_namespace.cuh>
@@ -72,8 +75,8 @@ CUB_NAMESPACE_BEGIN
  * (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` and
  * `__nv_bfloat16` 16-bit floating-point types.
  *
- * @par Segments are not required to be contiguous. Any element of input(s) or 
- * output(s) outside the specified segments will not be accessed nor modified.  
+ * @par Segments are not required to be contiguous. Any element of input(s) or
+ * output(s) outside the specified segments will not be accessed nor modified.
  *
  * @par A simple example
  * @code
@@ -135,12 +138,12 @@ struct DeviceSegmentedSort
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -309,12 +312,12 @@ struct DeviceSegmentedSort
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -490,13 +493,13 @@ struct DeviceSegmentedSort
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -671,13 +674,13 @@ struct DeviceSegmentedSort
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -846,12 +849,12 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1009,12 +1012,12 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
-   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_keys_in, d_keys_in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not 
-   *   be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not
+   *   be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1181,13 +1184,13 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1350,13 +1353,13 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
-   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   The range `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_keys[i].Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_keys.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1520,13 +1523,13 @@ struct DeviceSegmentedSort
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1720,13 +1723,13 @@ struct DeviceSegmentedSort
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1930,16 +1933,16 @@ struct DeviceSegmentedSort
    *   @p j are equivalent: neither one is less than the other. It is not
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2131,16 +2134,16 @@ struct DeviceSegmentedSort
    *   @p i and @p j are equivalent: neither one is less than the other. It is
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2327,13 +2330,13 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2520,13 +2523,13 @@ struct DeviceSegmentedSort
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
    * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
-   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
-   *   not overlap `[in, in + num_items)`, 
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall
+   *   not overlap `[in, in + num_items)`,
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, 
-   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys_in[i]`, `d_values_in[i]`,
+   *   `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2723,16 +2726,16 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2918,16 +2921,16 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
-   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
-   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
-   *   `[cur, cur + num_items)` shall not overlap 
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt`
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range
+   *   `[cur, cur + num_items)` shall not overlap
    *   `[alt, alt + num_items)`. Both ranges shall not overlap
    *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
    *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
-   * - Segments are not required to be contiguous. For all index values `i` 
-   *   outside the specified segments `d_keys.Current()[i]`, 
-   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`, 
-   *   `d_values.Alternate()[i]` will not be accessed nor modified.   
+   * - Segments are not required to be contiguous. For all index values `i`
+   *   outside the specified segments `d_keys.Current()[i]`,
+   *   `d_values.Current()[i]`, `d_keys.Alternate()[i]`,
+   *   `d_values.Alternate()[i]` will not be accessed nor modified.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh
index f21431391ab..1c93894c95c 100644
--- a/cub/cub/device/device_select.cuh
+++ b/cub/cub/device/device_select.cuh
@@ -13,9 +13,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,17 +27,20 @@
  ******************************************************************************/
 
 /**
- * @file cub::DeviceSelect provides device-wide, parallel operations for 
- *       compacting selected items from sequences of data items residing within 
+ * @file cub::DeviceSelect provides device-wide, parallel operations for
+ *       compacting selected items from sequences of data items residing within
  *       device-accessible memory.
  */
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <stdio.h>
 
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_select_if.cuh>
 #include <cub/device/dispatch/dispatch_unique_by_key.cuh>
 #include <cub/util_deprecated.cuh>
@@ -46,8 +49,8 @@ CUB_NAMESPACE_BEGIN
 
 
 /**
- * @brief DeviceSelect provides device-wide, parallel operations for compacting 
- *        selected items from sequences of data items residing within 
+ * @brief DeviceSelect provides device-wide, parallel operations for compacting
+ *        selected items from sequences of data items residing within
  *        device-accessible memory. ![](select_logo.png)
  * @ingroup SingleModule
  *
@@ -62,15 +65,15 @@ CUB_NAMESPACE_BEGIN
  * @linear_performance{select-flagged, select-if, and select-unique}
  *
  * @par
- * The following chart illustrates DeviceSelect::If performance across 
- * different CUDA architectures for `int32` items, where 50% of the items are 
+ * The following chart illustrates DeviceSelect::If performance across
+ * different CUDA architectures for `int32` items, where 50% of the items are
  * randomly selected.
  *
  * @image html select_if_int32_50_percent.png
  *
  * @par
- * The following chart illustrates DeviceSelect::Unique performance across 
- * different CUDA architectures for `int32` items where segments have lengths 
+ * The following chart illustrates DeviceSelect::Unique performance across
+ * different CUDA architectures for `int32` items where segments have lengths
  * uniformly sampled from `[1, 1000]`.
  *
  * @image html select_unique_int32_len_500.png
@@ -82,28 +85,28 @@ CUB_NAMESPACE_BEGIN
 struct DeviceSelect
 {
   /**
-   * @brief Uses the `d_flags` sequence to selectively copy the corresponding 
-   *        items from `d_in` into `d_out`. The total number of items selected 
+   * @brief Uses the `d_flags` sequence to selectively copy the corresponding
+   *        items from `d_in` into `d_out`. The total number of items selected
    *        is written to `d_num_selected_out`. ![](select_flags_logo.png)
    *
    * @par
-   * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, 
+   * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`,
    *   `char`, `int`, etc.).
-   * - Copies of the selected items are compacted into `d_out` and maintain 
+   * - Copies of the selected items are compacted into `d_out` and maintain
    *   their original relative ordering.
-   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap 
+   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap
    *   `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor
    *   `d_num_selected_out` in any way.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>  // or equivalently <cub/device/device_select.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for input, 
+   * // Declare, allocate, and initialize device-accessible pointers for input,
    * // flags, and output
    * int  num_items;              // e.g., 8
    * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
@@ -116,7 +119,7 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::Flagged(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_flags, d_out, d_num_selected_out, num_items);
    *
    * // Allocate temporary storage
@@ -124,7 +127,7 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::Flagged(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_flags, d_out, d_num_selected_out, num_items);
    *
    * // d_out                 <-- [1, 4, 6, 7]
@@ -132,48 +135,48 @@ struct DeviceSelect
    *
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam FlagIterator         
-   *   **[inferred]** Random-access input iterator type for reading selection 
+   * @tparam FlagIterator
+   *   **[inferred]** Random-access input iterator type for reading selection
    *   flags \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Random-access output iterator type for writing selected 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing selected
    *   items \iterator
    *
-   * @tparam NumSelectedIteratorT  
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[in] d_flags  
+   * @param[in] d_flags
    *   Pointer to the input sequence of selection flags
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output sequence of selected data items
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the output total number of items selected 
+   * @param[out] d_num_selected_out
+   *   Pointer to the output total number of items selected
    *   (i.e., length of `d_out`)
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
@@ -243,30 +246,30 @@ struct DeviceSelect
                                          num_items,
                                          stream);
   }
-   
+
   /**
-   * @brief Uses the `d_flags` sequence to selectively compact the items in 
-   *        `d_data`. The total number of items selected is written to 
+   * @brief Uses the `d_flags` sequence to selectively compact the items in
+   *        `d_data`. The total number of items selected is written to
    *        `d_num_selected_out`. ![](select_flags_logo.png)
    *
    * @par
-   * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, 
+   * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`,
    *   `char`, `int`, etc.).
-   * - Copies of the selected items are compacted in-place and maintain 
+   * - Copies of the selected items are compacted in-place and maintain
    *   their original relative ordering.
-   * - The `d_data` may equal `d_flags`. The range 
-   *  `[d_data, d_data + num_items)` shall not overlap 
+   * - The `d_data` may equal `d_flags`. The range
+   *  `[d_data, d_data + num_items)` shall not overlap
    *  `[d_flags, d_flags + num_items)` in any other way.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>  // or equivalently <cub/device/device_select.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers for input, 
+   * // Declare, allocate, and initialize device-accessible pointers for input,
    * // flags, and output
    * int  num_items;              // e.g., 8
    * int  *d_data;                // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
@@ -278,7 +281,7 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::Flagged(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
-   *   d_in, d_flags, d_num_selected_out, num_items);
+   *   d_data, d_flags, d_num_selected_out, num_items);
    *
    * // Allocate temporary storage
@@ -286,7 +289,7 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::Flagged(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
-   *   d_in, d_flags, d_num_selected_out, num_items);
+   *   d_data, d_flags, d_num_selected_out, num_items);
    *
    * // d_data                <-- [1, 4, 6, 7]
@@ -294,40 +297,40 @@ struct DeviceSelect
    *
    * @endcode
    *
-   * @tparam IteratorT       
-   *   **[inferred]** Random-access iterator type for reading and writing 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access iterator type for reading and writing
    *   selected items \iterator
    *
-   * @tparam FlagIterator         
-   *   **[inferred]** Random-access input iterator type for reading selection 
+   * @tparam FlagIterator
+   *   **[inferred]** Random-access input iterator type for reading selection
    *   flags \iterator
    *
-   * @tparam NumSelectedIteratorT  
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
    * @param[in,out] d_data
    *   Pointer to the sequence of data items
    *
-   * @param[in] d_flags  
+   * @param[in] d_flags
    *   Pointer to the input sequence of selection flags
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the output total number of items selected 
+   * @param[out] d_num_selected_out
+   *   Pointer to the output total number of items selected
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_data`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename IteratorT,
@@ -355,7 +358,7 @@ struct DeviceSelect
                             SelectOp,
                             EqualityOp,
                             OffsetT,
-                            false, 
+                            false,
                             may_alias>::Dispatch(d_temp_storage,
                                                  temp_storage_bytes,
                                                  d_data, // in
@@ -395,20 +398,20 @@ struct DeviceSelect
   }
 
   /**
-   * @brief Uses the `select_op` functor to selectively copy items from `d_in` 
-   *        into `d_out`. The total number of items selected is written to 
+   * @brief Uses the `select_op` functor to selectively copy items from `d_in`
+   *        into `d_out`. The total number of items selected is written to
    *        `d_num_selected_out`. ![](select_logo.png)
    *
    * @par
-   * - Copies of the selected items are compacted into `d_out` and maintain 
+   * - Copies of the selected items are compacted into `d_out` and maintain
    *   their original relative ordering.
-   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap 
+   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap
    *   `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
    * - @devicestorage
    *
    * @par Performance
-   * The following charts illustrate saturated select-if performance across 
-   * different CUDA architectures for `int32` and `int64` items, respectively. 
+   * The following charts illustrate saturated select-if performance across
+   * different CUDA architectures for `int32` and `int64` items, respectively.
    * Items are selected with 50% probability.
    *
    * @image html select_if_int32_50_percent.png
@@ -421,7 +424,7 @@ struct DeviceSelect
    * @image html select_if_int64_5_percent.png
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
@@ -441,7 +444,7 @@ struct DeviceSelect
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int      num_items;              // e.g., 8
    * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
@@ -454,7 +457,7 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::If(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, d_num_selected_out, num_items, select_op);
    *
    * // Allocate temporary storage
@@ -462,55 +465,55 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::If(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, d_num_selected_out, num_items, select_op);
    *
    * // d_out                 <-- [0, 2, 3, 5, 2]
    * // d_num_selected_out    <-- [5]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Random-access output iterator type for writing selected 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing selected
    *   items \iterator
    *
-   * @tparam NumSelectedIteratorT  
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @tparam SelectOp             
-   *   **[inferred]** Selection operator type having member 
+   * @tparam SelectOp
+   *   **[inferred]** Selection operator type having member
    *   `bool operator()(const T &a)`
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output sequence of selected data items
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the output total number of items selected 
+   * @param[out] d_num_selected_out
+   *   Pointer to the output total number of items selected
    *   (i.e., length of `d_out`)
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] select_op  
+   * @param[in] select_op
    *   Unary selection operator
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
@@ -580,17 +583,17 @@ struct DeviceSelect
   }
 
   /**
-   * @brief Uses the `select_op` functor to selectively compact items in 
-   *        `d_data`. The total number of items selected is written to 
+   * @brief Uses the `select_op` functor to selectively compact items in
+   *        `d_data`. The total number of items selected is written to
    *        `d_num_selected_out`. ![](select_logo.png)
    *
    * @par
-   * - Copies of the selected items are compacted in `d_data` and maintain 
+   * - Copies of the selected items are compacted in `d_data` and maintain
    *   their original relative ordering.
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
@@ -610,7 +613,7 @@ struct DeviceSelect
    *     }
    * };
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int      num_items;              // e.g., 8
    * int      *d_data;                // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
@@ -622,7 +625,7 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::If(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, d_num_selected_out, num_items, select_op);
    *
    * // Allocate temporary storage
@@ -630,47 +633,47 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::If(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_data, d_num_selected_out, num_items, select_op);
    *
    * // d_data                <-- [0, 2, 3, 5, 2]
    * // d_num_selected_out    <-- [5]
    * @endcode
    *
-   * @tparam IteratorT       
-   *   **[inferred]** Random-access input iterator type for reading and 
+   * @tparam IteratorT
+   *   **[inferred]** Random-access input iterator type for reading and
    *   writing items \iterator
    *
-   * @tparam NumSelectedIteratorT  
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @tparam SelectOp             
-   *   **[inferred]** Selection operator type having member 
+   * @tparam SelectOp
+   *   **[inferred]** Selection operator type having member
    *   `bool operator()(const T &a)`
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
    * @param[in,out] d_data
    *   Pointer to the sequence of data items
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the output total number of items selected 
+   * @param[out] d_num_selected_out
+   *   Pointer to the output total number of items selected
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_data`)
    *
-   * @param[in] select_op  
+   * @param[in] select_op
    *   Unary selection operator
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename IteratorT,
@@ -737,43 +740,43 @@ struct DeviceSelect
   }
 
   /**
-   * @brief Given an input sequence `d_in` having runs of consecutive 
-   *        equal-valued keys, only the first key from each run is selectively 
-   *        copied to `d_out`. The total number of items selected is written to 
+   * @brief Given an input sequence `d_in` having runs of consecutive
+   *        equal-valued keys, only the first key from each run is selectively
+   *        copied to `d_out`. The total number of items selected is written to
    *        `d_num_selected_out`. ![](unique_logo.png)
    *
    * @par
-   * - The `==` equality operator is used to determine whether keys are 
+   * - The `==` equality operator is used to determine whether keys are
    *   equivalent
-   * - Copies of the selected items are compacted into `d_out` and maintain 
+   * - Copies of the selected items are compacted into `d_out` and maintain
    *   their original relative ordering.
-   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap 
+   * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap
    *   `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way.
    * - @devicestorage
    *
    * @par Performance
    * The following charts illustrate saturated select-unique performance across different
-   * CUDA architectures for `int32` and `int64` items, respectively. Segments 
+   * CUDA architectures for `int32` and `int64` items, respectively. Segments
    * have lengths uniformly sampled from `[1, 1000]`.
    *
    * @image html select_unique_int32_len_500.png
    * @image html select_unique_int64_len_500.png
    *
    * @par
-   * The following charts are similar, but with segment lengths uniformly 
+   * The following charts are similar, but with segment lengths uniformly
    * sampled from `[1, 10]`:
    *
    * @image html select_unique_int32_len_5.png
    * @image html select_unique_int64_len_5.png
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int  num_items;              // e.g., 8
    * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
@@ -785,7 +788,7 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::Unique(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, d_num_selected_out, num_items);
    *
    * // Allocate temporary storage
@@ -793,48 +796,48 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::Unique(
-   *   d_temp_storage, temp_storage_bytes, 
+   *   d_temp_storage, temp_storage_bytes,
    *   d_in, d_out, d_num_selected_out, num_items);
    *
    * // d_out                 <-- [0, 2, 9, 5, 8]
    * // d_num_selected_out    <-- [5]
    * @endcode
    *
-   * @tparam InputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   items \iterator
    *
-   * @tparam OutputIteratorT      
-   *   **[inferred]** Random-access output iterator type for writing selected 
+   * @tparam OutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing selected
    *   items \iterator
    *
-   * @tparam NumSelectedIteratorT  
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_in  
+   * @param[in] d_in
    *   Pointer to the input sequence of data items
    *
-   * @param[out] d_out  
+   * @param[out] d_out
    *   Pointer to the output sequence of selected data items
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the output total number of items selected 
+   * @param[out] d_num_selected_out
+   *   Pointer to the output total number of items selected
    *   (i.e., length of `d_out`)
    *
-   * @param[in] num_items  
+   * @param[in] num_items
    *   Total number of input items (i.e., length of `d_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within.  
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename InputIteratorT,
@@ -900,16 +903,16 @@ struct DeviceSelect
   }
 
   /**
-   * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of 
-   *        key-value pairs with consecutive equal-valued keys, only the first 
-   *        key and its value from each run is selectively copied to 
-   *        `d_keys_out` and `d_values_out`. The total number of items selected 
+   * @brief Given an input sequence `d_keys_in` and `d_values_in` with runs of
+   *        key-value pairs with consecutive equal-valued keys, only the first
+   *        key and its value from each run are selectively copied to
+   *        `d_keys_out` and `d_values_out`. The total number of items selected
    *        is written to `d_num_selected_out`. ![](unique_logo.png)
    *
    * @par
-   * - The `==` equality operator is used to determine whether keys are 
+   * - The `==` equality operator is used to determine whether keys are
    *   equivalent
-   * - Copies of the selected items are compacted into `d_out` and maintain 
+   * - Copies of the selected items are compacted into `d_keys_out` and `d_values_out` and maintain
    *   their original relative ordering.
    * - In-place operations are not supported. There must be no overlap between
    *   any of the provided ranges:
@@ -921,13 +924,13 @@ struct DeviceSelect
    * - @devicestorage
    *
    * @par Snippet
-   * The code snippet below illustrates the compaction of items selected from 
+   * The code snippet below illustrates the compaction of items selected from
    * an `int` device vector.
    * @par
    * @code
    * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
    *
-   * // Declare, allocate, and initialize device-accessible pointers 
+   * // Declare, allocate, and initialize device-accessible pointers
    * // for input and output
    * int  num_items;              // e.g., 8
    * int  *d_keys_in;             // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
@@ -941,8 +944,8 @@ struct DeviceSelect
    * void     *d_temp_storage = NULL;
    * size_t   temp_storage_bytes = 0;
    * cub::DeviceSelect::UniqueByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_values_in, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_values_in,
    *   d_keys_out, d_values_out, d_num_selected_out, num_items);
    *
    * // Allocate temporary storage
@@ -950,8 +953,8 @@ struct DeviceSelect
    *
    * // Run selection
    * cub::DeviceSelect::UniqueByKey(
-   *   d_temp_storage, temp_storage_bytes, 
-   *   d_keys_in, d_values_in, 
+   *   d_temp_storage, temp_storage_bytes,
+   *   d_keys_in, d_values_in,
    *   d_keys_out, d_values_out, d_num_selected_out, num_items);
    *
    * // d_keys_out            <-- [0, 2, 9, 5, 8]
@@ -959,56 +962,56 @@ struct DeviceSelect
    * // d_num_selected_out    <-- [5]
    * @endcode
    *
-   * @tparam KeyInputIteratorT       
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam KeyInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   keys \iterator
    *
-   * @tparam ValueInputIteratorT     
-   *   **[inferred]** Random-access input iterator type for reading input 
+   * @tparam ValueInputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input
    *   values \iterator
    *
-   * @tparam KeyOutputIteratorT      
-   *   **[inferred]** Random-access output iterator type for writing selected 
+   * @tparam KeyOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing selected
    *   keys \iterator
    *
-   * @tparam ValueOutputIteratorT    
-   *   **[inferred]** Random-access output iterator type for writing selected 
+   * @tparam ValueOutputIteratorT
+   *   **[inferred]** Random-access output iterator type for writing selected
    *   values \iterator
    *
-   * @tparam NumSelectedIteratorT    
-   *   **[inferred]** Output iterator type for recording the number of items 
+   * @tparam NumSelectedIteratorT
+   *   **[inferred]** Output iterator type for recording the number of items
    *   selected \iterator
    *
-   * @param[in] d_temp_storage  
-   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
-   *   required allocation size is written to `temp_storage_bytes` and no work 
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
    *   is done.
    *
-   * @param[in,out] temp_storage_bytes  
+   * @param[in,out] temp_storage_bytes
    *   Reference to size in bytes of `d_temp_storage` allocation
    *
-   * @param[in] d_keys_in  
+   * @param[in] d_keys_in
    *   Pointer to the input sequence of keys
    *
-   * @param[in] d_values_in  
+   * @param[in] d_values_in
    *   Pointer to the input sequence of values
    *
-   * @param[out] d_keys_out  
+   * @param[out] d_keys_out
    *   Pointer to the output sequence of selected keys
    *
-   * @param[out] d_values_out  
+   * @param[out] d_values_out
    *   Pointer to the output sequence of selected values
    *
-   * @param[out] d_num_selected_out  
-   *   Pointer to the total number of items selected (i.e., length of 
+   * @param[out] d_num_selected_out
+   *   Pointer to the total number of items selected (i.e., length of
    *   `d_keys_out` or `d_values_out`)
    *
-   * @param[in] num_items  
-   *   Total number of input items (i.e., length of `d_keys_in` or 
+   * @param[in] num_items
+   *   Total number of input items (i.e., length of `d_keys_in` or
    *   `d_values_in`)
    *
-   * @param[in] stream  
-   *   **[optional]** CUDA stream to launch kernels within. 
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
    *   Default is stream<sub>0</sub>.
    */
   template <typename KeyInputIteratorT,
diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh
index 22dcbe5fcf5..c23fce98439 100644
--- a/cub/cub/device/device_spmv.cuh
+++ b/cub/cub/device/device_spmv.cuh
@@ -34,6 +34,10 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <stdio.h>
 #include <iterator>
 #include <limits>
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 6328e7d787b..be0be052a54 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -27,8 +27,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_adjacent_difference.cuh>
-#include <cub/config.cuh>
 #include <cub/detail/type_traits.cuh>
 #include <cub/util_debug.cuh>
 #include <cub/util_deprecated.cuh>
@@ -67,10 +70,10 @@ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
                                          DifferenceOpT difference_op,
                                          OffsetT num_items)
 {
-  using ActivePolicyT = 
+  using ActivePolicyT =
     typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy;
 
-  // It is OK to introspect the return type or parameter types of the 
+  // It is OK to introspect the return type or parameter types of the
   // `operator()` function of `__device__` extended lambda within device code.
   using OutputT = detail::invoke_result_t<DifferenceOpT, InputT, InputT>;
 
@@ -94,7 +97,7 @@ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
               num_items);
 
   int tile_idx = static_cast<int>(blockIdx.x);
-  OffsetT tile_base  = static_cast<OffsetT>(tile_idx) 
+  OffsetT tile_base  = static_cast<OffsetT>(tile_idx)
                      * ActivePolicyT::ITEMS_PER_TILE;
 
   agent.Process(tile_idx, tile_base);
@@ -313,7 +316,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy
               num_items);
 
       error = CubDebug(detail::DebugSyncStream(stream));
-      
+
       if (cudaSuccess != error)
       {
         break;
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 04384ae0451..61b4232bf15 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_batch_memcpy.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
-#include <cub/config.cuh>
 #include <cub/detail/temporary_storage.cuh>
 #include <cub/thread/thread_search.cuh>
 #include <cub/util_debug.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 5c9aab47a85..921eeef7258 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -35,8 +35,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_histogram.cuh>
-#include <cub/config.cuh>
 #include <cub/detail/cpp_compatibility.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_temporary_storage.cuh>
@@ -567,8 +570,8 @@ struct dispatch_histogram
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
- * @tparam SelectedPolicy 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam SelectedPolicy
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <int NUM_CHANNELS,
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 79bdfa86775..b4169a77f5f 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -27,6 +27,10 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_merge_sort.cuh>
 #include <cub/util_deprecated.cuh>
 #include <cub/util_device.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index b57ccaace88..f64f16d6bb7 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -33,13 +33,16 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_radix_sort_downsweep.cuh>
 #include <cub/agent/agent_radix_sort_histogram.cuh>
 #include <cub/agent/agent_radix_sort_onesweep.cuh>
 #include <cub/agent/agent_radix_sort_upsweep.cuh>
 #include <cub/agent/agent_scan.cuh>
 #include <cub/block/block_radix_sort.cuh>
-#include <cub/config.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/util_debug.cuh>
 #include <cub/util_deprecated.cuh>
@@ -75,7 +78,7 @@ template <
     bool     IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
     typename KeyT,                           ///< Key type
     typename OffsetT,                        ///< Signed integer type for global offsets
-    typename DecomposerT = detail::identity_decomposer_t>                        
+    typename DecomposerT = detail::identity_decomposer_t>
 __launch_bounds__ (int((ALT_DIGIT_BITS) ?
     int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) :
     int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
@@ -316,7 +319,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
     ValueT          values[ITEMS_PER_THREAD];
 
     // Get default (min/max) value for out-of-bounds keys
-    bit_ordered_type default_key_bits = IS_DESCENDING 
+    bit_ordered_type default_key_bits = IS_DESCENDING
                                       ? traits::min_raw_binary_key(decomposer)
                                       : traits::max_raw_binary_key(decomposer);
 
@@ -540,7 +543,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
  * Onesweep kernels
  ******************************************************************************/
 
-/** 
+/**
  * Kernel for computing multiple histograms
  */
 
@@ -552,7 +555,7 @@ template <typename ChainedPolicyT,
           typename KeyT,
           typename OffsetT,
           typename DecomposerT = detail::identity_decomposer_t>
-CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) 
+CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
 void DeviceRadixSortHistogramKernel(OffsetT *d_bins_out,
                                     const KeyT *d_keys_in,
                                     OffsetT num_items,
@@ -594,7 +597,7 @@ DeviceRadixSortOnesweepKernel
 }
 
 
-/** 
+/**
  * Exclusive sum kernel
  */
 template <
@@ -722,10 +725,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1,
             RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT,
@@ -778,10 +781,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1,
             RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT,
@@ -821,10 +824,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256, OFFSET_64BIT ? 29 : 30, DominantT, 2,
             RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS,
@@ -864,10 +867,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2,
             RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS,
@@ -905,10 +908,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2,
             RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS,
@@ -948,10 +951,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <256,
             sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4,
@@ -993,10 +996,10 @@ struct DeviceRadixSortPolicy
 
         // Histogram policy
         typedef AgentRadixSortHistogramPolicy <128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy;
-        
+
         // Exclusive sum policy
         typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy;
-        
+
         // Onesweep policy
         typedef AgentRadixSortOnesweepPolicy <384,
             OFFSET_64BIT && sizeof(KeyT) == 4 && !KEYS_ONLY ? 17 : 21, DominantT, 1,
@@ -1162,8 +1165,8 @@ struct DeviceRadixSortPolicy
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
- * @tparam DecomposerT 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam DecomposerT
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <bool IS_DESCENDING,
@@ -1503,19 +1506,19 @@ struct DispatchRadixSort : SelectedPolicy
                 radix_digits            = 1 << radix_bits;
 
                 error = CubDebug(upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
 
                 error = CubDebug(scan_config.Init<ScanPolicyT>(scan_kernel));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
 
                 error = CubDebug(downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
@@ -1629,7 +1632,7 @@ struct DispatchRadixSort : SelectedPolicy
                                                                            histogram_kernel,
                                                                            HISTO_BLOCK_THREADS,
                                                                            0));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -1713,7 +1716,7 @@ struct DispatchRadixSort : SelectedPolicy
                                                0,
                                                num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
                                                stream));
-                    if (cudaSuccess != error) 
+                    if (cudaSuccess != error)
                     {
                         break;
                     }
@@ -1760,7 +1763,7 @@ struct DispatchRadixSort : SelectedPolicy
                 {
                     break;
                 }
-                
+
                 // use the temporary buffers if no overwrite is allowed
                 if (!is_overwrite_okay && pass == 0)
                 {
@@ -1775,7 +1778,7 @@ struct DispatchRadixSort : SelectedPolicy
                 d_values.selector ^= 1;
             }
         } while (0);
-        
+
         return error;
     }
 
@@ -1799,7 +1802,7 @@ struct DispatchRadixSort : SelectedPolicy
             // Get device ordinal
             int device_ordinal;
             error = CubDebug(cudaGetDevice(&device_ordinal));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -1808,7 +1811,7 @@ struct DispatchRadixSort : SelectedPolicy
             int sm_count;
             error = CubDebug(
               cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -1899,7 +1902,7 @@ struct DispatchRadixSort : SelectedPolicy
                                   spine_length,
                                   current_bit,
                                   (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -1917,7 +1920,7 @@ struct DispatchRadixSort : SelectedPolicy
                   current_bit,
                   (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
 
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
@@ -1956,7 +1959,7 @@ struct DispatchRadixSort : SelectedPolicy
             DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT, DecomposerT>,
             RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
             DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>,
-            DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>);        
+            DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>);
     }
 
     template <typename ActivePolicyT>
@@ -1977,7 +1980,7 @@ struct DispatchRadixSort : SelectedPolicy
             temp_storage_bytes = 1;
             return cudaSuccess;
         }
-        
+
         // Copy keys
         #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
         _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long)num_items,
@@ -2103,7 +2106,7 @@ struct DispatchRadixSort : SelectedPolicy
             int ptx_version = 0;
 
             error = CubDebug(PtxVersion(ptx_version));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -2397,7 +2400,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
 
             // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
             error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -2437,7 +2440,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
                                   d_values_remaining_passes.Current(),
                                   current_bit,
                                   (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -2452,7 +2455,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
                   d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                   current_bit,
                   (current_bit < alt_end_bit) ? alt_pass_config : pass_config));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
@@ -2533,7 +2536,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
             int ptx_version = 0;
 
             error = CubDebug(PtxVersion(ptx_version));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -2548,7 +2551,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
 
             // Dispatch to chained policy
             error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index 39a777c8e3d..e6b6ab0d452 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -34,8 +34,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_reduce.cuh>
-#include <cub/config.cuh>
 #include <cub/grid/grid_even_share.cuh>
 #include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index a758fa60992..7e4deb4c3d6 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -33,8 +33,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_reduce_by_key.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce_by_key.cuh>
 #include <cub/grid/grid_queue.cuh>
@@ -205,8 +208,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
- * @tparam SelectedPolicy 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam SelectedPolicy
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <typename KeysInputIteratorT,
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index 18309b44c7d..34ecc13a0a1 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -34,8 +34,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_rle.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_run_length_encode.cuh>
 #include <cub/grid/grid_queue.cuh>
@@ -172,8 +175,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA
  * @tparam OffsetT
  *   Signed integer type for global offsets
  *
- * @tparam SelectedPolicy 
- *   Implementation detail, do not specify directly, requirements on the 
+ * @tparam SelectedPolicy
+ *   Implementation detail, do not specify directly, requirements on the
  *   content of this type are subject to breaking change.
  */
 template <typename InputIteratorT,
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 6893f24e1dc..89bdb36c178 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -14,9 +14,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -35,8 +35,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_scan.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/tuning/tuning_scan.cuh>
 #include <cub/grid/grid_queue.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -211,7 +214,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
  *   Random-access output iterator type for writing scan outputs \iterator
  *
  * @tparam ScanOpT
- *   Binary scan functor type having member 
+ *   Binary scan functor type having member
  *   `auto operator()(const T &a, const U &b)`
  *
  * @tparam InitValueT
@@ -226,9 +229,9 @@ template <typename InputIteratorT,
           typename ScanOpT,
           typename InitValueT,
           typename OffsetT,
-          typename AccumT = 
+          typename AccumT =
             detail::accumulator_t<
-              ScanOpT, 
+              ScanOpT,
               cub::detail::conditional_t<
                 std::is_same<InitValueT, NullType>::value,
                 cub::detail::value_t<InputIteratorT>,
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 62df5c6b913..13a4f354f00 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -12,9 +12,9 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -26,15 +26,18 @@
  ******************************************************************************/
 
 /**
- * @file DeviceScan provides device-wide, parallel operations for computing a 
- *       prefix scan across a sequence of data items residing within 
+ * @file DeviceScan provides device-wide, parallel operations for computing a
+ *       prefix scan across a sequence of data items residing within
  *       device-accessible memory.
  */
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_scan_by_key.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_scan_by_key.cuh>
 #include <cub/thread/thread_operators.cuh>
@@ -465,7 +468,7 @@ struct DispatchScanByKey : SelectedPolicy
       }
 
       // Sync the stream if specified to flush runtime errors
-      
+
       error = CubDebug(detail::DebugSyncStream(stream));
       if (cudaSuccess != error)
       {
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index e2d5da09669..68db255b044 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -27,6 +27,10 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_segmented_radix_sort.cuh>
 #include <cub/agent/agent_sub_warp_merge_sort.cuh>
 #include <cub/block/block_load.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index bb9fe685768..8b84113bb2b 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -28,14 +28,17 @@
 
 /**
  * @file
- *   cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences 
+ *   cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences
  *   of data items residing within device-accessible memory.
  */
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_select_if.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_select_if.cuh>
 #include <cub/grid/grid_queue.cuh>
@@ -64,39 +67,39 @@ CUB_NAMESPACE_BEGIN
  * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
  * Otherwise performs discontinuity selection (keep unique)
  *
- * @tparam InputIteratorT 
+ * @tparam InputIteratorT
  *   Random-access input iterator type for reading input items
  *
- * @tparam FlagsInputIteratorT 
- *   Random-access input iterator type for reading selection flags (NullType* if a selection functor 
+ * @tparam FlagsInputIteratorT
+ *   Random-access input iterator type for reading selection flags (NullType* if a selection functor
  *   or discontinuity flagging is to be used for selection)
  *
- * @tparam SelectedOutputIteratorT 
+ * @tparam SelectedOutputIteratorT
  *   Random-access output iterator type for writing selected items
  *
- * @tparam NumSelectedIteratorT 
+ * @tparam NumSelectedIteratorT
  *   Output iterator type for recording the number of items selected
  *
- * @tparam ScanTileStateT 
+ * @tparam ScanTileStateT
  *   Tile status interface type
  *
- * @tparam SelectOpT 
- *   Selection operator type (NullType if selection flags or discontinuity flagging is 
+ * @tparam SelectOpT
+ *   Selection operator type (NullType if selection flags or discontinuity flagging is
  *   to be used for selection)
  *
- * @tparam EqualityOpT 
- *   Equality operator type (NullType if selection functor or selection flags is 
+ * @tparam EqualityOpT
+ *   Equality operator type (NullType if selection functor or selection flags is
  *   to be used for selection)
  *
- * @tparam OffsetT 
+ * @tparam OffsetT
  *   Signed integer type for global offsets
  *
- * @tparam KEEP_REJECTS 
+ * @tparam KEEP_REJECTS
  *   Whether or not we push rejected items to the back of the output
  *
  * @param[in] d_in
  *   Pointer to the input sequence of data items
- * 
+ *
  * @param[in] d_flags
  *   Pointer to the input sequence of selection flags (if applicable)
  *
@@ -106,18 +109,18 @@ CUB_NAMESPACE_BEGIN
  * @param[out] d_num_selected_out
  *   Pointer to the total number of items selected (i.e., length of \p d_selected_out)
  *
- * @param[in] tile_status 
+ * @param[in] tile_status
  *   Tile status interface
  *
  * @param[in] select_op
  *   Selection operator
- * 
+ *
  * @param[in] equality_op
  *   Equality operator
- * 
+ *
  * @param[in] num_items
  *   Total number of input items (i.e., length of \p d_in)
- * 
+ *
  * @param[in] num_tiles
  *   Total number of tiles for the entire problem
  */
@@ -176,7 +179,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA
  *   Random-access input iterator type for reading input items
  *
  * @tparam FlagsInputIteratorT
- *   Random-access input iterator type for reading selection flags 
+ *   Random-access input iterator type for reading selection flags
  *   (NullType* if a selection functor or discontinuity flagging is to be used for selection)
  *
  * @tparam SelectedOutputIteratorT
@@ -186,11 +189,11 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA
  *   Output iterator type for recording the number of items selected
  *
  * @tparam SelectOpT
- *   Selection operator type (NullType if selection flags or discontinuity flagging is 
+ *   Selection operator type (NullType if selection flags or discontinuity flagging is
  *   to be used for selection)
  *
  * @tparam EqualityOpT
- *   Equality operator type (NullType if selection functor or selection flags is to 
+ *   Equality operator type (NullType if selection functor or selection flags is to
  *   be used for selection)
  *
  * @tparam OffsetT
@@ -225,13 +228,13 @@ struct DispatchSelectIf : SelectedPolicy
 
     static constexpr int INIT_KERNEL_THREADS = 128;
 
-    /// Device-accessible allocation of temporary storage. 
-    /// When `nullptr`, the required allocation size is written to `temp_storage_bytes` 
+    /// Device-accessible allocation of temporary storage.
+    /// When `nullptr`, the required allocation size is written to `temp_storage_bytes`
     /// and no work is done.
     void* d_temp_storage;
 
     /// Reference to size in bytes of `d_temp_storage` allocation
-    size_t& temp_storage_bytes; 
+    size_t& temp_storage_bytes;
 
     /// Pointer to the input sequence of data items
     InputIteratorT d_in;
@@ -261,11 +264,11 @@ struct DispatchSelectIf : SelectedPolicy
 
     /**
      * @param d_temp_storage
-     *   Device-accessible allocation of temporary storage. 
-     *   When `nullptr`, the required allocation size is written to `temp_storage_bytes` 
+     *   Device-accessible allocation of temporary storage.
+     *   When `nullptr`, the required allocation size is written to `temp_storage_bytes`
      *   and no work is done.
-     * 
-     * @param temp_storage_bytes 
+     *
+     * @param temp_storage_bytes
      *   Reference to size in bytes of `d_temp_storage` allocation
      *
      * @param d_in
@@ -339,7 +342,7 @@ struct DispatchSelectIf : SelectedPolicy
             // Get device ordinal
             int device_ordinal;
             error = CubDebug(cudaGetDevice(&device_ordinal));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -352,7 +355,7 @@ struct DispatchSelectIf : SelectedPolicy
 
             // bytes needed for tile status descriptors
             error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -361,7 +364,7 @@ struct DispatchSelectIf : SelectedPolicy
             void* allocations[1] = {};
 
             error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -375,7 +378,7 @@ struct DispatchSelectIf : SelectedPolicy
             // Construct the tile status interface
             ScanTileStateT tile_status;
             error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -418,7 +421,7 @@ struct DispatchSelectIf : SelectedPolicy
             // Get max x-dimension of grid
             int max_dim_x;
             error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
@@ -509,11 +512,11 @@ struct DispatchSelectIf : SelectedPolicy
      * Internal dispatch routine
      *
      * @param d_temp_storage
-     *   Device-accessible allocation of temporary storage. 
-     *   When `nullptr`, the required allocation size is written to `temp_storage_bytes` 
+     *   Device-accessible allocation of temporary storage.
+     *   When `nullptr`, the required allocation size is written to `temp_storage_bytes`
      *   and no work is done.
-     * 
-     * @param temp_storage_bytes 
+     *
+     * @param temp_storage_bytes
      *   Reference to size in bytes of `d_temp_storage` allocation
      *
      * @param d_in
@@ -555,7 +558,7 @@ struct DispatchSelectIf : SelectedPolicy
         using MaxPolicyT = typename SelectedPolicy::MaxPolicy;
 
         int ptx_version = 0;
-        if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) 
+        if (cudaError_t error = CubDebug(PtxVersion(ptx_version)))
         {
             return error;
         }
@@ -571,23 +574,23 @@ struct DispatchSelectIf : SelectedPolicy
                                     num_items,
                                     stream,
                                     ptx_version);
-        
+
         return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch));
     }
 
     CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
     CUB_RUNTIME_FUNCTION __forceinline__
     static cudaError_t Dispatch(
-        void*                       d_temp_storage,          
-        size_t&                     temp_storage_bytes,       
-        InputIteratorT              d_in,                      
-        FlagsInputIteratorT         d_flags,                    
-        SelectedOutputIteratorT     d_selected_out,              
-        NumSelectedIteratorT        d_num_selected_out,           
-        SelectOpT                   select_op,                     
-        EqualityOpT                 equality_op,                    
-        OffsetT                     num_items,            
-        cudaStream_t                stream,                
+        void*                       d_temp_storage,
+        size_t&                     temp_storage_bytes,
+        InputIteratorT              d_in,
+        FlagsInputIteratorT         d_flags,
+        SelectedOutputIteratorT     d_selected_out,
+        NumSelectedIteratorT        d_num_selected_out,
+        SelectOpT                   select_op,
+        EqualityOpT                 equality_op,
+        OffsetT                     num_items,
+        cudaStream_t                stream,
         bool                        debug_synchronous)
     {
       CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
index 8f2a68f967c..89376f617df 100644
--- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -34,10 +34,13 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_segment_fixup.cuh>
 #include <cub/agent/agent_spmv_orig.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
-#include <cub/config.cuh>
 #include <cub/detail/cpp_compatibility.cuh>
 #include <cub/grid/grid_queue.cuh>
 #include <cub/thread/thread_search.cuh>
@@ -196,9 +199,9 @@ DeviceSpmvEmptyMatrixKernel(SpmvParams<ValueT, OffsetT> spmv_params)
     {
         ValueT result = 0.0;
 
-        CUB_IF_CONSTEXPR(HAS_BETA) 
+        CUB_IF_CONSTEXPR(HAS_BETA)
         {
-            result += spmv_params.beta * spmv_params.d_vector_y[row]; 
+            result += spmv_params.beta * spmv_params.d_vector_y[row];
         }
 
         spmv_params.d_vector_y[row] = result;
@@ -838,7 +841,7 @@ struct DispatchSpmv
             constexpr bool has_beta = false;
 
             if (CubDebug(error = Dispatch(
-                d_temp_storage, temp_storage_bytes, spmv_params, stream, 
+                d_temp_storage, temp_storage_bytes, spmv_params, stream,
                 DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
                 DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
                 DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, has_alpha, has_beta>,
diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
index 8b1849fa24c..3e36e336dfc 100644
--- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -27,8 +27,11 @@
 
 #pragma once
 
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_three_way_partition.cuh>
-#include <cub/config.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_three_way_partition.cuh>
 #include <cub/thread/thread_operators.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
index c924e71ef7e..36ab55be61c 100644
--- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -34,8 +34,8 @@
 #include <cub/agent/agent_unique_by_key.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
 #include <cub/device/dispatch/tuning/tuning_unique_by_key.cuh>
+#include <cub/config.cuh>
 #include <cub/util_deprecated.cuh>
-#include <cub/util_macro.cuh>
 #include <cub/util_math.cuh>
 
 #include <iterator>
diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
index 2b595e91e79..9cdc7173f62 100644
--- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh
@@ -27,9 +27,12 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_histogram.cuh>
 #include <cub/block/block_load.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index e06ddb5019e..1ea37100a07 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -27,12 +27,15 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_reduce_by_key.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
index d7ad21c808f..cfad085825f 100644
--- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh
@@ -27,13 +27,16 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_reduce_by_key.cuh>
 #include <cub/agent/agent_rle.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
@@ -157,7 +160,7 @@ struct sm90_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor = detail::no_delay_constructor_t<515>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class LengthT>
 struct sm90_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -259,7 +262,7 @@ struct sm80_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor = detail::no_delay_constructor_t<1075>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class LengthT>
 struct sm80_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -287,7 +290,7 @@ struct sm80_tuning<LengthT, __uint128_t, primitive_length::yes, primitive_key::n
 
 } // namespace encode
 
-namespace non_trivial_runs 
+namespace non_trivial_runs
 {
 
 template <class LengthT,
@@ -304,7 +307,7 @@ struct sm90_tuning
 
   static constexpr int items = CUB_MIN(nominal_4b_items_per_thread,
                                  CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT))));
-  
+
   static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
 
   static constexpr bool store_with_time_slicing = true;
@@ -368,7 +371,7 @@ struct sm90_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor = detail::no_delay_constructor_t<840>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class LengthT>
 struct sm90_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
@@ -412,7 +415,7 @@ struct sm80_tuning
 
   static constexpr int items = CUB_MIN(nominal_4b_items_per_thread,
                                  CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT))));
-  
+
   static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
 
   static constexpr bool store_with_time_slicing = true;
@@ -476,7 +479,7 @@ struct sm80_tuning<LengthT, KeyT, primitive_length::yes, primitive_key::yes, len
   using delay_constructor = detail::no_delay_constructor_t<1065>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class LengthT>
 struct sm80_tuning<LengthT, __int128_t, primitive_length::yes, primitive_key::no, length_size::_4, key_size::_16>
 {
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index eff4f29fcd2..f26a1655f9c 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -27,12 +27,15 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_scan.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
@@ -102,7 +105,7 @@ template <class T> struct sm90_tuning<T, primitive_op::yes, primitive_accum::yes
 template <> struct sm90_tuning<float,  primitive_op::yes, primitive_accum::yes, accum_size::_4> : tuning<128, 24, 688, 1140> {};
 template <> struct sm90_tuning<double, primitive_op::yes, primitive_accum::yes, accum_size::_8> : tuning<224, 24, 576, 1215> {};
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <> struct sm90_tuning< __int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {};
 template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {};
 #endif
@@ -229,7 +232,7 @@ struct sm80_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si
 } // namespace detail
 
 
-template <typename AccumT, typename ScanOpT = Sum> 
+template <typename AccumT, typename ScanOpT = Sum>
 struct DeviceScanPolicy
 {
   // For large values, use timesliced loads/stores to fit shared memory.
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
index 92e4931fe81..9f7e679c45a 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
@@ -27,12 +27,15 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_scan_by_key.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
@@ -169,7 +172,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<488, 1070>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
 {
@@ -255,7 +258,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<352, 1170>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
 {
@@ -341,7 +344,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<556, 1195>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
 {
@@ -427,7 +430,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<600, 930>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
 {
@@ -513,7 +516,7 @@ struct sm90_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<320, 1200>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm90_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
 {
@@ -627,7 +630,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_1, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<124, 1040>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_1, val_size::_16, primitive_accum::no>
 {
@@ -713,7 +716,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_2, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<160, 695>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_2, val_size::_16, primitive_accum::no>
 {
@@ -799,7 +802,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_4, val_size::_8,
   using delay_constructor = detail::fixed_delay_constructor_t<888, 635>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_4, val_size::_16, primitive_accum::no>
 {
@@ -885,7 +888,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_8, val_size::_8,
   using delay_constructor = detail::no_delay_constructor_t<1160>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_8, val_size::_16, primitive_accum::no>
 {
@@ -971,7 +974,7 @@ struct sm80_tuning<KeyT, ValueT, primitive_op::yes, key_size::_16, val_size::_8,
   using delay_constructor = detail::no_delay_constructor_t<1030>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <class KeyT>
 struct sm80_tuning<KeyT, __int128_t, primitive_op::yes, key_size::_16, val_size::_16, primitive_accum::no>
 {
diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
index 41e5b34f5ae..1d1dc6eb587 100644
--- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh
@@ -27,20 +27,23 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_select_if.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_type.cuh>
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail 
+namespace detail
 {
 
-namespace select 
+namespace select
 {
 
 enum class flagged { no, yes };
@@ -152,7 +155,7 @@ struct sm90_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primit
     using delay_constructor = detail::fixed_delay_constructor_t<380, 1140>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -221,7 +224,7 @@ struct sm90_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primi
     using delay_constructor = detail::fixed_delay_constructor_t<360, 1170>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -290,7 +293,7 @@ struct sm90_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primi
     using delay_constructor = detail::fixed_delay_constructor_t<512, 1075>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -359,7 +362,7 @@ struct sm90_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, prim
     using delay_constructor = detail::fixed_delay_constructor_t<532, 1180>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -450,7 +453,7 @@ struct sm80_tuning<Input, flagged::no, keep_rejects::no, offset_size::_4, primit
     using delay_constructor = detail::fixed_delay_constructor_t<832, 1165>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -519,7 +522,7 @@ struct sm80_tuning<Input, flagged::yes, keep_rejects::no, offset_size::_4, primi
     using delay_constructor = detail::no_delay_constructor_t<1130>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -588,7 +591,7 @@ struct sm80_tuning<Input, flagged::no, keep_rejects::yes, offset_size::_4, primi
     using delay_constructor = detail::fixed_delay_constructor_t<68, 1160>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
@@ -657,7 +660,7 @@ struct sm80_tuning<Input, flagged::yes, keep_rejects::yes, offset_size::_4, prim
     using delay_constructor = detail::fixed_delay_constructor_t<884, 1130>;
 };
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
 {
diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh
index c4b09d8495f..a73e8eed295 100644
--- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh
@@ -27,11 +27,14 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_three_way_partition.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh
index d16f770965b..3fffeb055a2 100644
--- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh
@@ -27,11 +27,14 @@
 
 #pragma once
 
+#include "../../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/agent/agent_unique_by_key.cuh>
 #include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
-#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
 #include <cub/util_type.cuh>
@@ -716,7 +719,7 @@ struct DeviceUniqueByKeyPolicy
                                                       detail::default_delay_constructor_t<int>>;
   };
 
-  struct DefaultTuning 
+  struct DefaultTuning
   {
     static constexpr int INPUT_SIZE = sizeof(KeyT);
     enum
diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh
index 063a2c395cd..c91d8c8624d 100644
--- a/cub/cub/grid/grid_barrier.cuh
+++ b/cub/cub/grid/grid_barrier.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,8 +33,11 @@
 
 #pragma once
 
-#include "../util_debug.cuh"
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
+#include "../util_debug.cuh"
 #include "../thread/thread_load.cuh"
 
 CUB_NAMESPACE_BEGIN
@@ -195,13 +198,13 @@ public:
 
                 // Allocate and initialize to zero
                 retval = CubDebug(cudaMalloc((void**) &d_sync, sync_bytes));
-                if (cudaSuccess != retval) 
+                if (cudaSuccess != retval)
                 {
                     break;
                 }
 
                 retval = CubDebug(cudaMemset(d_sync, 0, new_sync_bytes));
-                if (cudaSuccess != retval) 
+                if (cudaSuccess != retval)
                 {
                     break;
                 }
diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh
index d2150511321..56e396e4c87 100644
--- a/cub/cub/grid/grid_even_share.cuh
+++ b/cub/cub/grid/grid_even_share.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -35,8 +35,9 @@
 #pragma once
 
 #include "../config.cuh"
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_math.cuh"
 #include "../util_type.cuh"
 #include "grid_mapping.cuh"
diff --git a/cub/cub/grid/grid_mapping.cuh b/cub/cub/grid/grid_mapping.cuh
index b57f193deb8..cf69555a9a0 100644
--- a/cub/cub/grid/grid_mapping.cuh
+++ b/cub/cub/grid/grid_mapping.cuh
@@ -35,6 +35,9 @@
 
 #include "../config.cuh"
 
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
+
 CUB_NAMESPACE_BEGIN
 
 
diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh
index e1933e3d381..6dba6fe55d2 100644
--- a/cub/cub/grid/grid_queue.cuh
+++ b/cub/cub/grid/grid_queue.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_debug.cuh>
 
 #include <nv/target>
diff --git a/cub/cub/host/mutex.cuh b/cub/cub/host/mutex.cuh
index 4ee40288452..dbd6435db00 100644
--- a/cub/cub/host/mutex.cuh
+++ b/cub/cub/host/mutex.cuh
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <mutex>
 
-#include <cub/config.cuh>
 #include <cub/util_deprecated.cuh>
 
 
@@ -43,8 +46,8 @@ CUB_NAMESPACE_BEGIN
 
 
 /**
- * Wraps std::mutex 
- * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed 
+ * Wraps std::mutex
+ * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed
  *             in a future release. Use `std::mutex` instead.
  */
 struct CUB_DEPRECATED Mutex
diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh
index 7ea860981e6..eb94aace2a2 100644
--- a/cub/cub/iterator/arg_index_input_iterator.cuh
+++ b/cub/cub/iterator/arg_index_input_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
-#include "../config.cuh"
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
 
diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh
index 9a5936d5ac6..dae5c0bb63f 100644
--- a/cub/cub/iterator/cache_modified_input_iterator.cuh
+++ b/cub/cub/iterator/cache_modified_input_iterator.cuh
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
-#include "../config.cuh"
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
 
diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh
index 91d4fc91a7e..daf3b5e905e 100644
--- a/cub/cub/iterator/cache_modified_output_iterator.cuh
+++ b/cub/cub/iterator/cache_modified_output_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
-#include "../config.cuh"
 
 #if (THRUST_VERSION >= 100700)
     // This iterator is compatible with Thrust API 1.7 and newer
diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh
index 3de5123df34..c9128d5101c 100644
--- a/cub/cub/iterator/constant_input_iterator.cuh
+++ b/cub/cub/iterator/constant_input_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
-#include "../config.cuh"
 
 #if (THRUST_VERSION >= 100700)
     // This iterator is compatible with Thrust API 1.7 and newer
diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh
index 700455f420c..ed6d254ef47 100644
--- a/cub/cub/iterator/counting_input_iterator.cuh
+++ b/cub/cub/iterator/counting_input_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
-#include "../config.cuh"
 
 #if (THRUST_VERSION >= 100700)
     // This iterator is compatible with Thrust API 1.7 and newer
diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh
index ac47a3ff344..29a30060123 100644
--- a/cub/cub/iterator/discard_output_iterator.cuh
+++ b/cub/cub/iterator/discard_output_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,11 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
-#include "../config.cuh"
-
 #if (THRUST_VERSION >= 100700)
     // This iterator is compatible with Thrust API 1.7 and newer
     #include <thrust/iterator/iterator_facade.h>
diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh
index cd3d015aab8..7e7e3fc0ea2 100644
--- a/cub/cub/iterator/tex_obj_input_iterator.cuh
+++ b/cub/cub/iterator/tex_obj_input_iterator.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/thread/thread_load.cuh>
 #include <cub/thread/thread_store.cuh>
 #include <cub/util_debug.cuh>
diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh
index 0d877e1db49..cc29c69e2f3 100644
--- a/cub/cub/iterator/tex_ref_input_iterator.cuh
+++ b/cub/cub/iterator/tex_ref_input_iterator.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/iterator/tex_obj_input_iterator.cuh>
 
 #include <cstddef>
diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh
index 0b3350e88a4..99979655a15 100644
--- a/cub/cub/iterator/transform_input_iterator.cuh
+++ b/cub/cub/iterator/transform_input_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,12 +33,15 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <iostream>
 
 #include "../thread/thread_load.cuh"
 #include "../thread/thread_store.cuh"
-#include "../config.cuh"
 
 #if (THRUST_VERSION >= 100700)
     // This iterator is compatible with Thrust API 1.7 and newer
diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh
index 0a8456898ec..0127e899d81 100644
--- a/cub/cub/thread/thread_load.cuh
+++ b/cub/cub/thread/thread_load.cuh
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 
-#include "../config.cuh"
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh
index c6b097ea354..f1f9ba460a1 100644
--- a/cub/cub/thread/thread_operators.cuh
+++ b/cub/cub/thread/thread_operators.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,10 +12,10 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -37,7 +37,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_cpp_dialect.cuh>
 #include <cub/util_type.cuh>
 
@@ -164,8 +167,8 @@ struct ArgMax
              const KeyValuePair<OffsetT, T> &b) const
   {
     // Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
-    // return ((b.value > a.value) || 
-    //         ((a.value == b.value) && (b.key < a.key))) 
+    // return ((b.value > a.value) ||
+    //         ((a.value == b.value) && (b.key < a.key)))
     //      ? b : a;
 
     if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
@@ -282,14 +285,14 @@ public:
 /**
  * @brief Reduce-by-segment functor.
  *
- * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative 
- * combining operator `f(const T &x, const T &y)`, an instance of this functor 
- * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose 
- * `value` field is either `b.value` if `b.key` is non-zero, or 
+ * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative
+ * combining operator `f(const T &x, const T &y)`, an instance of this functor
+ * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose
+ * `value` field is either `b.value` if `b.key` is non-zero, or
  * `f(a.value, b.value)` otherwise.
  *
- * ReduceBySegmentOp is an associative, non-commutative binary combining 
- * operator for input sequences of cub::KeyValuePair pairings. Such sequences 
+ * ReduceBySegmentOp is an associative, non-commutative binary combining
+ * operator for input sequences of cub::KeyValuePair pairings. Such sequences
  * are typically used to represent a segmented set of values to be reduced
  * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
  * first value of each segment.
@@ -348,7 +351,7 @@ struct ReduceBySegmentOp
     // else {
     //   The second partial reduction does not span a reset, so accumulate both
     //   into the running aggregate
-    // } 
+    // }
     retval.value = (second.key) ? second.value : op(first.value, second.value);
 #endif
     return retval;
diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh
index 82042a17667..55b15db8143 100644
--- a/cub/cub/thread/thread_reduce.cuh
+++ b/cub/cub/thread/thread_reduce.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,9 +33,12 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../thread/thread_operators.cuh"
 #include "../detail/type_traits.cuh"
-#include "../config.cuh"
 
 CUB_NAMESPACE_BEGIN
 
@@ -50,7 +53,7 @@ template <
     typename    T,
     typename    ReductionOp,
     typename    PrefixT,
-    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>> 
+    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
 __device__ __forceinline__ AccumT ThreadReduce(
     T*                  input,                  ///< [in] Input array
     ReductionOp         reduction_op,           ///< [in] Binary reduction operator
@@ -79,7 +82,7 @@ template <
     typename    T,
     typename    ReductionOp,
     typename    PrefixT,
-    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>> 
+    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
 __device__ __forceinline__ AccumT ThreadReduce(
     T*          input,                  ///< [in] Input array
     ReductionOp reduction_op,           ///< [in] Binary reduction operator
@@ -121,7 +124,7 @@ template <
     typename    T,
     typename    ReductionOp,
     typename    PrefixT,
-    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>> 
+    typename    AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
 __device__ __forceinline__ AccumT ThreadReduce(
     T           (&input)[LENGTH],       ///< [in] Input array
     ReductionOp reduction_op,           ///< [in] Binary reduction operator
diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh
index b5e42710fcd..7b051d88e6c 100644
--- a/cub/cub/thread/thread_scan.cuh
+++ b/cub/cub/thread/thread_scan.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../thread/thread_operators.cuh"
 
 CUB_NAMESPACE_BEGIN
diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh
index 62b3cdb4e5d..86bdb4a59ec 100644
--- a/cub/cub/thread/thread_search.cuh
+++ b/cub/cub/thread/thread_search.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,10 +33,13 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <iterator>
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>
-#include <cub/config.cuh>
 
 #include <nv/target>
 
diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
index 5d486789684..68280db8dc6 100644
--- a/cub/cub/thread/thread_sort.cuh
+++ b/cub/cub/thread/thread_sort.cuh
@@ -28,6 +28,9 @@
 #pragma once
 
 #include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../util_ptx.cuh"
 #include "../util_type.cuh"
 
diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh
index d1f055df158..6ccbb9d8819 100644
--- a/cub/cub/thread/thread_store.cuh
+++ b/cub/cub/thread/thread_store.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh
index 7dc12fb920e..b0f3d22ec6c 100644
--- a/cub/cub/util_allocator.cuh
+++ b/cub/cub/util_allocator.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "util_namespace.cuh"
 #include "util_debug.cuh"
 
@@ -453,13 +457,13 @@ struct CachingDeviceAllocator
             if (device != entrypoint_device)
             {
                 error = CubDebug(cudaGetDevice(&entrypoint_device));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     return error;
                 }
 
                 error = CubDebug(cudaSetDevice(device));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     return error;
                 }
@@ -491,13 +495,13 @@ struct CachingDeviceAllocator
 
                     // Free device memory and destroy stream event.
                     error = CubDebug(cudaFree(block_itr->d_ptr));
-                    if (cudaSuccess != error) 
+                    if (cudaSuccess != error)
                     {
                         break;
                     }
 
                     error = CubDebug(cudaEventDestroy(block_itr->ready_event));
-                    if (cudaSuccess != error) 
+                    if (cudaSuccess != error)
                     {
                         break;
                     }
@@ -519,7 +523,7 @@ struct CachingDeviceAllocator
 
                 // Try to allocate again
                 error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     return error;
                 }
@@ -547,7 +551,7 @@ struct CachingDeviceAllocator
             if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
             {
                 error = CubDebug(cudaSetDevice(entrypoint_device));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     return error;
                 }
@@ -639,13 +643,13 @@ struct CachingDeviceAllocator
         if (device != entrypoint_device)
         {
             error = CubDebug(cudaGetDevice(&entrypoint_device));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
 
             error = CubDebug(cudaSetDevice(device));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
@@ -655,7 +659,7 @@ struct CachingDeviceAllocator
         {
             // Insert the ready event in the associated stream (must have current device set properly)
             error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
@@ -665,13 +669,13 @@ struct CachingDeviceAllocator
         {
             // Free the allocation from the runtime and cleanup the event.
             error = CubDebug(cudaFree(d_ptr));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
 
             error = CubDebug(cudaEventDestroy(search_key.ready_event));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
@@ -684,7 +688,7 @@ struct CachingDeviceAllocator
         if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
         {
             error = CubDebug(cudaSetDevice(entrypoint_device));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 return error;
             }
@@ -728,7 +732,7 @@ struct CachingDeviceAllocator
             if (entrypoint_device == INVALID_DEVICE_ORDINAL)
             {
                 error = CubDebug(cudaGetDevice(&entrypoint_device));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
@@ -738,7 +742,7 @@ struct CachingDeviceAllocator
             if (begin->device != current_device)
             {
                 error = CubDebug(cudaSetDevice(begin->device));
-                if (cudaSuccess != error) 
+                if (cudaSuccess != error)
                 {
                     break;
                 }
@@ -747,13 +751,13 @@ struct CachingDeviceAllocator
 
             // Free device memory
             error = CubDebug(cudaFree(begin->d_ptr));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
 
             error = CubDebug(cudaEventDestroy(begin->ready_event));
-            if (cudaSuccess != error) 
+            if (cudaSuccess != error)
             {
                 break;
             }
diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh
index d2506e93cfc..4d8608e8e88 100644
--- a/cub/cub/util_arch.cuh
+++ b/cub/cub/util_arch.cuh
@@ -33,9 +33,13 @@
 
 #pragma once
 
-#include <cub/util_cpp_dialect.cuh>
-#include <cub/util_namespace.cuh>
-#include <cub/util_macro.cuh>
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
+#include "util_cpp_dialect.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
 
 // Legacy include; this functionality used to be defined in here.
 #include <cub/detail/detect_cuda_runtime.cuh>
@@ -44,7 +48,7 @@ CUB_NAMESPACE_BEGIN
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
 
-// \deprecated [Since 2.1.0] 
+// \deprecated [Since 2.1.0]
 #define CUB_USE_COOPERATIVE_GROUPS
 
 /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh
index 7cda3c44012..4acf6ba83b5 100644
--- a/cub/cub/util_compiler.cuh
+++ b/cub/cub/util_compiler.cuh
@@ -32,6 +32,11 @@
 
 #pragma once
 
+// For `_CCCL_IMPLICIT_SYSTEM_HEADER`
+#include <cuda/std/detail/__config>
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 // enumerate host compilers we know about
 #define CUB_HOST_COMPILER_UNKNOWN 0
 #define CUB_HOST_COMPILER_MSVC 1
diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh
index 23adf8e8dc7..1b1afb53cf8 100644
--- a/cub/cub/util_cpp_dialect.cuh
+++ b/cub/cub/util_cpp_dialect.cuh
@@ -31,6 +31,10 @@
 
 #pragma once
 
+#include <cuda/std/detail/__config>
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "util_compiler.cuh"
 
 // Deprecation warnings may be silenced by defining the following macros. These
diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh
index 5023524dd6e..6584afb5172 100644
--- a/cub/cub/util_debug.cuh
+++ b/cub/cub/util_debug.cuh
@@ -36,8 +36,9 @@
 
 #pragma once
 
-#include <cub/util_namespace.cuh>
-#include <cub/util_arch.cuh>
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
 #include <nv/target>
 
@@ -58,8 +59,8 @@ CUB_NAMESPACE_BEGIN
 /**
  * @def CUB_DEBUG_SYNC
  *
- * Causes synchronization of the stream after every kernel launch to check 
- * for errors. Also causes kernel launch configurations to be printed to the 
+ * Causes synchronization of the stream after every kernel launch to check
+ * for errors. Also causes kernel launch configurations to be printed to the
  * console.
  */
 #define CUB_DEBUG_SYNC
@@ -67,7 +68,7 @@ CUB_NAMESPACE_BEGIN
 /**
  * @def CUB_DEBUG_HOST_ASSERTIONS
  *
- * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition 
+ * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition
  * assertions.
  */
 #define CUB_DEBUG_HOST_ASSERTIONS
@@ -75,7 +76,7 @@ CUB_NAMESPACE_BEGIN
 /**
  * @def CUB_DEBUG_DEVICE_ASSERTIONS
  *
- * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side 
+ * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side
  * precondition assertions.
  */
 #define CUB_DEBUG_DEVICE_ASSERTIONS
@@ -83,14 +84,14 @@ CUB_NAMESPACE_BEGIN
 /**
  * @def CUB_DEBUG_ALL
  *
- * Causes host and device-side precondition assertions to be checked. Apart 
- * from that, causes synchronization of the stream after every kernel launch to 
- * check for errors. Also causes kernel launch configurations to be printed to 
+ * Causes host and device-side precondition assertions to be checked. Apart
+ * from that, causes synchronization of the stream after every kernel launch to
+ * check for errors. Also causes kernel launch configurations to be printed to
  * the console.
  */
 #define CUB_DEBUG_ALL
 
-#endif // DOXYGEN_SHOULD_SKIP_THIS 
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
 /**
  * \addtogroup UtilMgmt
@@ -132,7 +133,7 @@ CUB_NAMESPACE_BEGIN
 
 // All
 #ifdef CUB_DEBUG_ALL
-#define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL 
+#define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL
 #endif
 
 // Default case, no extra debugging:
@@ -196,11 +197,11 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line)
   cudaError_t last_error = cudaSuccess;
 
   NV_IF_TARGET(
-    NV_IS_HOST, 
+    NV_IS_HOST,
     (last_error = cudaGetLastError();),
     (CUB_TEMP_DEVICE_CODE;)
   );
-  
+
   #undef CUB_TEMP_DEVICE_CODE
   // clang-format on
 
diff --git a/cub/cub/util_deprecated.cuh b/cub/cub/util_deprecated.cuh
index a988c9fca90..842f3560146 100644
--- a/cub/cub/util_deprecated.cuh
+++ b/cub/cub/util_deprecated.cuh
@@ -32,13 +32,13 @@
 
 #pragma once
 
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
 #include <cub/detail/type_traits.cuh>
-#include <cub/util_compiler.cuh>
-#include <cub/util_cpp_dialect.cuh>
 #include <cub/util_debug.cuh>
 
-
 #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API)
 #  define CUB_IGNORE_DEPRECATED_API
 #endif
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index c7e15cafe06..dd7bfaace9f 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -37,14 +37,14 @@
 
 #pragma once
 
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cuda/std/utility>
 
 #include <cub/detail/device_synchronize.cuh>
-#include <cub/util_arch.cuh>
-#include <cub/util_cpp_dialect.cuh>
 #include <cub/util_debug.cuh>
-#include <cub/util_macro.cuh>
-#include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>
 // for backward compatibility
 #include <cub/util_temporary_storage.cuh>
@@ -412,7 +412,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d
     {
         int major = 0, minor = 0;
         error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
-        if (cudaSuccess != error) 
+        if (cudaSuccess != error)
         {
             break;
         }
@@ -544,7 +544,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
     cudaError_t error = cudaSuccess;
     int device = -1;
     error = CubDebug(cudaGetDevice(&device));
-    if (cudaSuccess != error) 
+    if (cudaSuccess != error)
     {
         return error;
     }
diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh
index d8f46f09075..720d4918606 100644
--- a/cub/cub/util_macro.cuh
+++ b/cub/cub/util_macro.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -32,6 +32,10 @@
 
 #pragma once
 
+#include "version.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/detect_cuda_runtime.cuh>
 #include <cub/util_namespace.cuh>
 
@@ -125,7 +129,7 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t,
  */
 #if !defined(CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION)
 _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes")
-_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes")                      
+_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes")
 #endif
 
 /** @} */       // end group UtilModule
diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh
index d69fc2ee2d5..8d27c26003e 100644
--- a/cub/cub/util_math.cuh
+++ b/cub/cub/util_math.cuh
@@ -32,10 +32,11 @@
 
 #pragma once
 
-#include <type_traits>
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
-#include "util_namespace.cuh"
-#include "util_macro.cuh"
+#include <type_traits>
 
 CUB_NAMESPACE_BEGIN
 
diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh
index 27ff12dbbaa..7289ebe02b4 100644
--- a/cub/cub/util_namespace.cuh
+++ b/cub/cub/util_namespace.cuh
@@ -38,7 +38,10 @@
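-// This is not used by this file; this is a hack so that we can detect the
-// CUB version from Thrust on older versions of CUB that did not have
-// version.cuh.
-#include <cub/version.cuh>
+// Provides `_CCCL_IMPLICIT_SYSTEM_HEADER`, which is used below. Including it
+// here also lets Thrust detect the CUB version on older versions of CUB that
+// did not have version.cuh.
+#include "version.cuh"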
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/detail/detect_cuda_runtime.cuh>
 
 // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users
diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh
index ff6fdb07f50..425eec2e7a0 100644
--- a/cub/cub/util_ptx.cuh
+++ b/cub/cub/util_ptx.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,14 +34,14 @@
 
 #pragma once
 
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
+#include "config.cuh"
 
+_CCCL_IMPLICIT_SYSTEM_HEADER
 
-CUB_NAMESPACE_BEGIN
+#include "util_debug.cuh"
+#include "util_type.cuh"
 
+CUB_NAMESPACE_BEGIN
 
 /**
  * \addtogroup UtilPtx
@@ -163,7 +163,7 @@ __device__ __forceinline__ unsigned int BFE(
     return (source >> bit_start) & MASK;
 }
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 /**
  * Bitfield-extract for 128-bit types.
  */
@@ -328,7 +328,7 @@ __device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_m
 /**
  * Warp synchronous shfl_up
  */
-__device__ __forceinline__ 
+__device__ __forceinline__
 unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
 {
     asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
@@ -339,7 +339,7 @@ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned
 /**
  * Warp synchronous shfl_down
  */
-__device__ __forceinline__ 
+__device__ __forceinline__
 unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
 {
     asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
@@ -350,7 +350,7 @@ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsign
 /**
  * Warp synchronous shfl_idx
  */
-__device__ __forceinline__ 
+__device__ __forceinline__
 unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
 {
     asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
@@ -361,7 +361,7 @@ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned
 /**
  * Warp synchronous shfl_idx
  */
-__device__ __forceinline__ 
+__device__ __forceinline__
 unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask)
 {
     return __shfl_sync(member_mask, word, src_lane);
@@ -395,7 +395,7 @@ __device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
  */
 __device__ __forceinline__ void ThreadExit() {
     asm volatile("exit;");
-}    
+}
 
 
 /**
@@ -561,7 +561,7 @@ __device__ __forceinline__ T ShuffleUp(
     typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
 
     constexpr int   WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
- 
+
     T               output;
     ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
     ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
@@ -728,26 +728,26 @@ __device__ __forceinline__ T ShuffleIndex(
 
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-namespace detail 
+namespace detail
 {
 
-/** 
- * Implementation detail for `MatchAny`. It provides specializations for full and partial warps. 
- * For partial warps, inactive threads must be masked out. This is done in the partial warp 
- * specialization below. 
+/**
+ * Implementation detail for `MatchAny`. It provides specializations for full and partial warps.
+ * For partial warps, inactive threads must be masked out. This is done in the partial warp
+ * specialization below.
  * Usage:
  * ```
- * // returns a mask of threads with the same 4 least-significant bits of `label` 
+ * // returns a mask of threads with the same 4 least-significant bits of `label`
  * // in a warp with 16 active threads
- * warp_matcher_t<4, 16>::match_any(label); 
+ * warp_matcher_t<4, 16>::match_any(label);
  *
- * // returns a mask of threads with the same 4 least-significant bits of `label` 
+ * // returns a mask of threads with the same 4 least-significant bits of `label`
  * // in a warp with 32 active threads (no extra work is done)
- * warp_matcher_t<4, 32>::match_any(label); 
+ * warp_matcher_t<4, 32>::match_any(label);
  * ```
  */
 template <int LABEL_BITS, int WARP_ACTIVE_THREADS>
-struct warp_matcher_t 
+struct warp_matcher_t
 {
 
   static __device__ unsigned int match_any(unsigned int label)
@@ -758,7 +758,7 @@ struct warp_matcher_t
 };
 
 template <int LABEL_BITS>
-struct warp_matcher_t<LABEL_BITS, CUB_PTX_WARP_THREADS> 
+struct warp_matcher_t<LABEL_BITS, CUB_PTX_WARP_THREADS>
 {
 
   // match.any.sync.b32 is slower when matching a few bits
diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh
index 588c554a32f..c5695ff56f9 100644
--- a/cub/cub/util_temporary_storage.cuh
+++ b/cub/cub/util_temporary_storage.cuh
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_debug.cuh>
 #include <cub/util_namespace.cuh>
 
diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh
index 2beecbc892c..bb207a4972f 100644
--- a/cub/cub/util_type.cuh
+++ b/cub/cub/util_type.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -33,6 +33,10 @@
 
 #pragma once
 
+#include "config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cfloat>
 #include <iostream>
 #include <iterator>
@@ -48,11 +52,6 @@
 #endif
 
 #include <cub/detail/uninitialized_copy.cuh>
-#include <cub/util_arch.cuh>
-#include <cub/util_compiler.cuh>
-#include <cub/util_deprecated.cuh>
-#include <cub/util_macro.cuh>
-#include <cub/util_namespace.cuh>
 
 #include <cuda/std/type_traits>
 
@@ -1220,7 +1219,7 @@ template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTE
 template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
 
 
-#if CUB_IS_INT128_ENABLED 
+#if CUB_IS_INT128_ENABLED
 template <>
 struct NumericTraits<__uint128_t>
 {
diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh
index d51023912fa..2ad82bb4468 100644
--- a/cub/cub/version.cuh
+++ b/cub/cub/version.cuh
@@ -35,6 +35,11 @@
 
 #pragma once
 
+// For `_CCCL_IMPLICIT_SYSTEM_HEADER`
+#include <cuda/std/detail/__config>
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 /*! \def CUB_VERSION
  *  \brief The preprocessor macro \p CUB_VERSION encodes the version
  *         number of the CUB library.
diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh
index fa73509b319..f7d214f8b2f 100644
--- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh
@@ -27,7 +27,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
@@ -51,7 +54,7 @@ class WarpExchangeShfl
   static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0);
 
   // concrete recursion class
-  template <typename OutputT, int IDX, int SIZE> 
+  template <typename OutputT, int IDX, int SIZE>
   class CompileTimeArray : protected CompileTimeArray<OutputT, IDX + 1, SIZE>
   {
   protected:
@@ -239,7 +242,7 @@ class WarpExchangeShfl
   };
 
   // terminating partial specialization
-  template <typename OutputT, int SIZE> 
+  template <typename OutputT, int SIZE>
   class CompileTimeArray<OutputT, SIZE, SIZE>
   {
   protected:
diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh
index f1c2edce60f..372e1def2b0 100644
--- a/cub/cub/warp/specializations/warp_exchange_smem.cuh
+++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
index ad6d2512f92..fabea446673 100644
--- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../thread/thread_operators.cuh"
 #include "../../util_ptx.cuh"
 #include "../../util_type.cuh"
@@ -46,31 +49,31 @@
 CUB_NAMESPACE_BEGIN
 
 
-namespace detail 
+namespace detail
 {
 
 template <class A = int, class = A>
-struct reduce_add_exists : ::cuda::std::false_type 
+struct reduce_add_exists : ::cuda::std::false_type
 {};
 
 template <class T>
-struct reduce_add_exists<T, decltype(__reduce_add_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type 
+struct reduce_add_exists<T, decltype(__reduce_add_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
 {};
 
 template <class T = int, class = T>
-struct reduce_min_exists : ::cuda::std::false_type 
+struct reduce_min_exists : ::cuda::std::false_type
 {};
 
 template <class T>
-struct reduce_min_exists<T, decltype(__reduce_min_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type 
+struct reduce_min_exists<T, decltype(__reduce_min_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
 {};
 
 template <class T = int, class = T>
-struct reduce_max_exists : ::cuda::std::false_type 
+struct reduce_max_exists : ::cuda::std::false_type
 {};
 
 template <class T>
-struct reduce_max_exists<T, decltype(__reduce_max_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type 
+struct reduce_max_exists<T, decltype(__reduce_max_sync(0xFFFFFFFF, T{}))> : ::cuda::std::true_type
 {};
 
 }
@@ -419,7 +422,7 @@ struct WarpReduceShfl
     //---------------------------------------------------------------------
     template <typename ReductionOp>
     __device__ __forceinline__ T ReduceImpl(
-        Int2Type<0>     /* all_lanes_valid */, 
+        Int2Type<0>     /* all_lanes_valid */,
         T               input,                  ///< [in] Calling thread's input
         int             valid_items,            ///< [in] Total number of valid items across the logical warp
         ReductionOp     reduction_op)           ///< [in] Binary reduction operator
@@ -436,7 +439,7 @@ struct WarpReduceShfl
 
     template <typename ReductionOp>
     __device__ __forceinline__ T ReduceImpl(
-        Int2Type<1>     /* all_lanes_valid */, 
+        Int2Type<1>     /* all_lanes_valid */,
         T               input,                  ///< [in] Calling thread's input
         int             /* valid_items */,      ///< [in] Total number of valid items across the logical warp
         ReductionOp     reduction_op)           ///< [in] Binary reduction operator
@@ -452,7 +455,7 @@ struct WarpReduceShfl
     }
 
     template <class U = T>
-    __device__ __forceinline__ 
+    __device__ __forceinline__
     typename std::enable_if<
                (std::is_same<int, U>::value || std::is_same<unsigned int, U>::value)
             && detail::reduce_add_exists<>::value, T>::type
@@ -474,7 +477,7 @@ struct WarpReduceShfl
     }
 
     template <class U = T>
-    __device__ __forceinline__ 
+    __device__ __forceinline__
     typename std::enable_if<
                (std::is_same<int, U>::value || std::is_same<unsigned int, U>::value)
             && detail::reduce_min_exists<>::value, T>::type
@@ -496,7 +499,7 @@ struct WarpReduceShfl
     }
 
     template <class U = T>
-    __device__ __forceinline__ 
+    __device__ __forceinline__
     typename std::enable_if<
                (std::is_same<int, U>::value || std::is_same<unsigned int, U>::value)
             && detail::reduce_max_exists<>::value, T>::type
diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh
index 7ffb73a3c27..fc86802cb53 100644
--- a/cub/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../thread/thread_operators.cuh"
 #include "../../thread/thread_load.cuh"
 #include "../../thread/thread_store.cuh"
diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh
index c2988711c8d..cb452c40967 100644
--- a/cub/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../thread/thread_operators.cuh"
 #include "../../util_type.cuh"
 #include "../../util_ptx.cuh"
diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh
index f5290e979a4..a7768233aa4 100644
--- a/cub/cub/warp/specializations/warp_scan_smem.cuh
+++ b/cub/cub/warp/specializations/warp_scan_smem.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -34,6 +34,9 @@
 #pragma once
 
 #include "../../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include "../../thread/thread_operators.cuh"
 #include "../../thread/thread_load.cuh"
 #include "../../thread/thread_store.cuh"
diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh
index e863f67b103..798b3bbacba 100644
--- a/cub/cub/warp/warp_exchange.cuh
+++ b/cub/cub/warp/warp_exchange.cuh
@@ -33,7 +33,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/specializations/warp_exchange_shfl.cuh>
diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh
index 424145588c2..0d917f953f8 100644
--- a/cub/cub/warp/warp_load.cuh
+++ b/cub/cub/warp/warp_load.cuh
@@ -29,8 +29,11 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/block/block_load.cuh>
-#include <cub/config.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh
index 3ad5dccd9c4..7f7beb7796b 100644
--- a/cub/cub/warp/warp_merge_sort.cuh
+++ b/cub/cub/warp/warp_merge_sort.cuh
@@ -27,8 +27,11 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/block/block_merge_sort.cuh>
-#include <cub/config.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 
diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh
index 2901f56bbcc..7b636fa3c46 100644
--- a/cub/cub/warp/warp_reduce.cuh
+++ b/cub/cub/warp/warp_reduce.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/specializations/warp_reduce_shfl.cuh>
diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh
index feff7ffe69d..0d4bb31ff4e 100644
--- a/cub/cub/warp/warp_scan.cuh
+++ b/cub/cub/warp/warp_scan.cuh
@@ -34,7 +34,10 @@
 
 #pragma once
 
-#include <cub/config.cuh>
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/specializations/warp_scan_shfl.cuh>
diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh
index d40e63e65ee..cbb426aa68f 100644
--- a/cub/cub/warp/warp_store.cuh
+++ b/cub/cub/warp/warp_store.cuh
@@ -29,8 +29,11 @@
 
 #pragma once
 
+#include "../config.cuh"
+
+_CCCL_IMPLICIT_SYSTEM_HEADER
+
 #include <cub/block/block_store.cuh>
-#include <cub/config.cuh>
 #include <cub/util_ptx.cuh>
 #include <cub/util_type.cuh>
 #include <cub/warp/warp_exchange.cuh>
diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h
index 439e25e2de0..34289f21810 100644
--- a/thrust/thrust/system/cuda/detail/util.h
+++ b/thrust/thrust/system/cuda/detail/util.h
@@ -38,7 +38,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 #include <thrust/system/cuda/error.h>
 
 #include <cub/detail/device_synchronize.cuh>
-#include <cub/util_arch.cuh>
+#include <cub/config.cuh>
 #include <cub/util_device.cuh>
 
 #include <nv/target>