Add _CCCL_IMPLICIT_SYSTEM_HEADER to cub headers
miscco committed Oct 11, 2023
1 parent b283432 commit b7e6717
Showing 142 changed files with 3,402 additions and 2,988 deletions.
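Every file in this commit follows the same pattern: the header's include of "../config.cuh" (or <cub/config.cuh>) is hoisted to the top, immediately after #pragma once, and _CCCL_IMPLICIT_SYSTEM_HEADER is invoked right after it, before any other includes. The macro itself is not defined in this diff; it presumably comes from config.cuh (directly or via the headers it pulls in), which is why that include must now appear first. As a rough sketch of what such a macro typically does (an assumption, not the actual CCCL definition), it expands to a compiler-specific pragma that makes the rest of the header behave like a system header, so user builds with strict warning flags do not see warnings from CUB internals:

// Illustrative sketch only -- the real definition lives in the CCCL/CUB
// configuration headers and is not shown in this commit.
// GCC and Clang honor _Pragma("GCC system_header"); other compilers would
// simply get an empty expansion here.
#if defined(__GNUC__) || defined(__clang__)
#  define _CCCL_IMPLICIT_SYSTEM_HEADER _Pragma("GCC system_header")
#else
#  define _CCCL_IMPLICIT_SYSTEM_HEADER
#endif

Under that assumption, the new prologue shown in each file below marks everything that follows as a system header on GCC/Clang and is a no-op elsewhere.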
5 changes: 4 additions & 1 deletion cub/cub/agent/agent_adjacent_difference.cuh
@@ -28,6 +28,9 @@
#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
@@ -159,7 +162,7 @@ struct AgentDifference
}
else
{
InputT tile_prev_input = MayAlias
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

4 changes: 4 additions & 0 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -33,6 +33,10 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_histogram.cuh
@@ -33,11 +33,14 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../config.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

@@ -301,8 +304,8 @@ struct AgentHistogram
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_bins = num_privatized_bins[CHANNEL];
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
privatized_bin += BLOCK_THREADS)
{
int output_bin = -1;
@@ -631,7 +634,7 @@ struct AgentHistogram
// Consume a partially-full tile at the end of the row
OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
}
}
else
{
// Consume full tile
3 changes: 3 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
@@ -28,6 +28,9 @@
#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
15 changes: 9 additions & 6 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -35,6 +35,10 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <stdint.h>
#include <type_traits>

@@ -44,7 +48,6 @@
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/config.cuh>
#include <cub/util_type.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>

@@ -135,7 +138,7 @@ struct AgentRadixSortDownsweep
using ValuesItr = CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>;

// Radix ranking type to use
using BlockRadixRankT =
using BlockRadixRankT =
cub::detail::block_radix_rank_t<
RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>;

@@ -202,7 +205,7 @@ struct AgentRadixSortDownsweep
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
OffsetT bin_offset[BINS_TRACKED_PER_THREAD];

std::uint32_t current_bit;
std::uint32_t current_bit;
std::uint32_t num_bits;

// Whether to short-circuit
@@ -488,15 +491,15 @@ struct AgentRadixSortDownsweep
OffsetT relative_bin_offsets[ITEMS_PER_THREAD];

// Assign default (min/max) value to all keys
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
: traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(
keys,
block_offset,
valid_items,
valid_items,
default_key,
Int2Type<FULL_TILE>(),
Int2Type<LOAD_WARP_STRIPED>());
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -34,9 +34,12 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../thread/thread_reduce.cuh"
#include "../util_math.cuh"
#include "../util_type.cuh"
@@ -120,7 +123,7 @@ struct AgentRadixSortHistogram
// thread fields
// shared memory storage
_TempStorage& s;

// bins for the histogram
OffsetT* d_bins_out;

@@ -175,7 +178,7 @@ struct AgentRadixSortHistogram
}

__device__ __forceinline__
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
{
// tile_offset < num_items always, hence the line below works
bool full_tile = num_items - tile_offset >= TILE_ITEMS;
@@ -264,7 +267,7 @@ struct AgentRadixSortHistogram
AccumulateSharedHistograms(tile_offset, keys);
}
CTA_SYNC();

// Accumulate the result in global memory.
AccumulateGlobalHistograms();
CTA_SYNC();
29 changes: 16 additions & 13 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -33,10 +33,13 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../block/block_radix_rank.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../block/block_store.cuh"
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"

@@ -64,7 +67,7 @@ template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
@@ -106,7 +109,7 @@ struct AgentRadixSortOnesweep
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS,
FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS,
WARP_THREADS = CUB_PTX_WARP_THREADS,
@@ -127,7 +130,7 @@ struct AgentRadixSortOnesweep
typename traits::template digit_extractor_t<fundamental_digit_extractor_t, DecomposerT>;

typedef PortionOffsetT AtomicOffsetT;

static constexpr RadixRankAlgorithm RANK_ALGORITHM =
AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM =
@@ -224,7 +227,7 @@ struct AgentRadixSortOnesweep
__device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
for (int u = 0; u < BINS_PER_THREAD; ++u)
for (int u = 0; u < BINS_PER_THREAD; ++u)
{
int bin = ThreadBin(u);
if (FULL_BINS || bin < RADIX_DIGITS)
@@ -260,7 +263,7 @@ struct AgentRadixSortOnesweep
agent.TryShortCircuit(keys, bins);
}
};

__device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
@@ -452,7 +455,7 @@ struct AgentRadixSortOnesweep
{
s.global_offsets[bin] = d_bins_in[bin] - offsets[u];
}
}
}
}

__device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD],
@@ -512,7 +515,7 @@ struct AgentRadixSortOnesweep
constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS;
constexpr int ALIGN = 8;
constexpr auto CACHE_MODIFIER = STORE_CG;

int warp_start = warp * ITEMS_PER_WARP;
int warp_end = (warp + 1) * ITEMS_PER_WARP;
int warp_offset = warp_start;
@@ -596,19 +599,19 @@ struct AgentRadixSortOnesweep
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
ComputeKeyDigits(digits);

// load values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);

// scatter values
CTA_SYNC();
ScatterValuesShared(values, ranks);

CTA_SYNC();
ScatterValuesGlobal(digits);
}


__device__ __forceinline__ void GatherScatterValues(
int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
@@ -628,7 +631,7 @@
BlockRadixRankT(s.rank_temp_storage).RankKeys(
keys, ranks, digit_extractor(), exclusive_digit_prefix,
CountsCallback(*this, bins, keys));

// scatter keys in shared memory
CTA_SYNC();
ScatterKeysShared(keys, ranks);
@@ -637,7 +640,7 @@
LoadBinsToOffsetsGlobal(exclusive_digit_prefix);
LookbackGlobal(bins);
UpdateBinsGlobal(bins, exclusive_digit_prefix);

// scatter keys in global memory
CTA_SYNC();
ScatterKeysGlobal();
9 changes: 6 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -33,12 +33,15 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../warp/warp_reduce.cuh"
#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

@@ -321,7 +324,7 @@ struct AgentRadixSortUpsweep
const OffsetT &block_end)
{
// Process partial tile if necessary using single loads
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
{
// Load and bucket key
bit_ordered_type key = d_keys_in[block_offset + offset];
@@ -346,7 +349,7 @@
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<const bit_ordered_type*>(d_keys_in)),
current_bit(current_bit),
current_bit(current_bit),
num_bits(num_bits),
decomposer(decomposer)
{}
17 changes: 10 additions & 7 deletions cub/cub/agent/agent_reduce.cuh
@@ -13,9 +13,9 @@
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,17 +27,20 @@
******************************************************************************/

/**
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* blocks for participating in device-wide reduction.
*/

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <iterator>

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/config.cuh>
#include <cub/detail/type_traits.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/grid/grid_mapping.cuh>
@@ -368,7 +371,7 @@ struct AgentReduce
.Reduce(thread_aggregate, reduction_op, valid_items);
}

// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// the block reduction below. This also works around a hang in nvcc.
ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

@@ -439,7 +442,7 @@ private:

even_share.block_offset += even_share.block_stride;

// Consume subsequent full tiles of input, at least one full tile was processed, so
// Consume subsequent full tiles of input, at least one full tile was processed, so
// `even_share.block_end >= TILE_ITEMS`
while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
{