NVIDIA · gevtushenko · Aug 3, 2022 · Jun 13, 2022 · Jun 13, 2022 · Jun 15, 2022
diff --git a/cub/agent/agent_reduce_by_key.cuh b/cub/agent/agent_reduce_by_key.cuh
diff --git a/cub/agent/agent_scan.cuh b/cub/agent/agent_scan.cuh
diff --git a/cub/agent/agent_scan_by_key.cuh b/cub/agent/agent_scan_by_key.cuh
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
@@ -35,11 +35,12 @@
 
 #include <iterator>
 
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../config.cuh"
-#include "../util_device.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/uninitialized_copy.cuh>
+#include <cub/thread/thread_load.cuh>
+#include <cub/thread/thread_store.cuh>
+#include <cub/util_device.cuh>
+#include <cub/warp/warp_reduce.cuh>
 
 CUB_NAMESPACE_BEGIN
 
@@ -738,8 +739,10 @@ struct TilePrefixCallbackOp
         // Update our status with our tile-aggregate
         if (threadIdx.x == 0)
         {
-            temp_storage.block_aggregate = block_aggregate;
-            tile_status.SetPartial(tile_idx, block_aggregate);
+          detail::uninitialized_copy(&temp_storage.block_aggregate,
+                                     block_aggregate);
+
+          tile_status.SetPartial(tile_idx, block_aggregate);
         }
 
         int         predecessor_idx = tile_idx - threadIdx.x - 1;
@@ -768,8 +771,11 @@ struct TilePrefixCallbackOp
             inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
             tile_status.SetInclusive(tile_idx, inclusive_prefix);
 
-            temp_storage.exclusive_prefix = exclusive_prefix;
-            temp_storage.inclusive_prefix = inclusive_prefix;
+            detail::uninitialized_copy(&temp_storage.exclusive_prefix,
+                                       exclusive_prefix);
+
+            detail::uninitialized_copy(&temp_storage.inclusive_prefix,
+                                       inclusive_prefix);
         }
 
         // Return exclusive_prefix

diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh
@@ -33,10 +33,11 @@
 
 #pragma once
 
-#include "../config.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../warp/warp_exchange.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/uninitialized_copy.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>
+#include <cub/warp/warp_exchange.cuh>
 
 CUB_NAMESPACE_BEGIN
 
@@ -209,7 +210,8 @@ private:
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         CTA_SYNC();
@@ -250,7 +252,8 @@ private:
                 {
                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
+                    detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                               input_items[ITEM]);
                 }
             }
 
@@ -298,7 +301,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         WARP_SYNC(0xffffffff);
@@ -328,7 +332,8 @@ private:
             {
                 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                temp_storage.buff[item_offset] = input_items[ITEM];
+                detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                           input_items[ITEM]);
             }
 
             WARP_SYNC(0xffffffff);
@@ -354,7 +359,8 @@ private:
                 {
                     int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
+                    detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                               input_items[ITEM]);
                 }
 
                 WARP_SYNC(0xffffffff);
@@ -385,7 +391,8 @@ private:
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         CTA_SYNC();
@@ -434,7 +441,9 @@ private:
                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                     {
                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_storage.buff[item_offset] = input_items[ITEM];
+                        detail::uninitialized_copy(temp_storage.buff +
+                                                     item_offset,
+                                                   input_items[ITEM]);
                     }
                 }
             }
@@ -476,7 +485,8 @@ private:
         {
             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         WARP_SYNC(0xffffffff);
@@ -486,7 +496,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
+            detail::uninitialized_copy(output_items + ITEM,
+                                       temp_storage.buff[item_offset]);
         }
     }
 
@@ -512,7 +523,8 @@ private:
                 {
                     int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
+                    detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                               input_items[ITEM]);
                 }
 
                 WARP_SYNC(0xffffffff);
@@ -544,7 +556,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage.buff[item_offset] = input_items[ITEM];
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         CTA_SYNC();
@@ -584,7 +597,8 @@ private:
                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                 {
                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage.buff[item_offset] = input_items[ITEM];
+                    detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                               input_items[ITEM]);
                 }
             }
 
@@ -626,7 +640,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage.buff[item_offset] = input_items[ITEM];
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
 
         CTA_SYNC();
@@ -668,7 +683,8 @@ private:
                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                 {
                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage.buff[item_offset] = input_items[ITEM];
+                    detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                               input_items[ITEM]);
                 }
             }
 

diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -33,9 +33,10 @@
 
 #pragma once
 
-#include "../../warp/warp_reduce.cuh"
-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/uninitialized_copy.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_reduce.cuh>
 
 CUB_NAMESPACE_BEGIN
 
@@ -143,7 +144,8 @@ struct BlockReduceWarpReductions
         // Share lane aggregates
         if (lane_id == 0)
         {
-            new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
+          detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                     warp_aggregate);
         }
 
         CTA_SYNC();

diff --git a/cub/block/specializations/block_scan_warp_scans.cuh b/cub/block/specializations/block_scan_warp_scans.cuh
@@ -33,9 +33,10 @@
 
 #pragma once
 
-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/uninitialized_copy.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_scan.cuh>
 
 CUB_NAMESPACE_BEGIN
 
@@ -151,7 +152,10 @@ struct BlockScanWarpScans
     {
         // Last lane in each warp shares its warp-aggregate
         if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+        {
+          detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                     warp_aggregate);
+        }
 
         CTA_SYNC();
 
@@ -293,9 +297,11 @@ struct BlockScanWarpScans
             T block_prefix = block_prefix_callback_op(block_aggregate);
             if (lane_id == 0)
             {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+              // Share the prefix with all threads
+              detail::uninitialized_copy(&temp_storage.block_prefix,
+                                         block_prefix);
+
+              exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
             }
         }
 
@@ -367,7 +373,8 @@ struct BlockScanWarpScans
             if (lane_id == 0)
             {
                 // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
+                detail::uninitialized_copy(&temp_storage.block_prefix,
+                                           block_prefix);
             }
         }
 

diff --git a/cub/detail/uninitialized_copy.cuh b/cub/detail/uninitialized_copy.cuh
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda/std/type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+
+namespace detail
+{
+
+template <typename T,
+          typename U,
+          typename cuda::std::enable_if<
+            cuda::std::is_trivially_copyable<T>::value, 
+            int
+          >::type = 0>
+__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  *ptr = cuda::std::forward<U>(val);
+}
+
+template <typename T, 
+         typename U,
+         typename cuda::std::enable_if<
+           !cuda::std::is_trivially_copyable<T>::value,
+           int
+         >::type = 0>
+__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  new (ptr) T(cuda::std::forward<U>(val));
+}
+
+} // namespace detail
+
+
+CUB_NAMESPACE_END
+