From a355c62afe28adbe8e1909574f64e7bff4f5f8ed Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 18 Dec 2023 21:28:20 -0800
Subject: [PATCH 01/21] CUB for each

---
 cub/benchmarks/bench/for_each/base.cu         |  79 ++
 cub/benchmarks/bench/for_each/copy.cu         |  79 ++
 cub/cub/agent/agent_for.cuh                   |  84 ++
 cub/cub/cub.cuh                               |   1 +
 cub/cub/device/device_for.cuh                 | 817 ++++++++++++++++++
 cub/cub/device/dispatch/dispatch_for.cuh      | 301 +++++++
 cub/cub/device/dispatch/tuning/tuning_for.cuh |  63 ++
 cub/docs/repo.toml                            |   2 +-
 cub/test/catch2_test_device_bulk.cu           |  76 ++
 cub/test/catch2_test_device_for.cu            | 239 +++++
 cub/test/catch2_test_device_for_api.cu        | 263 ++++++
 cub/test/catch2_test_device_for_copy.cu       | 180 ++++
 cub/test/catch2_test_device_for_utils.cu      | 102 +++
 thrust/benchmarks/bench/for_each/basic.cu     |  67 ++
 thrust/benchmarks/bench/tabulate/basic.cu     |  76 ++
 thrust/benchmarks/bench/transform/basic.cu    |  95 ++
 .../system/cuda/detail/async/for_each.h       |   5 +-
 .../system/cuda/detail/async/transform.h      |   4 +-
 thrust/thrust/system/cuda/detail/for_each.h   |  48 +-
 .../thrust/system/cuda/detail/parallel_for.h  | 111 +--
 20 files changed, 2551 insertions(+), 141 deletions(-)
 create mode 100644 cub/benchmarks/bench/for_each/base.cu
 create mode 100644 cub/benchmarks/bench/for_each/copy.cu
 create mode 100644 cub/cub/agent/agent_for.cuh
 create mode 100644 cub/cub/device/device_for.cuh
 create mode 100644 cub/cub/device/dispatch/dispatch_for.cuh
 create mode 100644 cub/cub/device/dispatch/tuning/tuning_for.cuh
 create mode 100644 cub/test/catch2_test_device_bulk.cu
 create mode 100644 cub/test/catch2_test_device_for.cu
 create mode 100644 cub/test/catch2_test_device_for_api.cu
 create mode 100644 cub/test/catch2_test_device_for_copy.cu
 create mode 100644 cub/test/catch2_test_device_for_utils.cu
 create mode 100644 thrust/benchmarks/bench/for_each/basic.cu
 create mode 100644 thrust/benchmarks/bench/tabulate/basic.cu
 create mode 100644 thrust/benchmarks/bench/transform/basic.cu

diff --git a/cub/benchmarks/bench/for_each/base.cu b/cub/benchmarks/bench/for_each/base.cu
new file mode 100644
index 00000000000..f6801a89c14
--- /dev/null
+++ b/cub/benchmarks/bench/for_each/base.cu
@@ -0,0 +1,79 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <nvbench_helper.cuh>
+
+template <class T>
+struct op_t
+{
+  int* d_count{};
+
+  __device__ void operator()(T val) const
+  {
+    if (val == T{})
+    {
+      atomicAdd(d_count, 1);
+    }
+  }
+};
+
+template <class T, class OffsetT>
+void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  using input_it_t  = const T*;
+  using output_it_t = int*;
+  using offset_t    = OffsetT;
+
+  const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));
+
+  thrust::device_vector<T> in(elements, T{42});
+
+  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
+  output_it_t d_out = nullptr;
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+
+  op_t<T> op{d_out};
+
+  std::size_t temp_size{};
+  cub::DeviceFor::ForEachN(nullptr, temp_size, d_in, elements, op);
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::DeviceFor::ForEachN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
+  });
+}
+
+NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/cub/benchmarks/bench/for_each/copy.cu b/cub/benchmarks/bench/for_each/copy.cu
new file mode 100644
index 00000000000..7dbb3cd9fac
--- /dev/null
+++ b/cub/benchmarks/bench/for_each/copy.cu
@@ -0,0 +1,79 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <nvbench_helper.cuh>
+
+template <class T>
+struct op_t
+{
+  int* d_count{};
+
+  __device__ void operator()(T val) const
+  {
+    if (val == T{})
+    {
+      atomicAdd(d_count, 1);
+    }
+  }
+};
+
+template <class T, class OffsetT>
+void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  using input_it_t  = const T*;
+  using output_it_t = int*;
+  using offset_t    = OffsetT;
+
+  const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));
+
+  thrust::device_vector<T> in(elements, T{42});
+
+  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
+  output_it_t d_out = nullptr;
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+
+  op_t<T> op{d_out};
+
+  std::size_t temp_size{};
+  cub::DeviceFor::ForEachCopyN(nullptr, temp_size, d_in, elements, op);
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::DeviceFor::ForEachCopyN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
+  });
+}
+
+NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/cub/cub/agent/agent_for.cuh b/cub/cub/agent/agent_for.cuh
new file mode 100644
index 00000000000..eccd5859cba
--- /dev/null
+++ b/cub/cub/agent/agent_for.cuh
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+namespace for_each
+{
+
+template <int BlockThreads, int ItemsPerThread>
+struct policy_t
+{
+  static constexpr int block_threads    = BlockThreads;
+  static constexpr int items_per_thread = ItemsPerThread;
+};
+
+template <class PolicyT, class OffsetT, class OpT>
+struct agent_block_striped_t
+{
+  static constexpr int items_per_thread = PolicyT::items_per_thread;
+
+  OffsetT tile_base;
+  OpT op;
+
+  template <bool IsFullTile>
+  _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int items_in_tile, int block_threads)
+  {
+#pragma unroll
+    for (int item = 0; item < items_per_thread; item++)
+    {
+      const auto idx = static_cast<OffsetT>(block_threads * item + threadIdx.x);
+
+      if (IsFullTile || idx < items_in_tile)
+      {
+        op(tile_base + idx);
+      }
+    }
+  }
+};
+
+} // namespace for_each
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh
index 73136a6077d..b64e5b4c5a4 100644
--- a/cub/cub/cub.cuh
+++ b/cub/cub/cub.cuh
@@ -74,6 +74,7 @@
 #include <cub/device/device_segmented_sort.cuh>
 #include <cub/device/device_select.cuh>
 #include <cub/device/device_spmv.cuh>
+#include <cub/device/device_for.cuh>
 
 // Grid
 // #include <cub/grid/grid_barrier.cuh>
diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
new file mode 100644
index 00000000000..1bda9198d3e
--- /dev/null
+++ b/cub/cub/device/device_for.cuh
@@ -0,0 +1,817 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/device/dispatch/dispatch_for.cuh>
+#include <cub/util_namespace.cuh>
+
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <cuda/std/type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+namespace for_each
+{
+
+/**
+ * `op_wrapper_t` turns bulk into a for-each operation by wrapping the user-provided unary operator.
+ */
+template <class OffsetT, class OpT, class InputIteratorT>
+struct op_wrapper_t
+{
+  InputIteratorT input;
+  OpT op;
+
+  _CCCL_DEVICE void operator()(OffsetT i)
+  {
+    // Dereferencing `thrust::device_vector<T>` iterators returns a `thrust::device_reference<T>`
+    // instead of `T`. Since user-provided operator expects `T` as an argument, we need to unwrap.
+    op(THRUST_NS_QUALIFIER::raw_reference_cast(*(input + i)));
+  }
+};
+
+/**
+ * `op_wrapper_vectorized_t` turns bulk into a for-each-copy operation.
+ * `op_wrapper_vectorized_t` is similar to `op_wrapper_t` but does not provide any guarantees about
+ * address of the input parameter. `OpT` might be given a copy of the value or an actual reference
+ * to the input iterator value (depending on the alignment of input iterator)
+ */
+template <class OffsetT, class OpT, class T>
+struct op_wrapper_vectorized_t
+{
+  const T* input; // Raw pointer to the input data
+  OpT op; // User-provided operator
+  OffsetT partially_filled_vector_id; // Index of the vector that doesn't have all elements
+  OffsetT num_items; // Total number of non-vectorized items
+
+  // TODO Can be extracted into tuning
+  constexpr static int vec_size = 4;
+
+  // Type of the vector that is used to load the input data
+  using vector_t = typename CubVector<T, vec_size>::Type;
+
+  _CCCL_DEVICE void operator()(OffsetT i)
+  {
+    // Surrounding `Bulk` call doesn't invoke this operator on invalid indices, so we don't need to
+    // check for out-of-bounds access here.
+    if (i != partially_filled_vector_id)
+    { // Case of fully filled vector
+      const vector_t vec = *reinterpret_cast<const vector_t*>(input + vec_size * i);
+
+#pragma unroll
+      for (int j = 0; j < vec_size; j++)
+      {
+        op(*(reinterpret_cast<const T*>(&vec) + j));
+      }
+    }
+    else
+    { // Case of partially filled vector
+      for (OffsetT j = i * vec_size; j < num_items; j++)
+      {
+        op(input[j]);
+      }
+    }
+  }
+};
+
+} // namespace for_each
+} // namespace detail
+
+struct DeviceFor
+{
+private:
+  /**
+   * Checks if the pointer is aligned to the given vector type
+   */
+  template <class VectorT, class T>
+  CUB_RUNTIME_FUNCTION static bool is_aligned(const T* ptr)
+  {
+    return (reinterpret_cast<std::size_t>(ptr) & (sizeof(VectorT) - 1)) == 0;
+  }
+
+  template <class InputIteratorT, class OffsetT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t for_each_n(
+    InputIteratorT first,
+    OffsetT num_items,
+    OpT op,
+    cudaStream_t stream,
+    ::cuda::std::false_type /* do_not_vectorize */)
+  {
+    using wrapped_op_t = detail::for_each::op_wrapper_t<OffsetT, OpT, InputIteratorT>;
+    return detail::for_each::dispatch_t<OffsetT, wrapped_op_t>::dispatch(num_items, wrapped_op_t{first, op}, stream);
+  }
+
+  template <class InputIteratorT, class OffsetT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t for_each_n(
+    InputIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */)
+  {
+    auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first);
+    using wrapped_op_t   = detail::for_each::op_wrapper_vectorized_t<OffsetT, OpT, detail::value_t<InputIteratorT>>;
+
+    if (is_aligned<typename wrapped_op_t::vector_t>(unwrapped_first))
+    { // Vectorize loads
+      const OffsetT num_vec_items = cub::DivideAndRoundUp(num_items, wrapped_op_t::vec_size);
+
+      return detail::for_each::dispatch_t<OffsetT, wrapped_op_t>::dispatch(
+        num_vec_items,
+        wrapped_op_t{
+          unwrapped_first, op, num_items % wrapped_op_t::vec_size ? num_vec_items - 1 : num_vec_items, num_items},
+        stream);
+    }
+
+    // Fallback to non-vectorized version
+    return for_each_n(first, num_items, op, stream, ::cuda::std::false_type{});
+  }
+
+public:
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each index in the provided shape
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //! - @devicestorage
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use Bulk to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-t
+  //!     :end-before: example-end bulk-square-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-temp-storage
+  //!     :end-before: example-end bulk-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam ShapeT
+  //!   is an integral type
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`,
+  //!   the required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] shape
+  //!   Shape of the index space to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class ShapeT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  Bulk(void* d_temp_storage, size_t& temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {})
+  {
+    static_assert(::cuda::std::is_integral<ShapeT>::value, "ShapeT must be an integral type");
+
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return Bulk(shape, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //! - @devicestorage
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-ref-t
+  //!     :end-before: example-end bulk-square-ref-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-n-temp-storage
+  //!     :end-before: example-end for-each-n-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam NumItemsT
+  //!   is an integral type representing the number of elements to iterate over
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`,
+  //!   the required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] num_items
+  //!   Number of elements to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class NumItemsT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ForEachN(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT first,
+    NumItemsT num_items,
+    OpT op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return ForEachN(first, num_items, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, last)``
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //! - @devicestorage
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-ref-t
+  //!     :end-before: example-end bulk-square-ref-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-temp-storage
+  //!     :end-before: example-end for-each-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`,
+  //!   the required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] last
+  //!   The end of the sequence
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ForEach(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT first,
+    InputIteratorT last,
+    OpT op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return ForEach(first, last, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``.
+  //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements.
+  //! This relaxation allows ``ForEachCopyN`` to vectorize loads.
+  //!
+  //! - Allowed to invoke ``op`` on copies of the elements
+  //! - The return value of ``op``, if any, is ignored.
+  //! - @devicestorage
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-odd-count-t
+  //!     :end-before: example-end bulk-odd-count-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-copy-n-temp-storage
+  //!     :end-before: example-end for-each-copy-n-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam NumItemsT
+  //!   is an integral type representing the number of elements to iterate over
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`,
+  //!   the required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] num_items
+  //!   Number of elements to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class NumItemsT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT first,
+    NumItemsT num_items,
+    OpT op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return ForEachCopyN(first, num_items, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, last)``.
+  //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements.
+  //! This relaxation allows ``ForEachCopy`` to vectorize loads.
+  //!
+  //! - Allowed to invoke ``op`` on copies of the elements
+  //! - The return value of ``op``, if any, is ignored.
+  //! - @devicestorage
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-odd-count-t
+  //!     :end-before: example-end bulk-odd-count-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-copy-temp-storage
+  //!     :end-before: example-end for-each-copy-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] d_temp_storage
+  //!   Device-accessible allocation of temporary storage. When `nullptr`,
+  //!   the required allocation size is written to `temp_storage_bytes` and no work is done.
+  //!
+  //! @param[in,out] temp_storage_bytes
+  //!   Reference to size in bytes of `d_temp_storage` allocation
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] last
+  //!   The end of the sequence
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT first,
+    InputIteratorT last,
+    OpT op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+
+    return ForEachCopy(first, last, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each index in the provided shape
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use Bulk to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-t
+  //!     :end-before: example-end bulk-square-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-wo-temp-storage
+  //!     :end-before: example-end bulk-wo-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam ShapeT
+  //!   is an integral type
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] shape
+  //!   Shape of the index space to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class ShapeT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {})
+  {
+    static_assert(::cuda::std::is_integral<ShapeT>::value, "ShapeT must be an integral type");
+    using offset_t = ShapeT; 
+    return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-ref-t
+  //!     :end-before: example-end bulk-square-ref-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-n-wo-temp-storage
+  //!     :end-before: example-end for-each-n-wo-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam NumItemsT
+  //!   is an integral type representing the number of elements to iterate over
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] num_items
+  //!   Number of elements to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class NumItemsT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachN(InputIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
+  {
+    using offset_t            = NumItemsT;
+    using use_vectorization_t = ::cuda::std::integral_constant<bool, false>;
+
+    // Disable auto-vectorization for now:
+    // constexpr bool use_vectorization =
+    //   detail::for_each::can_regain_copy_freedom<detail::value_t<InputIteratorT>, OpT>::value
+    //   && THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value;
+
+    return for_each_n<InputIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, last)``
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-square-ref-t
+  //!     :end-before: example-end bulk-square-ref-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-wo-temp-storage
+  //!     :end-before: example-end for-each-wo-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] last
+  //!   The end of the sequence
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEach(InputIteratorT first, InputIteratorT last, OpT op, cudaStream_t stream = {})
+  {
+    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<InputIteratorT>::difference_type;
+
+    const auto num_items = static_cast<offset_t>(THRUST_NS_QUALIFIER::distance(first, last));
+
+    return ForEachN(first, num_items, op, stream);
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``.
+  //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements.
+  //! This relaxation allows ``ForEachCopyN`` to vectorize loads.
+  //!
+  //! - Allowed to invoke ``op`` on copies of the elements
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-odd-count-t
+  //!     :end-before: example-end bulk-odd-count-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-copy-n-wo-temp-storage
+  //!     :end-before: example-end for-each-copy-n-wo-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam NumItemsT
+  //!   is an integral type representing the number of elements to iterate over
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] num_items
+  //!   Number of elements to iterate over
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class NumItemsT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachCopyN(InputIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
+  {
+    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value,
+                  "Input iterator must be contiguous");
+
+    using offset_t            = NumItemsT;
+    using use_vectorization_t = ::cuda::std::integral_constant<bool, true>;
+
+    return for_each_n<InputIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
+  }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Applies the function object ``op`` to each element in the range ``[first, last)``.
+  //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements.
+  //! This relaxation allows ``ForEachCopy`` to vectorize loads.
+  //!
+  //! - Allowed to invoke ``op`` on copies of the elements
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector.
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin bulk-odd-count-t
+  //!     :end-before: example-end bulk-odd-count-t
+  //!
+  //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-copy-wo-temp-storage
+  //!     :end-before: example-end for-each-copy-wo-temp-storage
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
+  //!
+  //! @tparam OpT
+  //!   is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function)
+  //!
+  //! @param[in] first
+  //!   The beginning of the sequence
+  //!
+  //! @param[in] last
+  //!   The end of the sequence
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each index in the index space
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `0`.
+  template <class InputIteratorT, class OpT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachCopy(InputIteratorT first, InputIteratorT last, OpT op, cudaStream_t stream = {})
+  {
+    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value,
+                  "Input iterator must be contiguous");
+
+    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<InputIteratorT>::difference_type;
+
+    const auto num_items = static_cast<offset_t>(THRUST_NS_QUALIFIER::distance(first, last));
+
+    return ForEachCopyN(first, num_items, op, stream);
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
new file mode 100644
index 00000000000..dda4bad89cd
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -0,0 +1,301 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+#include <cub/device/dispatch/tuning/tuning_for.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_for.cuh>
+#include <cub/thread/thread_load.cuh>
+#include <cub/util_device.cuh>
+#include <cub/util_math.cuh>
+#include <cub/util_namespace.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include <cuda/std/type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+namespace for_each
+{
+
+template <class Fn>
+struct first_parameter
+{
+  using type = void;
+};
+
+template <class C, class R, class A>
+struct first_parameter<R (C::*)(A)>
+{
+  using type = A;
+};
+
+template <class C, class R, class A>
+struct first_parameter<R (C::*)(A) const>
+{
+  using type = A;
+};
+
+template <class Value, class Fn, class = void>
+struct has_unique_value_overload : ::cuda::std::false_type
+{};
+
+// clang-format off
+template <class Value, class Fn>
+struct has_unique_value_overload<
+  Value,
+  Fn,
+  typename ::cuda::std::enable_if<
+              !::cuda::std::is_reference<typename first_parameter<decltype(&Fn::operator())>::type>::value && 
+              ::cuda::std::is_convertible<Value, typename first_parameter<decltype(&Fn::operator())>::type
+             >::value>::type>
+    : ::cuda::std::true_type
+{};
+
+// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable.
+// This can be observable if the unary operator takes parameter by reference and modifies it or uses address. 
+// The trait below checks if the freedom to copy trivial types can be regained. 
+template <typename Value, typename Fn>
+using can_regain_copy_freedom = 
+  ::cuda::std::integral_constant<
+    bool,
+    ::cuda::std::is_trivially_constructible<Value>::value && 
+    ::cuda::std::is_trivially_copy_assignable<Value>::value && 
+    :: cuda::std::is_trivially_move_assignable<Value>::value && 
+    ::cuda::std::is_trivially_destructible<Value>::value && 
+    has_unique_value_overload<Value, Fn>::value>;
+// clang-format on
+
+// This kernel is used when the block size is not known at compile time
+template <class ChainedPolicyT, class OffsetT, class OpT>
+CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
+{
+  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
+  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
+
+  const auto block_threads  = static_cast<OffsetT>(blockDim.x);
+  const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
+  const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
+  const auto num_remaining  = num_items - tile_base;
+  const auto items_in_tile  = num_remaining < items_per_tile ? num_remaining : items_per_tile;
+
+  if (items_in_tile == items_per_tile)
+  {
+    agent_t{tile_base, op}.consume_tile<true>(items_per_tile, block_threads);
+  }
+  else
+  {
+    agent_t{tile_base, op}.consume_tile<false>(items_in_tile, block_threads);
+  }
+}
+
+// This kernel is used when the block size is known at compile time
+template <class ChainedPolicyT, class OffsetT, class OpT>
+CUB_DETAIL_KERNEL_ATTRIBUTES //
+__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
+  void static_kernel(OffsetT num_items, OpT op)
+{
+  using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
+  using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
+
+  const auto block_threads  = active_policy_t::block_threads;
+  const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
+  const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
+  const auto num_remaining  = num_items - tile_base;
+  const auto items_in_tile  = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
+
+  if (items_in_tile == items_per_tile)
+  {
+    agent_t{tile_base, op}.consume_tile<true>(items_per_tile, block_threads);
+  }
+  else
+  {
+    agent_t{tile_base, op}.consume_tile<false>(items_in_tile, block_threads);
+  }
+}
+
+// The dispatch layer is in the detail namespace until we figure out tuning API
+template <class OffsetT, class OpT, class PolicyHubT = policy_hub_t>
+struct dispatch_t : PolicyHubT
+{
+  OffsetT num_items;
+  OpT op;
+  cudaStream_t stream;
+
+  CUB_RUNTIME_FUNCTION dispatch_t(OffsetT num_items, OpT op, cudaStream_t stream)
+      : num_items(num_items)
+      , op(op)
+      , stream(stream)
+  {}
+
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE
+    cudaError_t Invoke(::cuda::std::false_type /* block size is not known at compile time */)
+  {
+    using max_policy_t = typename dispatch_t::MaxPolicy;
+
+    if (num_items == 0)
+    {
+      return cudaSuccess;
+    }
+
+    int block_threads = 256;
+    cudaError_t error = cudaSuccess;
+
+    NV_IF_TARGET(NV_IS_HOST,
+                 (int _{}; //
+                  error = cudaOccupancyMaxPotentialBlockSize(
+                    &_, &block_threads, detail::for_each::dynamic_kernel<max_policy_t, OffsetT, OpT>);));
+
+    error = CubDebug(error);
+    if (cudaSuccess != error)
+    {
+      return error;
+    }
+
+    constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;
+
+    const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
+    const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+    _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), "
+            "%d items per thread\n",
+            static_cast<int>(num_tiles),
+            static_cast<int>(block_threads),
+            reinterpret_cast<long long>(stream),
+            static_cast<int>(items_per_thread));
+#endif
+
+    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+              static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
+              .doit(detail::for_each::dynamic_kernel<max_policy_t, OffsetT, OpT>, num_items, op);
+    error = CubDebug(error);
+    if (cudaSuccess != error)
+    {
+      return error;
+    }
+
+    error = CubDebug(detail::DebugSyncStream(stream));
+    if (cudaSuccess != error)
+    {
+      CubDebug(error = SyncStream(stream));
+    }
+
+    return error;
+  }
+
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE
+    cudaError_t Invoke(::cuda::std::true_type /* block size is known at compile time */)
+  {
+    using max_policy_t = typename dispatch_t::MaxPolicy;
+
+    if (num_items == 0)
+    {
+      return cudaSuccess;
+    }
+
+    cudaError_t error              = cudaSuccess;
+    constexpr int block_threads    = ActivePolicyT::for_policy_t::block_threads;
+    constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread;
+
+    const auto tile_size = static_cast<OffsetT>(block_threads * items_per_thread);
+    const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
+    _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), "
+            "%d items per thread\n",
+            static_cast<int>(num_tiles),
+            static_cast<int>(block_threads),
+            reinterpret_cast<long long>(stream),
+            static_cast<int>(items_per_thread));
+#endif
+
+    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+              static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
+              .doit(detail::for_each::static_kernel<max_policy_t, OffsetT, OpT>, num_items, op);
+    error = CubDebug(error);
+    if (cudaSuccess != error)
+    {
+      return error;
+    }
+
+    error = CubDebug(detail::DebugSyncStream(stream));
+    if (cudaSuccess != error)
+    {
+      CubDebug(error = SyncStream(stream));
+    }
+
+    return error;
+  }
+
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke()
+  {
+    constexpr bool static_block_size = ActivePolicyT::for_policy_t::block_threads > 0;
+    return Invoke<ActivePolicyT>(::cuda::std::integral_constant<bool, static_block_size>{});
+  }
+
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream)
+  {
+    using max_policy_t = typename dispatch_t::MaxPolicy;
+
+    int ptx_version   = 0;
+    cudaError_t error = CubDebug(PtxVersion(ptx_version));
+    if (cudaSuccess != error)
+    {
+      return error;
+    }
+
+    dispatch_t dispatch(num_items, op, stream);
+
+    error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch));
+
+    return error;
+  }
+};
+
+} // namespace for_each
+
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh
new file mode 100644
index 00000000000..a2a7ed00046
--- /dev/null
+++ b/cub/cub/device/dispatch/tuning/tuning_for.cuh
@@ -0,0 +1,63 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/agent/agent_for.cuh>
+#include <cub/util_device.cuh>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail
+{
+namespace for_each
+{
+
+struct policy_hub_t
+{
+  struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t>
+  {
+    using for_policy_t = policy_t<256, 2>;
+  };
+
+  using MaxPolicy = policy_350_t;
+};
+
+} // namespace for_each
+} // namespace detail
+
+CUB_NAMESPACE_END
diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml
index 0d306e752c5..7cff1e571cf 100644
--- a/cub/docs/repo.toml
+++ b/cub/docs/repo.toml
@@ -60,7 +60,7 @@ doxygen_aliases = [
   "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)",
   "blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.",
   "warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.",
-  "devicestorage=When ``d_temp_storage`` is `NULL`, no work is done and the required allocation size is returned in ``temp_storage_bytes``.",
+  "devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.",
   "devicestorageP=This operation requires a relatively small allocation of temporary device storage that is ``O(P)``, where ``P`` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size ``N``).",
   "devicestorageNP=This operation requires an allocation of temporary device storage that is ``O(N+P)``, where ``N`` is the length of the input and ``P`` is the number of streaming multiprocessors on the device.",
   "devicestorageNCP=This operation requires a relatively small allocation of temporary device storage that is ``O(N/C + P)``, where ``N`` is the length of the input, ``C`` is the number of concurrent threads that can be actively scheduled on each streaming multiprocessor (typically several thousand), and ``P`` is the number of streaming multiprocessors on the device.",
diff --git a/cub/test/catch2_test_device_bulk.cu b/cub/test/catch2_test_device_bulk.cu
new file mode 100644
index 00000000000..25716056a12
--- /dev/null
+++ b/cub/test/catch2_test_device_bulk.cu
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <thrust/count.h>
+
+#include "catch2_test_helper.h"
+#include "catch2_test_launch_helper.h"
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::Bulk, device_bulk);
+
+using offset_type = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t, std::int64_t>;
+
+template <class T>
+struct incrementer_t
+{
+  int* d_counts;
+
+  template <class OffsetT>
+  __device__ void operator()(OffsetT i)
+  {
+    static_assert(cuda::std::is_same<T, OffsetT>::value, "T and OffsetT must be the same type");
+    atomicAdd(d_counts + i, 1); // Check if `i` was served more than once
+  }
+};
+
+CUB_TEST("Device bulk works", "[bulk][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  thrust::device_vector<int> counts(num_items);
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+
+  device_bulk(num_items, incrementer_t<offset_t>{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu
new file mode 100644
index 00000000000..2ce416db17c
--- /dev/null
+++ b/cub/test/catch2_test_device_for.cu
@@ -0,0 +1,239 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <thrust/count.h>
+#include <thrust/sequence.h>
+
+#include "catch2_test_helper.h"
+#include "catch2_test_launch_helper.h"
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEach, device_for_each);
+DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachN, device_for_each_n);
+
+struct incrementer_t
+{
+  int* d_counts;
+
+  template <class OffsetT>
+  __device__ void operator()(OffsetT i)
+  {
+    atomicAdd(d_counts + i, 1); // Check if `i` was served more than once
+  }
+};
+
+template <class OffsetT>
+class offset_proxy_t
+{
+  OffsetT m_offset;
+
+public:
+  __host__ __device__ offset_proxy_t(OffsetT offset)
+      : m_offset(offset)
+  {}
+
+  __host__ __device__ operator OffsetT() const
+  {
+    return m_offset;
+  }
+};
+
+struct bad_operator_t
+{
+  const std::size_t* d_input;
+  const std::size_t magic_value;
+
+  __device__ void operator()(const std::size_t& i) const
+  {
+    if (i == magic_value)
+    {
+      const std::size_t* d_ptr    = &i;
+      const auto offset           = static_cast<std::size_t>(d_ptr - d_input);
+      const_cast<std::size_t&>(i) = offset;
+    }
+  }
+};
+
+CUB_TEST("Device for each works", "[for][device]")
+{
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  using offset_t = int;
+
+  const offset_t num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  thrust::device_vector<offset_proxy_t<offset_t>> input(num_items, offset_t{});
+  thrust::sequence(input.begin(), input.end(), offset_t{});
+  thrust::device_vector<int> counts(num_items);
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+
+  device_for_each(input.begin(), input.end(), incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+CUB_TEST("Device for each works with bad operators", "[for][device]")
+{
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const std::size_t num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  const std::size_t magic_value = num_items + 1;
+  thrust::device_vector<std::size_t> input(num_items, magic_value);
+  const std::size_t* d_input = thrust::raw_pointer_cast(input.data());
+
+  device_for_each(input.begin(), input.end(), bad_operator_t{d_input, magic_value});
+
+  REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{})));
+}
+
+CUB_TEST("Device for each works with unaligned vectors", "[for][device]")
+{
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const int num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  const int offset = GENERATE(1, 2, 3);
+
+  thrust::device_vector<int> counts(num_items);
+  thrust::device_vector<int> input(num_items + offset);
+  thrust::sequence(input.begin() + offset, input.end());
+
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+  int* d_input  = thrust::raw_pointer_cast(input.data()) + offset;
+
+  device_for_each(d_input, d_input + num_items, incrementer_t{d_counts});
+
+  const int num_of_once_marked_items = static_cast<int>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+using offset_type = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t, std::int64_t>;
+
+CUB_TEST("Device for each n works", "[for][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  thrust::device_vector<offset_proxy_t<offset_t>> input(num_items, offset_t{});
+  thrust::sequence(input.begin(), input.end(), offset_t{});
+
+  thrust::device_vector<int> counts(num_items);
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+
+  device_for_each_n(input.begin(), num_items, incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+CUB_TEST("Device for each n works with bad operators", "[for][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  const std::size_t magic_value = num_items + 1;
+  thrust::device_vector<std::size_t> input(num_items, magic_value);
+  const std::size_t* d_input = thrust::raw_pointer_cast(input.data());
+
+  device_for_each_n(input.begin(), num_items, bad_operator_t{d_input, magic_value});
+
+  REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{})));
+}
+
+CUB_TEST("Device for each n works with unaligned vectors", "[for][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  const int offset = GENERATE(1, 2, 3);
+
+  thrust::device_vector<int> counts(num_items);
+  thrust::device_vector<int> input(num_items + offset);
+  thrust::sequence(input.begin() + offset, input.end());
+
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+  int* d_input  = thrust::raw_pointer_cast(input.data()) + offset;
+
+  device_for_each_n(d_input, num_items, incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
diff --git a/cub/test/catch2_test_device_for_api.cu b/cub/test/catch2_test_device_for_api.cu
new file mode 100644
index 00000000000..7b8abcd7f92
--- /dev/null
+++ b/cub/test/catch2_test_device_for_api.cu
@@ -0,0 +1,263 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <thrust/count.h>
+
+#include "catch2_test_helper.h"
+
+// example-begin bulk-square-t
+struct square_t
+{
+  int* d_ptr;
+
+  __device__ void operator()(int i)
+  {
+    d_ptr[i] *= d_ptr[i];
+  }
+};
+// example-end bulk-square-t
+
+// example-begin bulk-square-ref-t
+struct square_ref_t
+{
+  __device__ void operator()(int& i)
+  {
+    i *= i;
+  }
+};
+// example-end bulk-square-ref-t
+
+// example-begin bulk-odd-count-t
+struct odd_count_t
+{
+  int* d_count;
+
+  __device__ void operator()(int i)
+  {
+    if (i % 2 == 1)
+    {
+      atomicAdd(d_count, 1);
+    }
+  }
+};
+// example-end bulk-odd-count-t
+
+CUB_TEST("Device bulk works with temporary storage", "[bulk][device]")
+{
+  // example-begin bulk-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_t op{thrust::raw_pointer_cast(vec.data())};
+
+  // 1) Get temp storage size
+  std::uint8_t* d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);
+
+  // 2) Allocate temp storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // 3) Perform bulk operation
+  cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end bulk-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device bulk works without temporary storage", "[bulk][device]")
+{
+  // example-begin bulk-wo-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_t op{thrust::raw_pointer_cast(vec.data())};
+
+  cub::DeviceFor::Bulk(vec.size(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end bulk-wo-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device for each n works with temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-n-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_ref_t op{};
+
+  // 1) Get temp storage size
+  std::uint8_t* d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
+
+  // 2) Allocate temp storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // 3) Perform for each n operation
+  cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end for-each-n-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device for each n works without temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-n-wo-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_ref_t op{};
+
+  cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end for-each-n-wo-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device for each works with temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_ref_t op{};
+
+  // 1) Get temp storage size
+  std::uint8_t* d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
+
+  // 2) Allocate temp storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // 3) Perform for each operation
+  cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end for-each-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device for each works without temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-wo-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  square_ref_t op{};
+
+  cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);
+
+  thrust::device_vector<int> expected = {1, 4, 9, 16};
+  // example-end for-each-wo-temp-storage
+
+  REQUIRE(vec == expected);
+}
+
+CUB_TEST("Device for each n copy works with temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-copy-n-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  thrust::device_vector<int> count(1);
+  odd_count_t op{thrust::raw_pointer_cast(count.data())};
+
+  // 1) Get temp storage size
+  std::uint8_t* d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
+
+  // 2) Allocate temp storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // 3) Perform for each n operation
+  cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
+
+  thrust::device_vector<int> expected = {2};
+  // example-end for-each-copy-n-temp-storage
+
+  REQUIRE(count == expected);
+}
+
+CUB_TEST("Device for each n copy works without temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-copy-n-wo-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  thrust::device_vector<int> count(1);
+  odd_count_t op{thrust::raw_pointer_cast(count.data())};
+
+  cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);
+
+  thrust::device_vector<int> expected = {2};
+  // example-end for-each-copy-n-wo-temp-storage
+
+  REQUIRE(count == expected);
+}
+
+CUB_TEST("Device for each copy works with temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-copy-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  thrust::device_vector<int> count(1);
+  odd_count_t op{thrust::raw_pointer_cast(count.data())};
+
+  // 1) Get temp storage size
+  std::uint8_t* d_temp_storage{};
+  std::size_t temp_storage_bytes{};
+  cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
+
+  // 2) Allocate temp storage
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  // 3) Perform for each n operation
+  cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
+
+  thrust::device_vector<int> expected = {2};
+  // example-end for-each-copy-temp-storage
+
+  REQUIRE(count == expected);
+}
+
+CUB_TEST("Device for each copy works without temporary storage", "[for_each][device]")
+{
+  // example-begin for-each-copy-wo-temp-storage
+  thrust::device_vector<int> vec = {1, 2, 3, 4};
+  thrust::device_vector<int> count(1);
+  odd_count_t op{thrust::raw_pointer_cast(count.data())};
+
+  cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);
+
+  thrust::device_vector<int> expected = {2};
+  // example-end for-each-copy-wo-temp-storage
+
+  REQUIRE(count == expected);
+}
diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu
new file mode 100644
index 00000000000..11fffd6f576
--- /dev/null
+++ b/cub/test/catch2_test_device_for_copy.cu
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <thrust/count.h>
+#include <thrust/sequence.h>
+
+#include "catch2_test_helper.h"
+#include "catch2_test_launch_helper.h"
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachCopy, device_for_each_copy);
+DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachCopyN, device_for_each_copy_n);
+
+using offset_type = c2h::type_list<std::int32_t, std::uint32_t, std::uint64_t, std::int64_t>;
+
+struct incrementer_t
+{
+  int* d_counts;
+
+  template <class OffsetT>
+  __device__ void operator()(OffsetT i)
+  {
+    atomicAdd(d_counts + i, 1); // Check if `i` was served more than once
+  }
+};
+
+template <class OffsetT>
+class offset_proxy_t
+{
+  OffsetT m_offset;
+
+public:
+  __host__ __device__ offset_proxy_t(OffsetT offset)
+      : m_offset(offset)
+  {}
+
+  __host__ __device__ operator OffsetT() const
+  {
+    return m_offset;
+  }
+};
+
+CUB_TEST("Device for each works", "[for_copy][device]")
+{
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  using offset_t = int;
+
+  const offset_t num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  thrust::device_vector<offset_proxy_t<offset_t>> input(num_items, offset_t{});
+  thrust::sequence(input.begin(), input.end(), offset_t{});
+
+  thrust::device_vector<int> counts(num_items);
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+
+  device_for_each_copy(input.begin(), input.end(), incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+CUB_TEST("Device for each works with unaligned vectors", "[for_copy][device]")
+{
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const int num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  const int offset = GENERATE(1, 2, 3);
+
+  thrust::device_vector<int> counts(num_items);
+  thrust::device_vector<int> input(num_items + offset);
+  thrust::sequence(input.begin() + offset, input.end());
+
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+  int* d_input  = thrust::raw_pointer_cast(input.data()) + offset;
+
+  device_for_each_copy(d_input, d_input + num_items, incrementer_t{d_counts});
+
+  const int num_of_once_marked_items = static_cast<int>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+CUB_TEST("Device for each n works", "[for_copy][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  thrust::device_vector<offset_proxy_t<offset_t>> input(num_items, offset_t{});
+  thrust::sequence(input.begin(), input.end(), offset_t{});
+
+  thrust::device_vector<int> counts(num_items);
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+
+  device_for_each_copy_n(input.begin(), num_items, incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
+
+CUB_TEST("Device for each n works with unaligned vectors", "[for_copy][device]", offset_type)
+{
+  using offset_t = c2h::get<0, TestType>;
+
+  constexpr int max_items = 5000000;
+  constexpr int min_items = 1;
+
+  const auto num_items = static_cast<offset_t>(GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    })));
+
+  const int offset = GENERATE(1, 2, 3);
+
+  thrust::device_vector<int> counts(num_items);
+  thrust::device_vector<int> input(num_items + offset);
+  thrust::sequence(input.begin() + offset, input.end());
+
+  int* d_counts = thrust::raw_pointer_cast(counts.data());
+  int* d_input  = thrust::raw_pointer_cast(input.data()) + offset;
+
+  device_for_each_copy_n(d_input, num_items, incrementer_t{d_counts});
+
+  const auto num_of_once_marked_items = static_cast<offset_t>(thrust::count(counts.begin(), counts.end(), 1));
+
+  REQUIRE(num_of_once_marked_items == num_items);
+}
diff --git a/cub/test/catch2_test_device_for_utils.cu b/cub/test/catch2_test_device_for_utils.cu
new file mode 100644
index 00000000000..8316b82b400
--- /dev/null
+++ b/cub/test/catch2_test_device_for_utils.cu
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include "catch2_test_helper.h"
+
+
+template <class T>
+struct value_t
+{
+  __device__ void operator()(T) {}
+};
+
+template <class T>
+struct const_ref_t
+{
+  __device__ void operator()(const T&) {}
+};
+
+template <class T>
+struct rref_t
+{
+  __device__ void operator()(T&&) {}
+};
+
+template <class T>
+struct value_ret_t
+{
+  __device__ T operator()(T v) { return v; }
+};
+
+template <class T>
+struct ref_t
+{
+  __device__ void operator()(T&) {}
+};
+
+struct tpl_value_t
+{
+  template <class T>
+  __device__ void operator()(T) {}
+};
+
+template <class T>
+struct overload_value_t
+{
+  __device__ void operator()(T) {}
+  __device__ void operator()(T) const {}
+};
+
+template <class T>
+struct value_const_t
+{
+  __device__ void operator()(T) const {}
+};
+
+template <class T>
+void test()
+{
+  STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload<T, value_t<T>>::value);
+  STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload<T, value_const_t<T>>::value);
+  STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload<T, value_ret_t<T>>::value);
+  STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload<T, rref_t<T>>::value);
+  STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload<T, ref_t<T>>::value);
+  STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload<T, const_ref_t<T>>::value);
+  STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload<T, overload_value_t<T>>::value);
+  STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload<T, tpl_value_t>::value);
+}
+
+CUB_TEST("Device for utils correctly detect value overloads", "[for][device]")
+{
+  test<int>();
+  test<double>();
+
+  // conversions do not work ;(
+  STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload<char, value_t<int>>::value);
+}
diff --git a/thrust/benchmarks/bench/for_each/basic.cu b/thrust/benchmarks/bench/for_each/basic.cu
new file mode 100644
index 00000000000..968d1c3e81b
--- /dev/null
+++ b/thrust/benchmarks/bench/for_each/basic.cu
@@ -0,0 +1,67 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+
+#include "nvbench_helper.cuh"
+
+template <class T>
+struct square_t 
+{
+  __device__ void operator()(T &x) const 
+  {
+    x = x * x;
+  }
+};
+
+template <typename T>
+static void basic(nvbench::state &state, nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> in(elements, T{1});
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+  state.add_global_memory_writes<T>(elements);
+
+  square_t<T> op{};
+  caching_allocator_t alloc;
+  thrust::for_each(policy(alloc), in.begin(), in.end(), op);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::for_each(policy(alloc, launch), in.begin(), in.end(), op);
+             });
+}
+
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
diff --git a/thrust/benchmarks/bench/tabulate/basic.cu b/thrust/benchmarks/bench/tabulate/basic.cu
new file mode 100644
index 00000000000..c397246bd00
--- /dev/null
+++ b/thrust/benchmarks/bench/tabulate/basic.cu
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include "thrust/detail/raw_pointer_cast.h"
+#include <nvbench_helper.cuh>
+
+#include <thrust/tabulate.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+
+template <class T>
+struct seg_size_t
+{
+  T* d_offsets{};
+
+  template <class OffsetT>
+  __device__ T operator()(OffsetT i)
+  {
+    return d_offsets[i + 1] - d_offsets[i];
+  }
+};
+
+template <typename T>
+static void basic(nvbench::state &state,
+                  nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> input(elements + 1);
+  thrust::device_vector<T> output(elements);
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements + 1);
+  state.add_global_memory_writes<T>(elements);
+
+  caching_allocator_t alloc;
+  seg_size_t<T> op{thrust::raw_pointer_cast(input.data())};
+  thrust::tabulate(policy(alloc), output.begin(), output.end(), op);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::tabulate(policy(alloc, launch), output.begin(), output.end(), op);
+             });
+}
+
+using types = nvbench::type_list<nvbench::uint32_t,
+                                 nvbench::uint64_t>;
+
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu
new file mode 100644
index 00000000000..5485bc08b9e
--- /dev/null
+++ b/thrust/benchmarks/bench/transform/basic.cu
@@ -0,0 +1,95 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <nvbench_helper.cuh>
+
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/copy.h>
+
+template <class InT, class OutT>
+struct fib_t
+{
+  __device__ OutT operator()(InT n)
+  {
+    OutT t1 = 0;
+    OutT t2 = 1;
+
+    if (n < 1)
+    {
+      return t1;
+    }
+    else if (n == 1)
+    {
+      return t1;
+    }
+    else if (n == 2)
+    {
+      return t2;
+    }
+    for (InT i = 3; i <= n; ++i)
+    {
+      const auto next = t1 + t2;
+      t1              = t2;
+      t2              = next;
+    }
+
+    return t2;
+  }
+};
+
+template <typename T>
+static void basic(nvbench::state &state,
+                  nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> input = generate(elements, bit_entropy::_1_000, T{0}, T{42});
+  thrust::device_vector<T> output(elements);
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+  state.add_global_memory_writes<nvbench::uint32_t>(elements);
+
+  caching_allocator_t alloc;
+  fib_t<T, nvbench::uint32_t> op{};
+  thrust::transform(policy(alloc), input.cbegin(), input.cend(), output.begin(), op);
+
+  state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch &launch) {
+               thrust::transform(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), op);
+             });
+}
+
+using types = nvbench::type_list<nvbench::uint32_t,
+                                 nvbench::uint64_t>;
+
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
diff --git a/thrust/thrust/system/cuda/detail/async/for_each.h b/thrust/thrust/system/cuda/detail/async/for_each.h
index 9847accf07a..f3507feb63e 100644
--- a/thrust/thrust/system/cuda/detail/async/for_each.h
+++ b/thrust/thrust/system/cuda/detail/async/for_each.h
@@ -48,11 +48,12 @@
 #include <thrust/system/cuda/config.h>
 
 #include <thrust/system/cuda/detail/async/customization.h>
-#include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/system/cuda/future.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/distance.h>
 
+#include <cub/device/device_for.cuh>
+
 #include <type_traits>
 
 THRUST_NAMESPACE_BEGIN
@@ -124,7 +125,7 @@ unique_eager_event async_for_each_n(
   );
 
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::__parallel_for::parallel_for(
+    cub::DeviceFor::Bulk(
       n, std::move(wrapped), e.stream().native_handle()
     )
   , "after for_each launch"
diff --git a/thrust/thrust/system/cuda/detail/async/transform.h b/thrust/thrust/system/cuda/detail/async/transform.h
index f642c2591ee..a4b280d1fb3 100644
--- a/thrust/thrust/system/cuda/detail/async/transform.h
+++ b/thrust/thrust/system/cuda/detail/async/transform.h
@@ -53,6 +53,8 @@
 #include <thrust/distance.h>
 #include <thrust/advance.h>
 
+#include <cub/device/device_for.cuh>
+
 #include <type_traits>
 
 THRUST_NAMESPACE_BEGIN
@@ -126,7 +128,7 @@ unique_eager_event async_transform_n(
   );
 
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::__parallel_for::parallel_for(
+    cub::DeviceFor::Bulk(
       n, std::move(wrapped), e.stream().native_handle()
     )
   , "after transform launch"
diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h
index 2225320adcd..e0f2d3db20b 100644
--- a/thrust/thrust/system/cuda/detail/for_each.h
+++ b/thrust/thrust/system/cuda/detail/for_each.h
@@ -40,6 +40,9 @@
 #include <iterator>
 #include <thrust/system/cuda/config.h>
 
+#include <cub/device/device_for.cuh>
+
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/detail/function.h>
@@ -49,29 +52,8 @@ THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
-  // for_each functor
-  template <class Input, class UnaryOp>
-  struct for_each_f
-  {
-    Input input;
-    UnaryOp op;
-
-    THRUST_FUNCTION
-    for_each_f(Input input, UnaryOp op)
-        : input(input), op(op) {}
-
-    template <class Size>
-    THRUST_DEVICE_FUNCTION void operator()(Size idx)
-    {
-      op(raw_reference_cast(*(input + idx)));
-    }
-  };
-
-  //-------------------------
-  // Thrust API entry points
-  //-------------------------
-
   // for_each_n
+  _CCCL_EXEC_CHECK_DISABLE
   template <class Derived,
             class Input,
             class Size,
@@ -82,12 +64,17 @@ namespace cuda_cub {
              Size                       count,
              UnaryOp                    op)
   {
-    typedef thrust::detail::wrapped_function<UnaryOp, void> wrapped_t;
-    wrapped_t wrapped_op(op);
-
-    cuda_cub::parallel_for(policy,
-                           for_each_f<Input, wrapped_t>(first, wrapped_op),
-                           count);
+    THRUST_CDP_DISPATCH(
+      (cudaStream_t stream = cuda_cub::stream(policy);
+       cudaError_t  status = cub::DeviceFor::ForEachN(first, count, op, stream);
+       cuda_cub::throw_on_error(status, "parallel_for failed");
+       status = cuda_cub::synchronize_optional(policy);
+       cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");),
+       (for (Size idx = 0; idx != count; ++idx)
+        {
+          op(raw_reference_cast(*(first + idx)));
+        }
+    ));
 
     return first + count;
   }
@@ -104,9 +91,10 @@ namespace cuda_cub {
   {
     typedef typename iterator_traits<Input>::difference_type size_type;
     size_type count = static_cast<size_type>(thrust::distance(first,last));
-    return cuda_cub::for_each_n(policy, first,  count, op);
+
+    return for_each_n(policy, first, count, op);
   }
-}    // namespace cuda_cub
+} // namespace cuda_cub
 
 THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/thrust/system/cuda/detail/parallel_for.h b/thrust/thrust/system/cuda/detail/parallel_for.h
index 9baab4d61cb..e4aac73329b 100644
--- a/thrust/thrust/system/cuda/detail/parallel_for.h
+++ b/thrust/thrust/system/cuda/detail/parallel_for.h
@@ -38,119 +38,16 @@
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <cub/device/device_for.cuh>
+
 #include <thrust/system/cuda/config.h>
 #include <thrust/system/cuda/detail/cdp_dispatch.h>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
 #include <thrust/system/cuda/detail/util.h>
 
 THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
-namespace __parallel_for {
-
-  template <int _BLOCK_THREADS,
-            int _ITEMS_PER_THREAD = 1>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS    = _BLOCK_THREADS,
-      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-  };    // struct PtxPolicy
-
-  template <class Arch, class F>
-  struct Tuning;
-
-  template <class F>
-  struct Tuning<sm30, F>
-  {
-    typedef PtxPolicy<256, 2> type;
-  };
-
-
-  template <class F,
-            class Size>
-  struct ParallelForAgent
-  {
-    template <class Arch>
-    struct PtxPlan : Tuning<Arch, F>::type
-    {
-      typedef Tuning<Arch, F> tuning;
-    };
-    typedef core::specialize_plan<PtxPlan> ptx_plan;
-
-    enum
-    {
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS
-    };
-
-    template <bool IS_FULL_TILE>
-    static void    THRUST_DEVICE_FUNCTION
-    consume_tile(F    f,
-                 Size tile_base,
-                 int  items_in_tile)
-    {
-#pragma unroll
-      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-      {
-        Size idx = BLOCK_THREADS * ITEM + threadIdx.x;
-        if (IS_FULL_TILE || idx < items_in_tile)
-          f(tile_base + idx);
-      }
-    }
-
-    THRUST_AGENT_ENTRY(F     f,
-                       Size  num_items,
-                       char * /*shmem*/ )
-    {
-      Size tile_base     = static_cast<Size>(blockIdx.x) * ITEMS_PER_TILE;
-      Size num_remaining = num_items - tile_base;
-      Size items_in_tile = static_cast<Size>(
-          num_remaining < ITEMS_PER_TILE ? num_remaining : ITEMS_PER_TILE);
-
-      if (items_in_tile == ITEMS_PER_TILE)
-      {
-        // full tile
-        consume_tile<true>(f, tile_base, ITEMS_PER_TILE);
-      }
-      else
-      {
-        // partial tile
-        consume_tile<false>(f, tile_base, items_in_tile);
-      }
-    }
-  };    // struct ParallelForEagent
-
-  template <class F,
-            class Size>
-  THRUST_RUNTIME_FUNCTION cudaError_t
-  parallel_for(Size         num_items,
-               F            f,
-               cudaStream_t stream)
-  {
-    if (num_items == 0)
-      return cudaSuccess;
-    using core::AgentLauncher;
-    using core::AgentPlan;
-
-    typedef AgentLauncher<ParallelForAgent<F, Size> > parallel_for_agent;
-    AgentPlan parallel_for_plan = parallel_for_agent::get_plan(stream);
-
-    parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent");
-    pfa.launch(f, num_items);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    return cudaSuccess;
-  }
-}    // __parallel_for
-
 _CCCL_EXEC_CHECK_DISABLE
 template <class Derived,
           class F,
@@ -168,7 +65,7 @@ parallel_for(execution_policy<Derived> &policy,
   // clang-format off
   THRUST_CDP_DISPATCH(
     (cudaStream_t stream = cuda_cub::stream(policy);
-     cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
+     cudaError_t  status = cub::DeviceFor::Bulk(count, f, stream);
      cuda_cub::throw_on_error(status, "parallel_for failed");
      status = cuda_cub::synchronize_optional(policy);
      cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");),
@@ -181,7 +78,7 @@ parallel_for(execution_policy<Derived> &policy,
   // clang-format on
 }
 
-}    // namespace cuda_cub
+} // namespace cuda_cub
 
 THRUST_NAMESPACE_END
 #endif

From f1eb00ccae2dde6b53bb1efaf3c76af586336f29 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 18 Jan 2024 15:40:13 -0800
Subject: [PATCH 02/21] Fix adjacent difference tests

---
 cub/docs/test_overview.rst                    |  4 ++++
 ...vice_adjacent_difference_substract_left.cu | 17 +++++++++++----
 ...ice_adjacent_difference_substract_right.cu | 21 ++++++++++++++-----
 cub/test/catch2_test_nvrtc.cu                 | 12 +++++------
 4 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/cub/docs/test_overview.rst b/cub/docs/test_overview.rst
index 6d0de733954..0d1acde0eef 100644
--- a/cub/docs/test_overview.rst
+++ b/cub/docs/test_overview.rst
@@ -119,6 +119,10 @@ It's strongly advised to always use ``c2h::gen`` to produce input data.
 Other data generation methods might be used 
 if absolutely necessary in tests of corner cases.
 
+Do not use ``assert`` in tests.
+We run CUB tests in release mode.
+Issue with ``assert`` is that it only works in debug mode.
+
 If a custom type has to be tested, the following helper should be used:
 
 .. code-block:: c++
diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
index 9e44e4290f8..dc849c6a3f2 100644
--- a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
+++ b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
@@ -262,11 +262,18 @@ CUB_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with a different outp
   REQUIRE(reference == out);
 }
 
-struct check_difference {
+struct check_difference 
+{
+  int *d_error;
+
   template<class T>
-  __device__ T operator()(const T& lhs, const T& rhs) const noexcept {
+  __device__ T operator()(const T& lhs, const T& rhs) const noexcept 
+  {
     const T result = lhs - rhs;
-    assert(result == 1);
+    if (result != 1) 
+    {
+      atomicAdd(d_error, 1);
+    }
     return result;
   }
 };
@@ -274,10 +281,12 @@ struct check_difference {
 CUB_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with large indexes", "[device][adjacent_difference]")
 {
   constexpr cuda::std::size_t num_items = 1ll << 33;
+  thrust::device_vector<int> error(1);
+  int *d_error = thrust::raw_pointer_cast(error.data());
   adjacent_difference_subtract_left_copy(thrust::counting_iterator<cuda::std::size_t>{0},
                                          thrust::discard_iterator<>{},
                                          num_items,
-                                         check_difference{});
+                                         check_difference{d_error});
 }
 
 struct invocation_counter {
diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu
index 96e2c82a5fc..a40dd531494 100644
--- a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu
+++ b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu
@@ -304,11 +304,18 @@ CUB_TEST("DeviceAdjacentDifference::SubtractRightCopy works with a different out
   REQUIRE(reference == out);
 }
 
-struct check_difference {
+struct check_difference 
+{
+  int *d_error;
+
   template<class T>
-  __device__ T operator()(const T& lhs, const T& rhs) const noexcept {
-    const T result = lhs - rhs;
-    assert(result == 1);
+  __device__ T operator()(const T& lhs, const T& rhs) const noexcept 
+  {
+    const T result = rhs - lhs;
+    if (result != 1)
+    {
+      atomicAdd(d_error, 1);
+    }
     return result;
   }
 };
@@ -316,10 +323,14 @@ struct check_difference {
 CUB_TEST("DeviceAdjacentDifference::SubtractRightCopy works with large indexes", "[device][adjacent_difference]")
 {
   constexpr cuda::std::size_t num_items = 1ll << 33;
+  thrust::device_vector<int> error(1);
+  int *d_error = thrust::raw_pointer_cast(error.data());
   adjacent_difference_subtract_right_copy(thrust::counting_iterator<cuda::std::size_t>{0},
                                           thrust::discard_iterator<>{},
                                           num_items,
-                                          check_difference{});
+                                          check_difference{d_error});
+  const int h_error = error[0];
+  REQUIRE(h_error == 0);
 }
 
 struct invocation_counter {
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
index b4563fac66b..fd038e764f9 100644
--- a/cub/test/catch2_test_nvrtc.cu
+++ b/cub/test/catch2_test_nvrtc.cu
@@ -249,10 +249,10 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
   REQUIRE(NVRTC_SUCCESS == nvrtcGetCUBIN(prog, code.get()));
   REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog));
 
-  CUcontext context;
-  CUdevice device;
-  CUmodule module;
-  CUfunction kernel;
+  CUcontext context{};
+  CUdevice device{};
+  CUmodule module{};
+  CUfunction kernel{};
 
   REQUIRE(CUDA_SUCCESS == cuInit(0));
   REQUIRE(CUDA_SUCCESS == cuDeviceGet(&device, 0));
@@ -265,10 +265,10 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
   constexpr int items_per_thread = 4;
   constexpr int tile_size = threads_in_block * items_per_thread;
 
-  CUdeviceptr d_ptr;
+  CUdeviceptr d_ptr{};
   REQUIRE(CUDA_SUCCESS == cuMemAlloc(&d_ptr, tile_size * sizeof(int)));
 
-  CUdeviceptr d_err;
+  CUdeviceptr d_err{};
   REQUIRE(CUDA_SUCCESS == cuMemAlloc(&d_err, sizeof(int)));
 
   int h_ptr[tile_size];

From 9f7821694b5b88aecd59e9b9271b2d106883fe72 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 19 Jan 2024 20:23:01 +0000
Subject: [PATCH 03/21] Fix pragma unroll warning in unique by key

---
 cub/cub/agent/agent_unique_by_key.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh
index 3194fe11e2d..5a0b870d017 100644
--- a/cub/cub/agent/agent_unique_by_key.cuh
+++ b/cub/cub/agent/agent_unique_by_key.cuh
@@ -304,7 +304,7 @@ struct AgentUniqueByKey
 
         // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset
         // types
-        #pragma unroll(1)
+        #pragma unroll 1
         for (int item = threadIdx.x;
              item < num_tile_selections;
              item += BLOCK_THREADS)

From 4d5852126daa6fc091f22386951df4f381410cc4 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 19 Jan 2024 20:23:32 +0000
Subject: [PATCH 04/21] Template disambiguator

---
 cub/cub/device/dispatch/dispatch_for.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index dda4bad89cd..05c52227a73 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -119,11 +119,11 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
 
   if (items_in_tile == items_per_tile)
   {
-    agent_t{tile_base, op}.consume_tile<true>(items_per_tile, block_threads);
+    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
   }
   else
   {
-    agent_t{tile_base, op}.consume_tile<false>(items_in_tile, block_threads);
+    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
   }
 }
 
@@ -144,11 +144,11 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
 
   if (items_in_tile == items_per_tile)
   {
-    agent_t{tile_base, op}.consume_tile<true>(items_per_tile, block_threads);
+    agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
   }
   else
   {
-    agent_t{tile_base, op}.consume_tile<false>(items_in_tile, block_threads);
+    agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
   }
 }
 

From f6733778d2fa9ed70e2ac9ea71d97c6618d84d8c Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 19 Jan 2024 22:54:56 +0000
Subject: [PATCH 05/21] Update copyright year

---
 cub/benchmarks/bench/for_each/base.cu         | 2 +-
 cub/benchmarks/bench/for_each/copy.cu         | 2 +-
 cub/cub/agent/agent_for.cuh                   | 2 +-
 cub/cub/device/device_for.cuh                 | 2 +-
 cub/cub/device/dispatch/dispatch_for.cuh      | 2 +-
 cub/cub/device/dispatch/tuning/tuning_for.cuh | 2 +-
 cub/test/catch2_test_device_bulk.cu           | 2 +-
 cub/test/catch2_test_device_for.cu            | 2 +-
 cub/test/catch2_test_device_for_api.cu        | 2 +-
 cub/test/catch2_test_device_for_copy.cu       | 2 +-
 cub/test/catch2_test_device_for_utils.cu      | 2 +-
 thrust/benchmarks/bench/for_each/basic.cu     | 2 +-
 thrust/benchmarks/bench/tabulate/basic.cu     | 2 +-
 thrust/benchmarks/bench/transform/basic.cu    | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cub/benchmarks/bench/for_each/base.cu b/cub/benchmarks/bench/for_each/base.cu
index f6801a89c14..63c5495c08d 100644
--- a/cub/benchmarks/bench/for_each/base.cu
+++ b/cub/benchmarks/bench/for_each/base.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/benchmarks/bench/for_each/copy.cu b/cub/benchmarks/bench/for_each/copy.cu
index 7dbb3cd9fac..4da8373c542 100644
--- a/cub/benchmarks/bench/for_each/copy.cu
+++ b/cub/benchmarks/bench/for_each/copy.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/cub/agent/agent_for.cuh b/cub/cub/agent/agent_for.cuh
index eccd5859cba..a6366f6f491 100644
--- a/cub/cub/agent/agent_for.cuh
+++ b/cub/cub/agent/agent_for.cuh
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index 1bda9198d3e..a0063565e5e 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 05c52227a73..8d3675c40f7 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh
index a2a7ed00046..759d7e632e5 100644
--- a/cub/cub/device/dispatch/tuning/tuning_for.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_for.cuh
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/test/catch2_test_device_bulk.cu b/cub/test/catch2_test_device_bulk.cu
index 25716056a12..a978edc458c 100644
--- a/cub/test/catch2_test_device_bulk.cu
+++ b/cub/test/catch2_test_device_bulk.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu
index 2ce416db17c..f1612fae9cc 100644
--- a/cub/test/catch2_test_device_for.cu
+++ b/cub/test/catch2_test_device_for.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/test/catch2_test_device_for_api.cu b/cub/test/catch2_test_device_for_api.cu
index 7b8abcd7f92..14c8b119027 100644
--- a/cub/test/catch2_test_device_for_api.cu
+++ b/cub/test/catch2_test_device_for_api.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu
index 11fffd6f576..bd55273270a 100644
--- a/cub/test/catch2_test_device_for_copy.cu
+++ b/cub/test/catch2_test_device_for_copy.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/cub/test/catch2_test_device_for_utils.cu b/cub/test/catch2_test_device_for_utils.cu
index 8316b82b400..a5320ff7fb6 100644
--- a/cub/test/catch2_test_device_for_utils.cu
+++ b/cub/test/catch2_test_device_for_utils.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/benchmarks/bench/for_each/basic.cu b/thrust/benchmarks/bench/for_each/basic.cu
index 968d1c3e81b..a975065ccb7 100644
--- a/thrust/benchmarks/bench/for_each/basic.cu
+++ b/thrust/benchmarks/bench/for_each/basic.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/benchmarks/bench/tabulate/basic.cu b/thrust/benchmarks/bench/tabulate/basic.cu
index c397246bd00..03474d72136 100644
--- a/thrust/benchmarks/bench/tabulate/basic.cu
+++ b/thrust/benchmarks/bench/tabulate/basic.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu
index 5485bc08b9e..a488c80b8d6 100644
--- a/thrust/benchmarks/bench/transform/basic.cu
+++ b/thrust/benchmarks/bench/transform/basic.cu
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:

From 070e477a171c2fb65906c4ef9f14aad8fd1a9fff Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 19 Jan 2024 23:04:08 +0000
Subject: [PATCH 06/21] Typo in docs

---
 cub/docs/test_overview.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/docs/test_overview.rst b/cub/docs/test_overview.rst
index 0d1acde0eef..0f5ca4fabdf 100644
--- a/cub/docs/test_overview.rst
+++ b/cub/docs/test_overview.rst
@@ -121,7 +121,7 @@ if absolutely necessary in tests of corner cases.
 
 Do not use ``assert`` in tests.
 We run CUB tests in release mode.
-Issue with ``assert`` is that it only works in debug mode.
+The issue with ``assert`` is that it only works in debug mode.
 
 If a custom type has to be tested, the following helper should be used:
 

From 0bf9dc1c1406e68e2ae4d3d722acd24d097b80ba Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 19 Jan 2024 23:09:14 +0000
Subject: [PATCH 07/21] Fix adjacent difference test

---
 .../catch2_test_device_adjacent_difference_substract_left.cu    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
index dc849c6a3f2..e4317203ce1 100644
--- a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
+++ b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu
@@ -287,6 +287,8 @@ CUB_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with large indexes",
                                          thrust::discard_iterator<>{},
                                          num_items,
                                          check_difference{d_error});
+  const int h_error = error[0];
+  REQUIRE(h_error == 0);
 }
 
 struct invocation_counter {

From d61cb8358dc7f11776bb35706b340e61573e785d Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Sun, 21 Jan 2024 22:03:08 -0800
Subject: [PATCH 08/21] Force inline

---
 cub/cub/device/device_for.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index a0063565e5e..938db91d5ae 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -65,7 +65,7 @@ struct op_wrapper_t
   InputIteratorT input;
   OpT op;
 
-  _CCCL_DEVICE void operator()(OffsetT i)
+  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
   {
     // Dereferencing `thrust::device_vector<T>` iterators returns a `thrust::device_reference<T>`
     // instead of `T`. Since user-provided operator expects `T` as an argument, we need to unwrap.
@@ -93,7 +93,7 @@ struct op_wrapper_vectorized_t
   // Type of the vector that is used to load the input data
   using vector_t = typename CubVector<T, vec_size>::Type;
 
-  _CCCL_DEVICE void operator()(OffsetT i)
+  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
   {
     // Surrounding `Bulk` call doesn't invoke this operator on invalid indices, so we don't need to
     // check for out-of-bounds access here.

From 4a2a93f220f01ee360a10b04d97888b41a7d61cf Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 22 Jan 2024 06:23:48 +0000
Subject: [PATCH 09/21] Fix typo in docs

---
 cub/cub/device/device_for.cuh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index 938db91d5ae..ae812616e20 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -282,7 +282,7 @@ public:
   //!   Number of elements to iterate over
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -352,7 +352,7 @@ public:
   //!   The end of the sequence
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -428,7 +428,7 @@ public:
   //!   Number of elements to iterate over
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to a copy of each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -501,7 +501,7 @@ public:
   //!   The end of the sequence
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to a copy of each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -615,7 +615,7 @@ public:
   //!   Number of elements to iterate over
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -674,7 +674,7 @@ public:
   //!   The end of the sequence
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -735,7 +735,7 @@ public:
   //!   Number of elements to iterate over
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to a copy of each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
@@ -795,7 +795,7 @@ public:
   //!   The end of the sequence
   //!
   //! @param[in] op
-  //!   Function object to apply to each index in the index space
+  //!   Function object to apply to a copy of each element in the range
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.

From bc744b97f56146b16cbc8a04694ce3b737565c6e Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 22 Jan 2024 06:25:49 +0000
Subject: [PATCH 10/21] Constexpr

---
 cub/cub/device/dispatch/dispatch_for.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 8d3675c40f7..78f27882aaa 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -136,8 +136,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
   using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
   using agent_t         = agent_block_striped_t<active_policy_t, OffsetT, OpT>;
 
-  const auto block_threads  = active_policy_t::block_threads;
-  const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
+  constexpr auto block_threads  = active_policy_t::block_threads;
+  constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads;
+
   const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
   const auto num_remaining  = num_items - tile_base;
   const auto items_in_tile  = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);

From ffc2391a97a2f0fd7410fde371f378b8d3d1853c Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 22 Jan 2024 06:27:39 +0000
Subject: [PATCH 11/21] Do not rely on transitive includes

---
 cub/test/catch2_test_device_bulk.cu     | 4 ++++
 cub/test/catch2_test_device_for.cu      | 3 +++
 cub/test/catch2_test_device_for_api.cu  | 2 ++
 cub/test/catch2_test_device_for_copy.cu | 2 ++
 4 files changed, 11 insertions(+)

diff --git a/cub/test/catch2_test_device_bulk.cu b/cub/test/catch2_test_device_bulk.cu
index a978edc458c..c4362362440 100644
--- a/cub/test/catch2_test_device_bulk.cu
+++ b/cub/test/catch2_test_device_bulk.cu
@@ -28,6 +28,10 @@
 #include <cub/device/device_for.cuh>
 
 #include <thrust/count.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+
+#include <cuda/std/type_traits>
 
 #include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu
index f1612fae9cc..d841481ef82 100644
--- a/cub/test/catch2_test_device_for.cu
+++ b/cub/test/catch2_test_device_for.cu
@@ -28,6 +28,9 @@
 #include <cub/device/device_for.cuh>
 
 #include <thrust/count.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+#include <thrust/equal.h>
 #include <thrust/sequence.h>
 
 #include "catch2_test_helper.h"
diff --git a/cub/test/catch2_test_device_for_api.cu b/cub/test/catch2_test_device_for_api.cu
index 14c8b119027..b964d8663b0 100644
--- a/cub/test/catch2_test_device_for_api.cu
+++ b/cub/test/catch2_test_device_for_api.cu
@@ -28,6 +28,8 @@
 #include <cub/device/device_for.cuh>
 
 #include <thrust/count.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
 
 #include "catch2_test_helper.h"
 
diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu
index bd55273270a..d7ad063f7ad 100644
--- a/cub/test/catch2_test_device_for_copy.cu
+++ b/cub/test/catch2_test_device_for_copy.cu
@@ -28,6 +28,8 @@
 #include <cub/device/device_for.cuh>
 
 #include <thrust/count.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
 #include "catch2_test_helper.h"

From 178eb8fdd6d34d293af10644693d509603ee146f Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 22 Jan 2024 06:50:10 +0000
Subject: [PATCH 12/21] Includes order

---
 cub/cub/device/dispatch/dispatch_for.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 78f27882aaa..f84d4682c82 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -28,7 +28,6 @@
 #pragma once
 
 #include <cub/config.cuh>
-#include <cub/device/dispatch/tuning/tuning_for.cuh>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -39,6 +38,7 @@
 #endif // no system header
 
 #include <cub/agent/agent_for.cuh>
+#include <cub/device/dispatch/tuning/tuning_for.cuh>
 #include <cub/thread/thread_load.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>

From 659ad966bc91e8b619fc124ec0f48b43ea5659c1 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 15:06:56 +0000
Subject: [PATCH 13/21] Missing cast

---
 cub/cub/device/dispatch/dispatch_for.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index f84d4682c82..8dfc369a1a7 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -115,7 +115,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
   const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
   const auto tile_base      = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
   const auto num_remaining  = num_items - tile_base;
-  const auto items_in_tile  = num_remaining < items_per_tile ? num_remaining : items_per_tile;
+  const auto items_in_tile  = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);
 
   if (items_in_tile == items_per_tile)
   {

From 39449fa0b7437c1d620abf385f4c627c28fffc42 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 15:13:25 +0000
Subject: [PATCH 14/21] Remove extra host annotations

---
 cub/cub/device/dispatch/dispatch_for.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 8dfc369a1a7..600dc2500ed 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -168,7 +168,7 @@ struct dispatch_t : PolicyHubT
   {}
 
   template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE
     cudaError_t Invoke(::cuda::std::false_type /* block size is not known at compile time */)
   {
     using max_policy_t = typename dispatch_t::MaxPolicy;
@@ -225,7 +225,7 @@ struct dispatch_t : PolicyHubT
   }
 
   template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE
     cudaError_t Invoke(::cuda::std::true_type /* block size is known at compile time */)
   {
     using max_policy_t = typename dispatch_t::MaxPolicy;
@@ -270,7 +270,7 @@ struct dispatch_t : PolicyHubT
   }
 
   template <typename ActivePolicyT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke()
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
   {
     constexpr bool static_block_size = ActivePolicyT::for_policy_t::block_threads > 0;
     return Invoke<ActivePolicyT>(::cuda::std::integral_constant<bool, static_block_size>{});

From c52c14ce431a65dab98837355c53841f9ad9ddfb Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 15:16:50 +0000
Subject: [PATCH 15/21] Explicitly discard op result

---
 cub/cub/agent/agent_for.cuh   | 2 +-
 cub/cub/device/device_for.cuh | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cub/cub/agent/agent_for.cuh b/cub/cub/agent/agent_for.cuh
index a6366f6f491..6e853cdcb83 100644
--- a/cub/cub/agent/agent_for.cuh
+++ b/cub/cub/agent/agent_for.cuh
@@ -72,7 +72,7 @@ struct agent_block_striped_t
 
       if (IsFullTile || idx < items_in_tile)
       {
-        op(tile_base + idx);
+        (void)op(tile_base + idx);
       }
     }
   }
diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index ae812616e20..fb119530892 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -69,7 +69,7 @@ struct op_wrapper_t
   {
     // Dereferencing `thrust::device_vector<T>` iterators returns a `thrust::device_reference<T>`
     // instead of `T`. Since user-provided operator expects `T` as an argument, we need to unwrap.
-    op(THRUST_NS_QUALIFIER::raw_reference_cast(*(input + i)));
+    (void)op(THRUST_NS_QUALIFIER::raw_reference_cast(*(input + i)));
   }
 };
 
@@ -104,14 +104,14 @@ struct op_wrapper_vectorized_t
 #pragma unroll
       for (int j = 0; j < vec_size; j++)
       {
-        op(*(reinterpret_cast<const T*>(&vec) + j));
+        (void)op(*(reinterpret_cast<const T*>(&vec) + j));
       }
     }
     else
     { // Case of partially filled vector
       for (OffsetT j = i * vec_size; j < num_items; j++)
       {
-        op(input[j]);
+        (void)op(input[j]);
       }
     }
   }

From fccf2e3690c84b3f53db036766f417dc7ace1ceb Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 15:23:22 +0000
Subject: [PATCH 16/21] Better template paramter name

---
 cub/cub/device/device_for.cuh | 89 ++++++++++++++++++-----------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index fb119530892..88a6dfff43e 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -59,10 +59,10 @@ namespace for_each
 /**
  * `op_wrapper_t` turns bulk into a for-each operation by wrapping the user-provided unary operator.
  */
-template <class OffsetT, class OpT, class InputIteratorT>
+template <class OffsetT, class OpT, class RandomAccessIteratorT>
 struct op_wrapper_t
 {
-  InputIteratorT input;
+  RandomAccessIteratorT input;
   OpT op;
 
   _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
@@ -132,24 +132,25 @@ private:
     return (reinterpret_cast<std::size_t>(ptr) & (sizeof(VectorT) - 1)) == 0;
   }
 
-  template <class InputIteratorT, class OffsetT, class OpT>
+  template <class RandomAccessIteratorT, class OffsetT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t for_each_n(
-    InputIteratorT first,
+    RandomAccessIteratorT first,
     OffsetT num_items,
     OpT op,
     cudaStream_t stream,
     ::cuda::std::false_type /* do_not_vectorize */)
   {
-    using wrapped_op_t = detail::for_each::op_wrapper_t<OffsetT, OpT, InputIteratorT>;
+    using wrapped_op_t = detail::for_each::op_wrapper_t<OffsetT, OpT, RandomAccessIteratorT>;
     return detail::for_each::dispatch_t<OffsetT, wrapped_op_t>::dispatch(num_items, wrapped_op_t{first, op}, stream);
   }
 
-  template <class InputIteratorT, class OffsetT, class OpT>
+  template <class RandomAccessIteratorT, class OffsetT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t for_each_n(
-    InputIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */)
+    RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */)
   {
     auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first);
-    using wrapped_op_t   = detail::for_each::op_wrapper_vectorized_t<OffsetT, OpT, detail::value_t<InputIteratorT>>;
+    using wrapped_op_t =
+      detail::for_each::op_wrapper_vectorized_t<OffsetT, OpT, detail::value_t<RandomAccessIteratorT>>;
 
     if (is_aligned<typename wrapped_op_t::vector_t>(unwrapped_first))
     { // Vectorize loads
@@ -259,7 +260,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam NumItemsT
@@ -286,11 +287,11 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class NumItemsT, class OpT>
+  template <class RandomAccessIteratorT, class NumItemsT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t ForEachN(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    InputIteratorT first,
+    RandomAccessIteratorT first,
     NumItemsT num_items,
     OpT op,
     cudaStream_t stream = {})
@@ -332,7 +333,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam OpT
@@ -356,12 +357,12 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class OpT>
+  template <class RandomAccessIteratorT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t ForEach(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    InputIteratorT first,
-    InputIteratorT last,
+    RandomAccessIteratorT first,
+    RandomAccessIteratorT last,
     OpT op,
     cudaStream_t stream = {})
   {
@@ -405,7 +406,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam NumItemsT
@@ -432,11 +433,11 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class NumItemsT, class OpT>
+  template <class RandomAccessIteratorT, class NumItemsT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    InputIteratorT first,
+    RandomAccessIteratorT first,
     NumItemsT num_items,
     OpT op,
     cudaStream_t stream = {})
@@ -481,7 +482,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam OpT
@@ -505,12 +506,12 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class OpT>
+  template <class RandomAccessIteratorT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
-    InputIteratorT first,
-    InputIteratorT last,
+    RandomAccessIteratorT first,
+    RandomAccessIteratorT last,
     OpT op,
     cudaStream_t stream = {})
   {
@@ -599,7 +600,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam NumItemsT
@@ -619,19 +620,19 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class NumItemsT, class OpT>
+  template <class RandomAccessIteratorT, class NumItemsT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  ForEachN(InputIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
+  ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
   {
     using offset_t            = NumItemsT;
     using use_vectorization_t = ::cuda::std::integral_constant<bool, false>;
 
     // Disable auto-vectorization for now:
     // constexpr bool use_vectorization =
-    //   detail::for_each::can_regain_copy_freedom<detail::value_t<InputIteratorT>, OpT>::value
-    //   && THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value;
+    //   detail::for_each::can_regain_copy_freedom<detail::value_t<RandomAccessIteratorT>, OpT>::value
+    //   && THRUST_NS_QUALIFIER::is_contiguous_iterator<RandomAccessIteratorT>::value;
 
-    return for_each_n<InputIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
+    return for_each_n<RandomAccessIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
   }
 
   //! @rst
@@ -661,7 +662,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam OpT
@@ -678,11 +679,11 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class OpT>
+  template <class RandomAccessIteratorT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  ForEach(InputIteratorT first, InputIteratorT last, OpT op, cudaStream_t stream = {})
+  ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})
   {
-    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<InputIteratorT>::difference_type;
+    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<RandomAccessIteratorT>::difference_type;
 
     const auto num_items = static_cast<offset_t>(THRUST_NS_QUALIFIER::distance(first, last));
 
@@ -719,7 +720,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam NumItemsT
@@ -739,17 +740,17 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class NumItemsT, class OpT>
+  template <class RandomAccessIteratorT, class NumItemsT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  ForEachCopyN(InputIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
+  ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {})
   {
-    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value,
-                  "Input iterator must be contiguous");
+    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<RandomAccessIteratorT>::value,
+                  "Iterator must be contiguous");
 
     using offset_t            = NumItemsT;
     using use_vectorization_t = ::cuda::std::integral_constant<bool, true>;
 
-    return for_each_n<InputIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
+    return for_each_n<RandomAccessIteratorT, offset_t, OpT>(first, num_items, op, stream, use_vectorization_t{});
   }
 
   //! @rst
@@ -782,7 +783,7 @@ public:
   //!
   //! @endrst
   //!
-  //! @tparam InputIteratorT
+  //! @tparam RandomAccessIteratorT
   //!   is a model of Random Access Iterator whose value type is convertible to `op`'s argument type.
   //!
   //! @tparam OpT
@@ -799,14 +800,14 @@ public:
   //!
   //! @param[in] stream
   //!   CUDA stream to launch kernels within. Default stream is `0`.
-  template <class InputIteratorT, class OpT>
+  template <class RandomAccessIteratorT, class OpT>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  ForEachCopy(InputIteratorT first, InputIteratorT last, OpT op, cudaStream_t stream = {})
+  ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {})
   {
-    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<InputIteratorT>::value,
-                  "Input iterator must be contiguous");
+    static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator<RandomAccessIteratorT>::value,
+                  "Iterator must be contiguous");
 
-    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<InputIteratorT>::difference_type;
+    using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits<RandomAccessIteratorT>::difference_type;
 
     const auto num_items = static_cast<offset_t>(THRUST_NS_QUALIFIER::distance(first, last));
 

From 22ef339ee135e16b3e59aa8d6089acd0c3e85cb2 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 15:26:52 +0000
Subject: [PATCH 17/21] Qualify for each call

---
 thrust/thrust/system/cuda/detail/for_each.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h
index e0f2d3db20b..e03b6f7c78a 100644
--- a/thrust/thrust/system/cuda/detail/for_each.h
+++ b/thrust/thrust/system/cuda/detail/for_each.h
@@ -37,7 +37,6 @@
 #endif // no system header
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <iterator>
 #include <thrust/system/cuda/config.h>
 
 #include <cub/device/device_for.cuh>
@@ -92,7 +91,7 @@ namespace cuda_cub {
     typedef typename iterator_traits<Input>::difference_type size_type;
     size_type count = static_cast<size_type>(thrust::distance(first,last));
 
-    return for_each_n(policy, first, count, op);
+    return THRUST_NS_QUALIFIER::cuda_cub::for_each_n(policy, first, count, op);
   }
 } // namespace cuda_cub
 

From 352804ab5750d1a5531e9418a61eedf6c089d68e Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 17:19:04 +0000
Subject: [PATCH 18/21] Document bench

---
 cub/benchmarks/bench/for_each/base.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cub/benchmarks/bench/for_each/base.cu b/cub/benchmarks/bench/for_each/base.cu
index 63c5495c08d..c0f38ed1292 100644
--- a/cub/benchmarks/bench/for_each/base.cu
+++ b/cub/benchmarks/bench/for_each/base.cu
@@ -55,7 +55,10 @@ void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   thrust::device_vector<T> in(elements, T{42});
 
   input_it_t d_in   = thrust::raw_pointer_cast(in.data());
-  output_it_t d_out = nullptr;
+  // `d_out` exists for visibility
+  // All inputs are equal to `42`, while the operator is searching for `0`.
+  // If the operator finds `0` in the input sequence, it's an issue leading to a segfault.
+  output_it_t d_out = nullptr; 
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements);

From b254f2a020a4faca565adb55678dd3afb89fc312 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 17:30:02 +0000
Subject: [PATCH 19/21] Improve readability

---
 cub/cub/device/dispatch/dispatch_for.cuh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 600dc2500ed..d3b266fdf2c 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -74,6 +74,9 @@ struct first_parameter<R (C::*)(A) const>
   using type = A;
 };
 
+template <class Fn>
+using first_parameter_t = typename first_parameter<decltype(&Fn::operator())>::type;
+
 template <class Value, class Fn, class = void>
 struct has_unique_value_overload : ::cuda::std::false_type
 {};
@@ -84,8 +87,8 @@ struct has_unique_value_overload<
   Value,
   Fn,
   typename ::cuda::std::enable_if<
-              !::cuda::std::is_reference<typename first_parameter<decltype(&Fn::operator())>::type>::value && 
-              ::cuda::std::is_convertible<Value, typename first_parameter<decltype(&Fn::operator())>::type
+              !::cuda::std::is_reference<first_parameter_t<Fn>>::value && 
+              ::cuda::std::is_convertible<Value, first_parameter_t<Fn>
              >::value>::type>
     : ::cuda::std::true_type
 {};

From 9eaac6810eb8d3b0c7b4008ccb394170bc1454e8 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 17:30:10 +0000
Subject: [PATCH 20/21] Better type name

---
 cub/test/catch2_test_device_for.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu
index d841481ef82..885aa5ae6c5 100644
--- a/cub/test/catch2_test_device_for.cu
+++ b/cub/test/catch2_test_device_for.cu
@@ -68,7 +68,7 @@ public:
   }
 };
 
-struct bad_operator_t
+struct referencing_operator_t
 {
   const std::size_t* d_input;
   const std::size_t magic_value;
@@ -126,7 +126,7 @@ CUB_TEST("Device for each works with bad operators", "[for][device]")
   thrust::device_vector<std::size_t> input(num_items, magic_value);
   const std::size_t* d_input = thrust::raw_pointer_cast(input.data());
 
-  device_for_each(input.begin(), input.end(), bad_operator_t{d_input, magic_value});
+  device_for_each(input.begin(), input.end(), referencing_operator_t{d_input, magic_value});
 
   REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{})));
 }
@@ -206,7 +206,7 @@ CUB_TEST("Device for each n works with bad operators", "[for][device]", offset_t
   thrust::device_vector<std::size_t> input(num_items, magic_value);
   const std::size_t* d_input = thrust::raw_pointer_cast(input.data());
 
-  device_for_each_n(input.begin(), num_items, bad_operator_t{d_input, magic_value});
+  device_for_each_n(input.begin(), num_items, referencing_operator_t{d_input, magic_value});
 
   REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{})));
 }

From dba7579fa7f6ad22098e3426f8183fca96e527d1 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko <evtushenko.georgy@gmail.com>
Date: Thu, 25 Jan 2024 18:07:26 +0000
Subject: [PATCH 21/21] Improve bulk docs

---
 cub/cub/device/device_for.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh
index 88a6dfff43e..c3a3435180c 100644
--- a/cub/cub/device/device_for.cuh
+++ b/cub/cub/device/device_for.cuh
@@ -173,6 +173,9 @@ public:
   //! +++++++++++++++++++++++++++++++++++++++++++++
   //!
   //! Applies the function object ``op`` to each index in the provided shape
+  //! The algorithm is similar to 
+  //! `bulk <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2300r5.html#design-sender-adaptor-bulk>`_
+  //! from P2300.
   //!
   //! - The return value of ``op``, if any, is ignored.
   //! - @devicestorage
@@ -529,6 +532,9 @@ public:
   //! +++++++++++++++++++++++++++++++++++++++++++++
   //!
   //! Applies the function object ``op`` to each index in the provided shape
+  //! The algorithm is similar to 
+  //! `bulk <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2300r5.html#design-sender-adaptor-bulk>`_
+  //! from P2300.
   //!
   //! - The return value of ``op``, if any, is ignored.
   //!