diff --git a/cub/benchmarks/bench/for_each/base.cu b/cub/benchmarks/bench/for_each/base.cu
new file mode 100644
index 00000000000..c0f38ed1292
--- /dev/null
+++ b/cub/benchmarks/bench/for_each/base.cu
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <nvbench_helper.cuh>
+
+template <class T>
+struct op_t
+{
+  int* d_count{};
+
+  __device__ void operator()(T val) const
+  {
+    if (val == T{})
+    {
+      atomicAdd(d_count, 1);
+    }
+  }
+};
+
+template <class T, class OffsetT>
+void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  using input_it_t  = const T*;
+  using output_it_t = int*;
+  using offset_t    = OffsetT;
+
+  const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));
+
+  thrust::device_vector<T> in(elements, T{42});
+
+  input_it_t d_in = thrust::raw_pointer_cast(in.data());
+
+  // `d_out` exists only so the operator has something to update: all inputs
+  // equal `42` while the operator searches for `0`, so it never fires. If the
+  // operator ever found `0`, the `atomicAdd` on this null pointer would segfault.
+  output_it_t d_out = nullptr;
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+
+  op_t<T> op{d_out};
+
+  std::size_t temp_size{};
+  cub::DeviceFor::ForEachN(nullptr, temp_size, d_in, elements, op);
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::DeviceFor::ForEachN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
+  });
+}
+
+NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/cub/benchmarks/bench/for_each/copy.cu b/cub/benchmarks/bench/for_each/copy.cu
new file mode 100644
index 00000000000..4da8373c542
--- /dev/null
+++ b/cub/benchmarks/bench/for_each/copy.cu
@@ -0,0 +1,79 @@
+/******************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_for.cuh>
+
+#include <nvbench_helper.cuh>
+
+template <class T>
+struct op_t
+{
+  int* d_count{};
+
+  __device__ void operator()(T val) const
+  {
+    if (val == T{})
+    {
+      atomicAdd(d_count, 1);
+    }
+  }
+};
+
+template <class T, class OffsetT>
+void for_each(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  using input_it_t  = const T*;
+  using output_it_t = int*;
+  using offset_t    = OffsetT;
+
+  const auto elements = static_cast<offset_t>(state.get_int64("Elements{io}"));
+
+  thrust::device_vector<T> in(elements, T{42});
+
+  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
+  output_it_t d_out = nullptr;
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+
+  op_t<T> op{d_out};
+
+  std::size_t temp_size{};
+  cub::DeviceFor::ForEachCopyN(nullptr, temp_size, d_in, elements, op);
+
+  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
+  auto* temp_storage = thrust::raw_pointer_cast(temp.data());
+
+  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
+    cub::DeviceFor::ForEachCopyN(temp_storage, temp_size, d_in, elements, op, launch.get_stream());
+  });
+}
+
+NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/cub/cub/agent/agent_for.cuh b/cub/cub/agent/agent_for.cuh
new file mode 100644
index 00000000000..6e853cdcb83
--- /dev/null
+++ b/cub/cub/agent/agent_for.cuh
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace for_each +{ + +template +struct policy_t +{ + static constexpr int block_threads = BlockThreads; + static constexpr int items_per_thread = ItemsPerThread; +}; + +template +struct agent_block_striped_t +{ + static constexpr int items_per_thread = PolicyT::items_per_thread; + + OffsetT tile_base; + OpT op; + + template + _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int items_in_tile, int block_threads) + { +#pragma unroll + for (int item = 0; item < items_per_thread; item++) + { + const auto idx = static_cast(block_threads * item + threadIdx.x); + + if (IsFullTile || idx < items_in_tile) + { + (void)op(tile_base + idx); + } + } + } +}; + +} // namespace for_each +} // namespace detail + +CUB_NAMESPACE_END diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 3194fe11e2d..5a0b870d017 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -304,7 +304,7 @@ struct AgentUniqueByKey // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types - #pragma unroll(1) + #pragma unroll 1 for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index 73136a6077d..b64e5b4c5a4 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -74,6 +74,7 @@ #include #include #include +#include // Grid // #include diff --git a/cub/cub/device/device_for.cuh b/cub/cub/device/device_for.cuh new file mode 100644 index 00000000000..c3a3435180c --- /dev/null +++ b/cub/cub/device/device_for.cuh @@ -0,0 +1,824 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
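Note on the traversal scheme: `agent_block_striped_t` above walks each tile in block-striped order, so on iteration `item`, thread `threadIdx.x` touches index `tile_base + item * block_threads + threadIdx.x`; consecutive threads hit consecutive addresses, keeping global loads coalesced. A standalone sketch of the same indexing (kernel and names are illustrative, not part of this change):

__global__ void block_striped_sketch(const int* in, int items_in_tile, int items_per_thread)
{
  // Thread t of a block processes items t, t + blockDim.x, t + 2 * blockDim.x, ...
  for (int item = 0; item < items_per_thread; item++)
  {
    const int idx = item * blockDim.x + threadIdx.x;
    if (idx < items_in_tile) // skipped for full tiles, as in the IsFullTile branch
    {
      // ... apply the user operator to in[idx] ...
    }
  }
}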
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +namespace for_each +{ + +/** + * `op_wrapper_t` turns bulk into a for-each operation by wrapping the user-provided unary operator. + */ +template +struct op_wrapper_t +{ + RandomAccessIteratorT input; + OpT op; + + _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) + { + // Dereferencing `thrust::device_vector` iterators returns a `thrust::device_reference` + // instead of `T`. Since user-provided operator expects `T` as an argument, we need to unwrap. + (void)op(THRUST_NS_QUALIFIER::raw_reference_cast(*(input + i))); + } +}; + +/** + * `op_wrapper_vectorized_t` turns bulk into a for-each-copy operation. + * `op_wrapper_vectorized_t` is similar to `op_wrapper_t` but does not provide any guarantees about + * address of the input parameter. `OpT` might be given a copy of the value or an actual reference + * to the input iterator value (depending on the alignment of input iterator) + */ +template +struct op_wrapper_vectorized_t +{ + const T* input; // Raw pointer to the input data + OpT op; // User-provided operator + OffsetT partially_filled_vector_id; // Index of the vector that doesn't have all elements + OffsetT num_items; // Total number of non-vectorized items + + // TODO Can be extracted into tuning + constexpr static int vec_size = 4; + + // Type of the vector that is used to load the input data + using vector_t = typename CubVector::Type; + + _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) + { + // Surrounding `Bulk` call doesn't invoke this operator on invalid indices, so we don't need to + // check for out-of-bounds access here. 
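+    // Each index `i` addresses one `vector_t`, i.e. `vec_size` consecutive
+    // elements of `input`. Every vector except the one at
+    // `partially_filled_vector_id` is fully backed by valid items and takes
+    // the vectorized branch; the trailing, partially filled vector (present
+    // only when `num_items % vec_size != 0`) is traversed element by element.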
+ if (i != partially_filled_vector_id) + { // Case of fully filled vector + const vector_t vec = *reinterpret_cast(input + vec_size * i); + +#pragma unroll + for (int j = 0; j < vec_size; j++) + { + (void)op(*(reinterpret_cast(&vec) + j)); + } + } + else + { // Case of partially filled vector + for (OffsetT j = i * vec_size; j < num_items; j++) + { + (void)op(input[j]); + } + } + } +}; + +} // namespace for_each +} // namespace detail + +struct DeviceFor +{ +private: + /** + * Checks if the pointer is aligned to the given vector type + */ + template + CUB_RUNTIME_FUNCTION static bool is_aligned(const T* ptr) + { + return (reinterpret_cast(ptr) & (sizeof(VectorT) - 1)) == 0; + } + + template + CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( + RandomAccessIteratorT first, + OffsetT num_items, + OpT op, + cudaStream_t stream, + ::cuda::std::false_type /* do_not_vectorize */) + { + using wrapped_op_t = detail::for_each::op_wrapper_t; + return detail::for_each::dispatch_t::dispatch(num_items, wrapped_op_t{first, op}, stream); + } + + template + CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( + RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) + { + auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first); + using wrapped_op_t = + detail::for_each::op_wrapper_vectorized_t>; + + if (is_aligned(unwrapped_first)) + { // Vectorize loads + const OffsetT num_vec_items = cub::DivideAndRoundUp(num_items, wrapped_op_t::vec_size); + + return detail::for_each::dispatch_t::dispatch( + num_vec_items, + wrapped_op_t{ + unwrapped_first, op, num_items % wrapped_op_t::vec_size ? num_vec_items - 1 : num_vec_items, num_items}, + stream); + } + + // Fallback to non-vectorized version + return for_each_n(first, num_items, op, stream, ::cuda::std::false_type{}); + } + +public: + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each index in the provided shape + //! The algorithm is similar to + //! `bulk `_ + //! from P2300. + //! + //! - The return value of ``op``, if any, is ignored. + //! - @devicestorage + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use Bulk to square each element in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-t + //! :end-before: example-end bulk-square-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-temp-storage + //! :end-before: example-end bulk-temp-storage + //! + //! @endrst + //! + //! @tparam ShapeT + //! is an integral type + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, + //! the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] shape + //! Shape of the index space to iterate over + //! + //! @param[in] op + //! Function object to apply to each index in the index space + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. 
Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t + Bulk(void* d_temp_storage, size_t& temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {}) + { + static_assert(::cuda::std::is_integral::value, "ShapeT must be an integral type"); + + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return Bulk(shape, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)`` + //! + //! - The return value of ``op``, if any, is ignored. + //! - @devicestorage + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-ref-t + //! :end-before: example-end bulk-square-ref-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-n-temp-storage + //! :end-before: example-end for-each-n-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam NumItemsT + //! is an integral type representing the number of elements to iterate over + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, + //! the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] num_items + //! Number of elements to iterate over + //! + //! @param[in] op + //! Function object to apply to each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t ForEachN( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT first, + NumItemsT num_items, + OpT op, + cudaStream_t stream = {}) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return ForEachN(first, num_items, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, last)`` + //! + //! - The return value of ``op``, if any, is ignored. + //! - @devicestorage + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-ref-t + //! :end-before: example-end bulk-square-ref-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-temp-storage + //! 
:end-before: example-end for-each-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, + //! the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] last + //! The end of the sequence + //! + //! @param[in] op + //! Function object to apply to each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t ForEach( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT first, + RandomAccessIteratorT last, + OpT op, + cudaStream_t stream = {}) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return ForEach(first, last, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``. + //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements. + //! This relaxation allows ``ForEachCopyN`` to vectorize loads. + //! + //! - Allowed to invoke ``op`` on copies of the elements + //! - The return value of ``op``, if any, is ignored. + //! - @devicestorage + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-odd-count-t + //! :end-before: example-end bulk-odd-count-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-copy-n-temp-storage + //! :end-before: example-end for-each-copy-n-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam NumItemsT + //! is an integral type representing the number of elements to iterate over + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, + //! the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] num_items + //! Number of elements to iterate over + //! + //! @param[in] op + //! Function object to apply to a copy of each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. 
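For reference, the temporary-storage overloads above never actually need scratch memory: they report one byte and forward to the storage-free overloads further down, keeping the standard two-call CUB protocol so `DeviceFor` can be driven like every other CUB algorithm. A sketch of that protocol (the `square_ref_t` functor is illustrative, not part of this change):

#include <cub/device/device_for.cuh>

#include <thrust/device_vector.h>

struct square_ref_t // hypothetical user functor
{
  __device__ void operator()(int& i) const { i = i * i; }
};

int main()
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> vec(num_items, 2);

  // First call: no work is done; the required size lands in temp_storage_bytes.
  std::size_t temp_storage_bytes{};
  cub::DeviceFor::ForEachN(nullptr, temp_storage_bytes, vec.begin(), num_items, square_ref_t{});

  // Second call: same arguments plus the allocation; the algorithm runs.
  thrust::device_vector<unsigned char> temp_storage(temp_storage_bytes);
  cub::DeviceFor::ForEachN(
    thrust::raw_pointer_cast(temp_storage.data()), temp_storage_bytes, vec.begin(), num_items, square_ref_t{});
}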
+ template + CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT first, + NumItemsT num_items, + OpT op, + cudaStream_t stream = {}) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return ForEachCopyN(first, num_items, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, last)``. + //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements. + //! This relaxation allows ``ForEachCopy`` to vectorize loads. + //! + //! - Allowed to invoke ``op`` on copies of the elements + //! - The return value of ``op``, if any, is ignored. + //! - @devicestorage + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-odd-count-t + //! :end-before: example-end bulk-odd-count-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-copy-temp-storage + //! :end-before: example-end for-each-copy-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, + //! the required allocation size is written to `temp_storage_bytes` and no work is done. + //! + //! @param[in,out] temp_storage_bytes + //! Reference to size in bytes of `d_temp_storage` allocation + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] last + //! The end of the sequence + //! + //! @param[in] op + //! Function object to apply to a copy of each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorT first, + RandomAccessIteratorT last, + OpT op, + cudaStream_t stream = {}) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return ForEachCopy(first, last, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each index in the provided shape + //! The algorithm is similar to + //! `bulk `_ + //! from P2300. + //! + //! - The return value of ``op``, if any, is ignored. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use Bulk to square each element in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-t + //! :end-before: example-end bulk-square-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! 
:language: c++ + //! :dedent: + //! :start-after: example-begin bulk-wo-temp-storage + //! :end-before: example-end bulk-wo-temp-storage + //! + //! @endrst + //! + //! @tparam ShapeT + //! is an integral type + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] shape + //! Shape of the index space to iterate over + //! + //! @param[in] op + //! Function object to apply to each index in the index space + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {}) + { + static_assert(::cuda::std::is_integral::value, "ShapeT must be an integral type"); + using offset_t = ShapeT; + return detail::for_each::dispatch_t::dispatch(static_cast(shape), op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)`` + //! + //! - The return value of ``op``, if any, is ignored. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-ref-t + //! :end-before: example-end bulk-square-ref-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-n-wo-temp-storage + //! :end-before: example-end for-each-n-wo-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam NumItemsT + //! is an integral type representing the number of elements to iterate over + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] num_items + //! Number of elements to iterate over + //! + //! @param[in] op + //! Function object to apply to each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) + { + using offset_t = NumItemsT; + using use_vectorization_t = ::cuda::std::integral_constant; + + // Disable auto-vectorization for now: + // constexpr bool use_vectorization = + // detail::for_each::can_regain_copy_freedom, OpT>::value + // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; + + return for_each_n(first, num_items, op, stream, use_vectorization_t{}); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, last)`` + //! + //! - The return value of ``op``, if any, is ignored. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector. + //! + //! .. 
literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-square-ref-t + //! :end-before: example-end bulk-square-ref-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-wo-temp-storage + //! :end-before: example-end for-each-wo-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] last + //! The end of the sequence + //! + //! @param[in] op + //! Function object to apply to each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) + { + using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; + + const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); + + return ForEachN(first, num_items, op, stream); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``. + //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements. + //! This relaxation allows ``ForEachCopyN`` to vectorize loads. + //! + //! - Allowed to invoke ``op`` on copies of the elements + //! - The return value of ``op``, if any, is ignored. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-odd-count-t + //! :end-before: example-end bulk-odd-count-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-copy-n-wo-temp-storage + //! :end-before: example-end for-each-copy-n-wo-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam NumItemsT + //! is an integral type representing the number of elements to iterate over + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] num_items + //! Number of elements to iterate over + //! + //! @param[in] op + //! Function object to apply to a copy of each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. 
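The practical difference from `ForEachN` is that the functor must tolerate being handed a copy, which is exactly what permits the vectorized-load path. A sketch of the odd-count use case the docs reference (functor name and values are illustrative, not taken from the test file):

#include <cub/device/device_for.cuh>

#include <thrust/device_vector.h>

struct odd_count_t // hypothetical user functor
{
  int* d_count;

  // Takes its argument by value on purpose: ForEachCopyN may pass a copy
  // produced by a vectorized load, so the argument's address is meaningless.
  __device__ void operator()(int val) const
  {
    if (val % 2)
    {
      atomicAdd(d_count, 1);
    }
  }
};

int main()
{
  thrust::device_vector<int> in(1024, 3); // all odd
  thrust::device_vector<int> count(1);

  cub::DeviceFor::ForEachCopyN(thrust::raw_pointer_cast(in.data()),
                               static_cast<int>(in.size()),
                               odd_count_t{thrust::raw_pointer_cast(count.data())});
  // count[0] == 1024 afterwards
}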
+ template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) + { + static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, + "Iterator must be contiguous"); + + using offset_t = NumItemsT; + using use_vectorization_t = ::cuda::std::integral_constant; + + return for_each_n(first, num_items, op, stream, use_vectorization_t{}); + } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! Applies the function object ``op`` to each element in the range ``[first, last)``. + //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements. + //! This relaxation allows ``ForEachCopy`` to vectorize loads. + //! + //! - Allowed to invoke ``op`` on copies of the elements + //! - The return value of ``op``, if any, is ignored. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector. + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin bulk-odd-count-t + //! :end-before: example-end bulk-odd-count-t + //! + //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin for-each-copy-wo-temp-storage + //! :end-before: example-end for-each-copy-wo-temp-storage + //! + //! @endrst + //! + //! @tparam RandomAccessIteratorT + //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. + //! + //! @tparam OpT + //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) + //! + //! @param[in] first + //! The beginning of the sequence + //! + //! @param[in] last + //! The end of the sequence + //! + //! @param[in] op + //! Function object to apply to a copy of each element in the range + //! + //! @param[in] stream + //! CUDA stream to launch kernels within. Default stream is `0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t + ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) + { + static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, + "Iterator must be contiguous"); + + using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; + + const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); + + return ForEachCopyN(first, num_items, op, stream); + } +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh new file mode 100644 index 00000000000..d3b266fdf2c --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
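On the `static_assert` in the two `ForEachCopy*` overloads above: vectorization needs a raw address, so only contiguous iterators are accepted. Raw device pointers and `thrust::device_vector` iterators qualify; fancy iterators do not. A sketch (assuming the surrounding names):

struct noop_t // hypothetical functor
{
  __device__ void operator()(int) const {}
};

thrust::device_vector<int> vec(16);

cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), noop_t{}); // OK: contiguous

// Fails to compile: thrust::counting_iterator is not a contiguous iterator.
// cub::DeviceFor::ForEachCopy(thrust::make_counting_iterator(0),
//                             thrust::make_counting_iterator(16), noop_t{});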
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +namespace for_each +{ + +template +struct first_parameter +{ + using type = void; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +struct first_parameter +{ + using type = A; +}; + +template +using first_parameter_t = typename first_parameter::type; + +template +struct has_unique_value_overload : ::cuda::std::false_type +{}; + +// clang-format off +template +struct has_unique_value_overload< + Value, + Fn, + typename ::cuda::std::enable_if< + !::cuda::std::is_reference>::value && + ::cuda::std::is_convertible + >::value>::type> + : ::cuda::std::true_type +{}; + +// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable. +// This can be observable if the unary operator takes parameter by reference and modifies it or uses address. +// The trait below checks if the freedom to copy trivial types can be regained. +template +using can_regain_copy_freedom = + ::cuda::std::integral_constant< + bool, + ::cuda::std::is_trivially_constructible::value && + ::cuda::std::is_trivially_copy_assignable::value && + :: cuda::std::is_trivially_move_assignable::value && + ::cuda::std::is_trivially_destructible::value && + has_unique_value_overload::value>; +// clang-format on + +// This kernel is used when the block size is not known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + const auto block_threads = static_cast(blockDim.x); + const auto items_per_tile = active_policy_t::items_per_thread * block_threads; + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? 
num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +// This kernel is used when the block size is known at compile time +template +CUB_DETAIL_KERNEL_ATTRIBUTES // +__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // + void static_kernel(OffsetT num_items, OpT op) +{ + using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; + using agent_t = agent_block_striped_t; + + constexpr auto block_threads = active_policy_t::block_threads; + constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +// The dispatch layer is in the detail namespace until we figure out tuning API +template +struct dispatch_t : PolicyHubT +{ + OffsetT num_items; + OpT op; + cudaStream_t stream; + + CUB_RUNTIME_FUNCTION dispatch_t(OffsetT num_items, OpT op, cudaStream_t stream) + : num_items(num_items) + , op(op) + , stream(stream) + {} + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE + cudaError_t Invoke(::cuda::std::false_type /* block size is not known at compile time */) + { + using max_policy_t = typename dispatch_t::MaxPolicy; + + if (num_items == 0) + { + return cudaSuccess; + } + + int block_threads = 256; + cudaError_t error = cudaSuccess; + + NV_IF_TARGET(NV_IS_HOST, + (int _{}; // + error = cudaOccupancyMaxPotentialBlockSize( + &_, &block_threads, detail::for_each::dynamic_kernel);)); + + error = CubDebug(error); + if (cudaSuccess != error) + { + return error; + } + + constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; + + const auto tile_size = static_cast(block_threads * items_per_thread); + const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); + +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), " + "%d items per thread\n", + static_cast(num_tiles), + static_cast(block_threads), + reinterpret_cast(stream), + static_cast(items_per_thread)); +#endif + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + static_cast(num_tiles), static_cast(block_threads), 0, stream) + .doit(detail::for_each::dynamic_kernel, num_items, op); + error = CubDebug(error); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) + { + CubDebug(error = SyncStream(stream)); + } + + return error; + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE + cudaError_t Invoke(::cuda::std::true_type /* block size is known at compile time */) + { + using max_policy_t = typename dispatch_t::MaxPolicy; + + if (num_items == 0) + { + return cudaSuccess; + } + + cudaError_t error = cudaSuccess; + constexpr int block_threads = ActivePolicyT::for_policy_t::block_threads; + constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; + + const auto tile_size = static_cast(block_threads * items_per_thread); + 
const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); + +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), " + "%d items per thread\n", + static_cast(num_tiles), + static_cast(block_threads), + reinterpret_cast(stream), + static_cast(items_per_thread)); +#endif + + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + static_cast(num_tiles), static_cast(block_threads), 0, stream) + .doit(detail::for_each::static_kernel, num_items, op); + error = CubDebug(error); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) + { + CubDebug(error = SyncStream(stream)); + } + + return error; + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + constexpr bool static_block_size = ActivePolicyT::for_policy_t::block_threads > 0; + return Invoke(::cuda::std::integral_constant{}); + } + + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream) + { + using max_policy_t = typename dispatch_t::MaxPolicy; + + int ptx_version = 0; + cudaError_t error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + + dispatch_t dispatch(num_items, op, stream); + + error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); + + return error; + } +}; + +} // namespace for_each + +} // namespace detail + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_for.cuh b/cub/cub/device/dispatch/tuning/tuning_for.cuh new file mode 100644 index 00000000000..759d7e632e5 --- /dev/null +++ b/cub/cub/device/dispatch/tuning/tuning_for.cuh @@ -0,0 +1,63 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
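The two `Invoke` overloads above differ only in where the block size comes from: the static path bakes `for_policy_t::block_threads` into `__launch_bounds__`, while the dynamic path queries the occupancy API at run time (host side only) and launches a kernel compiled without launch bounds. A host-side sketch of the dynamic recipe (kernel and constants are illustrative):

__global__ void for_each_sketch_kernel(int num_items) { /* ... */ }

void launch_sketch(int num_items, cudaStream_t stream)
{
  int min_grid_size{};
  int block_threads{};
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_threads, for_each_sketch_kernel);

  constexpr int items_per_thread = 2; // mirrors policy_t<256, 2> below
  const int tile_size = block_threads * items_per_thread;
  const int num_tiles = (num_items + tile_size - 1) / tile_size; // DivideAndRoundUp
  for_each_sketch_kernel<<<num_tiles, block_threads, 0, stream>>>(num_items);
}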
+ * + ******************************************************************************/ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ +namespace for_each +{ + +struct policy_hub_t +{ + struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t> + { + using for_policy_t = policy_t<256, 2>; + }; + + using MaxPolicy = policy_350_t; +}; + +} // namespace for_each +} // namespace detail + +CUB_NAMESPACE_END diff --git a/cub/docs/repo.toml b/cub/docs/repo.toml index 0d306e752c5..7cff1e571cf 100644 --- a/cub/docs/repo.toml +++ b/cub/docs/repo.toml @@ -60,7 +60,7 @@ doxygen_aliases = [ "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", "blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.", "warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.", - "devicestorage=When ``d_temp_storage`` is `NULL`, no work is done and the required allocation size is returned in ``temp_storage_bytes``.", + "devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.", "devicestorageP=This operation requires a relatively small allocation of temporary device storage that is ``O(P)``, where ``P`` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size ``N``).", "devicestorageNP=This operation requires an allocation of temporary device storage that is ``O(N+P)``, where ``N`` is the length of the input and ``P`` is the number of streaming multiprocessors on the device.", "devicestorageNCP=This operation requires a relatively small allocation of temporary device storage that is ``O(N/C + P)``, where ``N`` is the length of the input, ``C`` is the number of concurrent threads that can be actively scheduled on each streaming multiprocessor (typically several thousand), and ``P`` is the number of streaming multiprocessors on the device.", diff --git a/cub/docs/test_overview.rst b/cub/docs/test_overview.rst index 6d0de733954..0f5ca4fabdf 100644 --- a/cub/docs/test_overview.rst +++ b/cub/docs/test_overview.rst @@ -119,6 +119,10 @@ It's strongly advised to always use ``c2h::gen`` to produce input data. Other data generation methods might be used if absolutely necessary in tests of corner cases. +Do not use ``assert`` in tests. +We run CUB tests in release mode. +The issue with ``assert`` is that it only works in debug mode. + If a custom type has to be tested, the following helper should be used: .. 
code-block:: c++ diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu index 9e44e4290f8..e4317203ce1 100644 --- a/cub/test/catch2_test_device_adjacent_difference_substract_left.cu +++ b/cub/test/catch2_test_device_adjacent_difference_substract_left.cu @@ -262,11 +262,18 @@ CUB_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with a different outp REQUIRE(reference == out); } -struct check_difference { +struct check_difference +{ + int *d_error; + template - __device__ T operator()(const T& lhs, const T& rhs) const noexcept { + __device__ T operator()(const T& lhs, const T& rhs) const noexcept + { const T result = lhs - rhs; - assert(result == 1); + if (result != 1) + { + atomicAdd(d_error, 1); + } return result; } }; @@ -274,10 +281,14 @@ struct check_difference { CUB_TEST("DeviceAdjacentDifference::SubtractLeftCopy works with large indexes", "[device][adjacent_difference]") { constexpr cuda::std::size_t num_items = 1ll << 33; + thrust::device_vector error(1); + int *d_error = thrust::raw_pointer_cast(error.data()); adjacent_difference_subtract_left_copy(thrust::counting_iterator{0}, thrust::discard_iterator<>{}, num_items, - check_difference{}); + check_difference{d_error}); + const int h_error = error[0]; + REQUIRE(h_error == 0); } struct invocation_counter { diff --git a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu index 96e2c82a5fc..a40dd531494 100644 --- a/cub/test/catch2_test_device_adjacent_difference_substract_right.cu +++ b/cub/test/catch2_test_device_adjacent_difference_substract_right.cu @@ -304,11 +304,18 @@ CUB_TEST("DeviceAdjacentDifference::SubtractRightCopy works with a different out REQUIRE(reference == out); } -struct check_difference { +struct check_difference +{ + int *d_error; + template - __device__ T operator()(const T& lhs, const T& rhs) const noexcept { - const T result = lhs - rhs; - assert(result == 1); + __device__ T operator()(const T& lhs, const T& rhs) const noexcept + { + const T result = rhs - lhs; + if (result != 1) + { + atomicAdd(d_error, 1); + } return result; } }; @@ -316,10 +323,14 @@ struct check_difference { CUB_TEST("DeviceAdjacentDifference::SubtractRightCopy works with large indexes", "[device][adjacent_difference]") { constexpr cuda::std::size_t num_items = 1ll << 33; + thrust::device_vector error(1); + int *d_error = thrust::raw_pointer_cast(error.data()); adjacent_difference_subtract_right_copy(thrust::counting_iterator{0}, thrust::discard_iterator<>{}, num_items, - check_difference{}); + check_difference{d_error}); + const int h_error = error[0]; + REQUIRE(h_error == 0); } struct invocation_counter { diff --git a/cub/test/catch2_test_device_bulk.cu b/cub/test/catch2_test_device_bulk.cu new file mode 100644 index 00000000000..c4362362440 --- /dev/null +++ b/cub/test/catch2_test_device_bulk.cu @@ -0,0 +1,80 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
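The two test changes above replace device-side `assert` with the pattern the testing guide now mandates: accumulate failures into a device counter and verify it on the host, which keeps the check alive in release builds. The idiom in isolation (a sketch):

thrust::device_vector<int> error(1); // value-initialized to zero
int* d_error = thrust::raw_pointer_cast(error.data());

// ... run the algorithm; inside the device functor:
//     if (result != expected) { atomicAdd(d_error, 1); }

REQUIRE(error[0] == 0); // copies the counter back and fails the test if set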
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include +#include + +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::Bulk, device_bulk); + +using offset_type = c2h::type_list; + +template +struct incrementer_t +{ + int* d_counts; + + template + __device__ void operator()(OffsetT i) + { + static_assert(cuda::std::is_same::value, "T and OffsetT must be the same type"); + atomicAdd(d_counts + i, 1); // Check if `i` was served more than once + } +}; + +CUB_TEST("Device bulk works", "[bulk][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + }))); + + thrust::device_vector counts(num_items); + int* d_counts = thrust::raw_pointer_cast(counts.data()); + + device_bulk(num_items, incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu new file mode 100644 index 00000000000..885aa5ae6c5 --- /dev/null +++ b/cub/test/catch2_test_device_for.cu @@ -0,0 +1,242 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEach, device_for_each); +DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachN, device_for_each_n); + +struct incrementer_t +{ + int* d_counts; + + template + __device__ void operator()(OffsetT i) + { + atomicAdd(d_counts + i, 1); // Check if `i` was served more than once + } +}; + +template +class offset_proxy_t +{ + OffsetT m_offset; + +public: + __host__ __device__ offset_proxy_t(OffsetT offset) + : m_offset(offset) + {} + + __host__ __device__ operator OffsetT() const + { + return m_offset; + } +}; + +struct referencing_operator_t +{ + const std::size_t* d_input; + const std::size_t magic_value; + + __device__ void operator()(const std::size_t& i) const + { + if (i == magic_value) + { + const std::size_t* d_ptr = &i; + const auto offset = static_cast(d_ptr - d_input); + const_cast(i) = offset; + } + } +}; + +CUB_TEST("Device for each works", "[for][device]") +{ + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + using offset_t = int; + + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + thrust::device_vector> input(num_items, offset_t{}); + thrust::sequence(input.begin(), input.end(), offset_t{}); + thrust::device_vector counts(num_items); + int* d_counts = thrust::raw_pointer_cast(counts.data()); + + device_for_each(input.begin(), input.end(), incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +CUB_TEST("Device for each works with bad operators", "[for][device]") +{ + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const std::size_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const std::size_t magic_value = num_items + 1; + thrust::device_vector input(num_items, magic_value); + const std::size_t* d_input = thrust::raw_pointer_cast(input.data()); + + device_for_each(input.begin(), input.end(), referencing_operator_t{d_input, magic_value}); + + REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{}))); +} + +CUB_TEST("Device for each works with unaligned vectors", "[for][device]") +{ + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const int num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + 
max_items, + })); + + const int offset = GENERATE(1, 2, 3); + + thrust::device_vector counts(num_items); + thrust::device_vector input(num_items + offset); + thrust::sequence(input.begin() + offset, input.end()); + + int* d_counts = thrust::raw_pointer_cast(counts.data()); + int* d_input = thrust::raw_pointer_cast(input.data()) + offset; + + device_for_each(d_input, d_input + num_items, incrementer_t{d_counts}); + + const int num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +using offset_type = c2h::type_list; + +CUB_TEST("Device for each n works", "[for][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + }))); + + thrust::device_vector> input(num_items, offset_t{}); + thrust::sequence(input.begin(), input.end(), offset_t{}); + + thrust::device_vector counts(num_items); + int* d_counts = thrust::raw_pointer_cast(counts.data()); + + device_for_each_n(input.begin(), num_items, incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +CUB_TEST("Device for each n works with bad operators", "[for][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + }))); + + const std::size_t magic_value = num_items + 1; + thrust::device_vector input(num_items, magic_value); + const std::size_t* d_input = thrust::raw_pointer_cast(input.data()); + + device_for_each_n(input.begin(), num_items, referencing_operator_t{d_input, magic_value}); + + REQUIRE(thrust::equal(input.begin(), input.end(), thrust::make_counting_iterator(std::size_t{}))); +} + +CUB_TEST("Device for each n works with unaligned vectors", "[for][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + }))); + + const int offset = GENERATE(1, 2, 3); + + thrust::device_vector counts(num_items); + thrust::device_vector input(num_items + offset); + thrust::sequence(input.begin() + offset, input.end()); + + int* d_counts = thrust::raw_pointer_cast(counts.data()); + int* d_input = thrust::raw_pointer_cast(input.data()) + offset; + + device_for_each_n(d_input, num_items, incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} diff --git a/cub/test/catch2_test_device_for_api.cu b/cub/test/catch2_test_device_for_api.cu new file mode 100644 index 00000000000..b964d8663b0 --- /dev/null +++ b/cub/test/catch2_test_device_for_api.cu @@ -0,0 +1,265 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include +#include + +#include "catch2_test_helper.h" + +// example-begin bulk-square-t +struct square_t +{ + int* d_ptr; + + __device__ void operator()(int i) + { + d_ptr[i] *= d_ptr[i]; + } +}; +// example-end bulk-square-t + +// example-begin bulk-square-ref-t +struct square_ref_t +{ + __device__ void operator()(int& i) + { + i *= i; + } +}; +// example-end bulk-square-ref-t + +// example-begin bulk-odd-count-t +struct odd_count_t +{ + int* d_count; + + __device__ void operator()(int i) + { + if (i % 2 == 1) + { + atomicAdd(d_count, 1); + } + } +}; +// example-end bulk-odd-count-t + +CUB_TEST("Device bulk works with temporary storage", "[bulk][device]") +{ + // example-begin bulk-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_t op{thrust::raw_pointer_cast(vec.data())}; + + // 1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Perform bulk operation + cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end bulk-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device bulk works without temporary storage", "[bulk][device]") +{ + // example-begin bulk-wo-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_t op{thrust::raw_pointer_cast(vec.data())}; + + cub::DeviceFor::Bulk(vec.size(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end bulk-wo-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device for each n works with temporary storage", "[for_each][device]") +{ + // example-begin for-each-n-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_ref_t op{}; + + // 
1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Perform for each n operation + cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end for-each-n-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device for each n works without temporary storage", "[for_each][device]") +{ + // example-begin for-each-n-wo-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_ref_t op{}; + + cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end for-each-n-wo-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device for each works with temporary storage", "[for_each][device]") +{ + // example-begin for-each-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_ref_t op{}; + + // 1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Perform for each operation + cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end for-each-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device for each works without temporary storage", "[for_each][device]") +{ + // example-begin for-each-wo-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + square_ref_t op{}; + + cub::DeviceFor::ForEach(vec.begin(), vec.end(), op); + + thrust::device_vector expected = {1, 4, 9, 16}; + // example-end for-each-wo-temp-storage + + REQUIRE(vec == expected); +} + +CUB_TEST("Device for each n copy works with temporary storage", "[for_each][device]") +{ + // example-begin for-each-copy-n-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + thrust::device_vector count(1); + odd_count_t op{thrust::raw_pointer_cast(count.data())}; + + // 1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Perform for each n operation + cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op); + + thrust::device_vector expected = {2}; + // example-end for-each-copy-n-temp-storage + + REQUIRE(count == expected); +} + +CUB_TEST("Device for each n copy works without temporary storage", "[for_each][device]") +{ + // example-begin for-each-copy-n-wo-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + thrust::device_vector count(1); + odd_count_t op{thrust::raw_pointer_cast(count.data())}; + + cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op); + + thrust::device_vector expected = {2}; + // example-end for-each-copy-n-wo-temp-storage + + REQUIRE(count == expected); +} + +CUB_TEST("Device for each 
copy works with temporary storage", "[for_each][device]") +{ + // example-begin for-each-copy-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + thrust::device_vector count(1); + odd_count_t op{thrust::raw_pointer_cast(count.data())}; + + // 1) Get temp storage size + std::uint8_t* d_temp_storage{}; + std::size_t temp_storage_bytes{}; + cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); + + // 2) Allocate temp storage + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + // 3) Perform for each n operation + cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op); + + thrust::device_vector expected = {2}; + // example-end for-each-copy-temp-storage + + REQUIRE(count == expected); +} + +CUB_TEST("Device for each copy works without temporary storage", "[for_each][device]") +{ + // example-begin for-each-copy-wo-temp-storage + thrust::device_vector vec = {1, 2, 3, 4}; + thrust::device_vector count(1); + odd_count_t op{thrust::raw_pointer_cast(count.data())}; + + cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op); + + thrust::device_vector expected = {2}; + // example-end for-each-copy-wo-temp-storage + + REQUIRE(count == expected); +} diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu new file mode 100644 index 00000000000..d7ad063f7ad --- /dev/null +++ b/cub/test/catch2_test_device_for_copy.cu @@ -0,0 +1,182 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
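Note the pairing in the API examples above: the ForEachCopy/ForEachCopyN examples use odd_count_t, which takes its argument by value, while the plain ForEach/ForEachN examples use square_ref_t, which mutates through int&. The split is deliberate: the copy variants may hand the operator a copy of each element (which is what permits bulk, vectorized loads of the input), so an operator that must write through a real reference belongs with plain ForEach. A sketch of the rule of thumb, with semantics as implied by the tests in this diff:

#include <cub/device/device_for.cuh>

#include <thrust/device_vector.h>

struct by_value_t
{
  // Fine for ForEachCopy: only reads, and may receive a copy of the element
  __device__ void operator()(int x) const { (void) x; }
};

struct by_reference_t
{
  // Needs plain ForEach: writes through a reference to the element itself
  __device__ void operator()(int& x) const { x *= 2; }
};

void sketch()
{
  thrust::device_vector<int> vec(16, 1);

  cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), by_value_t{});
  cub::DeviceFor::ForEach(vec.begin(), vec.end(), by_reference_t{});
}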
+ * + ******************************************************************************/ + +#include + +#include +#include +#include +#include + +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachCopy, device_for_each_copy); +DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachCopyN, device_for_each_copy_n); + +using offset_type = c2h::type_list; + +struct incrementer_t +{ + int* d_counts; + + template + __device__ void operator()(OffsetT i) + { + atomicAdd(d_counts + i, 1); // Check if `i` was served more than once + } +}; + +template +class offset_proxy_t +{ + OffsetT m_offset; + +public: + __host__ __device__ offset_proxy_t(OffsetT offset) + : m_offset(offset) + {} + + __host__ __device__ operator OffsetT() const + { + return m_offset; + } +}; + +CUB_TEST("Device for each works", "[for_copy][device]") +{ + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + using offset_t = int; + + const offset_t num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + thrust::device_vector> input(num_items, offset_t{}); + thrust::sequence(input.begin(), input.end(), offset_t{}); + + thrust::device_vector counts(num_items); + int* d_counts = thrust::raw_pointer_cast(counts.data()); + + device_for_each_copy(input.begin(), input.end(), incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +CUB_TEST("Device for each works with unaligned vectors", "[for_copy][device]") +{ + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const int num_items = GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + })); + + const int offset = GENERATE(1, 2, 3); + + thrust::device_vector counts(num_items); + thrust::device_vector input(num_items + offset); + thrust::sequence(input.begin() + offset, input.end()); + + int* d_counts = thrust::raw_pointer_cast(counts.data()); + int* d_input = thrust::raw_pointer_cast(input.data()) + offset; + + device_for_each_copy(d_input, d_input + num_items, incrementer_t{d_counts}); + + const int num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +CUB_TEST("Device for each n works", "[for_copy][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + min_items, + max_items, + }))); + + thrust::device_vector> input(num_items, offset_t{}); + thrust::sequence(input.begin(), input.end(), offset_t{}); + + thrust::device_vector counts(num_items); + int* d_counts = thrust::raw_pointer_cast(counts.data()); + + device_for_each_copy_n(input.begin(), num_items, incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} + +CUB_TEST("Device for each n works with unaligned vectors", "[for_copy][device]", offset_type) +{ + using offset_t = c2h::get<0, TestType>; + + constexpr int max_items = 5000000; + constexpr int min_items = 1; + + const auto num_items = static_cast(GENERATE_COPY( + take(3, random(min_items, max_items)), + values({ + 
min_items, + max_items, + }))); + + const int offset = GENERATE(1, 2, 3); + + thrust::device_vector counts(num_items); + thrust::device_vector input(num_items + offset); + thrust::sequence(input.begin() + offset, input.end()); + + int* d_counts = thrust::raw_pointer_cast(counts.data()); + int* d_input = thrust::raw_pointer_cast(input.data()) + offset; + + device_for_each_copy_n(d_input, num_items, incrementer_t{d_counts}); + + const auto num_of_once_marked_items = static_cast(thrust::count(counts.begin(), counts.end(), 1)); + + REQUIRE(num_of_once_marked_items == num_items); +} diff --git a/cub/test/catch2_test_device_for_utils.cu b/cub/test/catch2_test_device_for_utils.cu new file mode 100644 index 00000000000..a5320ff7fb6 --- /dev/null +++ b/cub/test/catch2_test_device_for_utils.cu @@ -0,0 +1,102 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +#include + +#include "catch2_test_helper.h" + + +template +struct value_t +{ + __device__ void operator()(T) {} +}; + +template +struct const_ref_t +{ + __device__ void operator()(const T&) {} +}; + +template +struct rref_t +{ + __device__ void operator()(T&&) {} +}; + +template +struct value_ret_t +{ + __device__ T operator()(T v) { return v; } +}; + +template +struct ref_t +{ + __device__ void operator()(T&) {} +}; + +struct tpl_value_t +{ + template + __device__ void operator()(T) {} +}; + +template +struct overload_value_t +{ + __device__ void operator()(T) {} + __device__ void operator()(T) const {} +}; + +template +struct value_const_t +{ + __device__ void operator()(T) const {} +}; + +template +void test() +{ + STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload>::value); + STATIC_REQUIRE(!cub::detail::for_each::has_unique_value_overload::value); +} + +CUB_TEST("Device for utils correctly detect value overloads", "[for][device]") +{ + test(); + test(); + + // conversions do not work ;( + STATIC_REQUIRE(cub::detail::for_each::has_unique_value_overload>::value); +} diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index b4563fac66b..fd038e764f9 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -249,10 +249,10 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") REQUIRE(NVRTC_SUCCESS == nvrtcGetCUBIN(prog, code.get())); REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog)); - CUcontext context; - CUdevice device; - CUmodule module; - CUfunction kernel; + CUcontext context{}; + CUdevice device{}; + CUmodule module{}; + CUfunction kernel{}; REQUIRE(CUDA_SUCCESS == cuInit(0)); REQUIRE(CUDA_SUCCESS == cuDeviceGet(&device, 0)); @@ -265,10 +265,10 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]") constexpr int items_per_thread = 4; constexpr int tile_size = threads_in_block * items_per_thread; - CUdeviceptr d_ptr; + CUdeviceptr d_ptr{}; REQUIRE(CUDA_SUCCESS == cuMemAlloc(&d_ptr, tile_size * sizeof(int))); - CUdeviceptr d_err; + CUdeviceptr d_err{}; REQUIRE(CUDA_SUCCESS == cuMemAlloc(&d_err, sizeof(int))); int h_ptr[tile_size]; diff --git a/thrust/benchmarks/bench/for_each/basic.cu b/thrust/benchmarks/bench/for_each/basic.cu new file mode 100644 index 00000000000..a975065ccb7 --- /dev/null +++ b/thrust/benchmarks/bench/for_each/basic.cu @@ -0,0 +1,67 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
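The catch2_test_device_for_utils.cu checks above pin down which operator shapes count as a unique by-value overload: plain T parameters qualify (const-qualified and value-returning variants included), while references, rvalue references, templated call operators, and overload sets do not. That distinction is what lets the for-each implementation decide when an element may safely be passed as a copy. One way such a trait can be written, shown purely as an illustration and not as CUB's actual implementation:

#include <cuda/std/type_traits>

// Deduce the parameter of a unique, non-template operator().
// By-value signatures (any return type, const or not) yield true.
template <class T, class R, class F>
cuda::std::true_type takes_value(R (F::*)(T));
template <class T, class R, class F>
cuda::std::true_type takes_value(R (F::*)(T) const);
template <class T>
cuda::std::false_type takes_value(...);

template <class T, class F, class = void>
struct has_value_overload : cuda::std::false_type
{};

// `&F::operator()` is only well-formed when operator() is neither an
// overload set nor a template, mirroring the negative cases in the test.
template <class T, class F>
struct has_value_overload<T, F, cuda::std::void_t<decltype(&F::operator())>>
    : decltype(takes_value<T>(&F::operator()))
{};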
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +struct square_t +{ + __device__ void operator()(T &x) const + { + x = x * x; + } +}; + +template +static void basic(nvbench::state &state, nvbench::type_list) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector in(elements, T{1}); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + square_t op{}; + caching_allocator_t alloc; + thrust::for_each(policy(alloc), in.begin(), in.end(), op); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::for_each(policy(alloc, launch), in.begin(), in.end(), op); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); diff --git a/thrust/benchmarks/bench/tabulate/basic.cu b/thrust/benchmarks/bench/tabulate/basic.cu new file mode 100644 index 00000000000..03474d72136 --- /dev/null +++ b/thrust/benchmarks/bench/tabulate/basic.cu @@ -0,0 +1,76 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include "thrust/detail/raw_pointer_cast.h" +#include + +#include +#include +#include + +template +struct seg_size_t +{ + T* d_offsets{}; + + template + __device__ T operator()(OffsetT i) + { + return d_offsets[i + 1] - d_offsets[i]; + } +}; + +template +static void basic(nvbench::state &state, + nvbench::type_list) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector input(elements + 1); + thrust::device_vector output(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements + 1); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc; + seg_size_t op{thrust::raw_pointer_cast(input.data())}; + thrust::tabulate(policy(alloc), output.begin(), output.end(), op); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::tabulate(policy(alloc, launch), output.begin(), output.end(), op); + }); +} + +using types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); diff --git a/thrust/benchmarks/bench/transform/basic.cu b/thrust/benchmarks/bench/transform/basic.cu new file mode 100644 index 00000000000..a488c80b8d6 --- /dev/null +++ b/thrust/benchmarks/bench/transform/basic.cu @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
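The tabulate benchmark above turns an offsets array into segment sizes with an index-based operator (d_offsets[i + 1] - d_offsets[i]); tabulate is a natural fit here because its index-driven form maps directly onto the for-each machinery this diff touches. For comparison, the same result can be produced without indexing by differencing two shifted ranges; a sketch (illustrative, not part of the benchmark):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

void segment_sizes()
{
  // sizes[i] = offsets[i + 1] - offsets[i]
  thrust::device_vector<int> offsets = {0, 3, 3, 7, 10};
  thrust::device_vector<int> sizes(offsets.size() - 1);

  thrust::transform(offsets.begin() + 1, offsets.end(), // offsets[i + 1]
                    offsets.begin(),                    // offsets[i]
                    sizes.begin(),
                    thrust::minus<int>());
  // sizes == {3, 0, 4, 3}
}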
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include +#include +#include + +template +struct fib_t +{ + __device__ OutT operator()(InT n) + { + OutT t1 = 0; + OutT t2 = 1; + + if (n < 1) + { + return t1; + } + else if (n == 1) + { + return t1; + } + else if (n == 2) + { + return t2; + } + for (InT i = 3; i <= n; ++i) + { + const auto next = t1 + t2; + t1 = t2; + t2 = next; + } + + return t2; + } +}; + +template +static void basic(nvbench::state &state, + nvbench::type_list) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector input = generate(elements, bit_entropy::_1_000, T{0}, T{42}); + thrust::device_vector output(elements); + + state.add_element_count(elements); + state.add_global_memory_reads(elements); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc; + fib_t op{}; + thrust::transform(policy(alloc), input.cbegin(), input.cend(), output.begin(), op); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch &launch) { + thrust::transform(policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), op); + }); +} + +using types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); diff --git a/thrust/thrust/system/cuda/detail/async/for_each.h b/thrust/thrust/system/cuda/detail/async/for_each.h index 9847accf07a..f3507feb63e 100644 --- a/thrust/thrust/system/cuda/detail/async/for_each.h +++ b/thrust/thrust/system/cuda/detail/async/for_each.h @@ -48,11 +48,12 @@ #include #include -#include #include #include #include +#include + #include THRUST_NAMESPACE_BEGIN @@ -124,7 +125,7 @@ unique_eager_event async_for_each_n( ); thrust::cuda_cub::throw_on_error( - thrust::cuda_cub::__parallel_for::parallel_for( + cub::DeviceFor::Bulk( n, std::move(wrapped), e.stream().native_handle() ) , "after for_each launch" diff --git a/thrust/thrust/system/cuda/detail/async/transform.h b/thrust/thrust/system/cuda/detail/async/transform.h index f642c2591ee..a4b280d1fb3 100644 --- a/thrust/thrust/system/cuda/detail/async/transform.h +++ b/thrust/thrust/system/cuda/detail/async/transform.h @@ -53,6 +53,8 @@ #include #include +#include + #include THRUST_NAMESPACE_BEGIN @@ -126,7 +128,7 @@ unique_eager_event async_transform_n( ); thrust::cuda_cub::throw_on_error( - thrust::cuda_cub::__parallel_for::parallel_for( + cub::DeviceFor::Bulk( n, std::move(wrapped), e.stream().native_handle() ) , "after transform launch" diff --git a/thrust/thrust/system/cuda/detail/for_each.h b/thrust/thrust/system/cuda/detail/for_each.h index 2225320adcd..e03b6f7c78a 100644 --- a/thrust/thrust/system/cuda/detail/for_each.h +++ b/thrust/thrust/system/cuda/detail/for_each.h @@ -37,9 +37,11 @@ #endif // no system header #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include #include +#include + 
+#include #include #include #include @@ -49,29 +51,8 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { - // for_each functor - template - struct for_each_f - { - Input input; - UnaryOp op; - - THRUST_FUNCTION - for_each_f(Input input, UnaryOp op) - : input(input), op(op) {} - - template - THRUST_DEVICE_FUNCTION void operator()(Size idx) - { - op(raw_reference_cast(*(input + idx))); - } - }; - - //------------------------- - // Thrust API entry points - //------------------------- - // for_each_n + _CCCL_EXEC_CHECK_DISABLE template wrapped_t; - wrapped_t wrapped_op(op); - - cuda_cub::parallel_for(policy, - for_each_f(first, wrapped_op), - count); + THRUST_CDP_DISPATCH( + (cudaStream_t stream = cuda_cub::stream(policy); + cudaError_t status = cub::DeviceFor::ForEachN(first, count, op, stream); + cuda_cub::throw_on_error(status, "parallel_for failed"); + status = cuda_cub::synchronize_optional(policy); + cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");), + (for (Size idx = 0; idx != count; ++idx) + { + op(raw_reference_cast(*(first + idx))); + } + )); return first + count; } @@ -104,9 +90,10 @@ namespace cuda_cub { { typedef typename iterator_traits::difference_type size_type; size_type count = static_cast(thrust::distance(first,last)); - return cuda_cub::for_each_n(policy, first, count, op); + + return THRUST_NS_QUALIFIER::cuda_cub::for_each_n(policy, first, count, op); } -} // namespace cuda_cub +} // namespace cuda_cub THRUST_NAMESPACE_END #endif diff --git a/thrust/thrust/system/cuda/detail/parallel_for.h b/thrust/thrust/system/cuda/detail/parallel_for.h index 9baab4d61cb..e4aac73329b 100644 --- a/thrust/thrust/system/cuda/detail/parallel_for.h +++ b/thrust/thrust/system/cuda/detail/parallel_for.h @@ -38,119 +38,16 @@ #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include + #include #include -#include -#include #include THRUST_NAMESPACE_BEGIN namespace cuda_cub { -namespace __parallel_for { - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - }; // struct PtxPolicy - - template - struct Tuning; - - template - struct Tuning - { - typedef PtxPolicy<256, 2> type; - }; - - - template - struct ParallelForAgent - { - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - }; - typedef core::specialize_plan ptx_plan; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS - }; - - template - static void THRUST_DEVICE_FUNCTION - consume_tile(F f, - Size tile_base, - int items_in_tile) - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - Size idx = BLOCK_THREADS * ITEM + threadIdx.x; - if (IS_FULL_TILE || idx < items_in_tile) - f(tile_base + idx); - } - } - - THRUST_AGENT_ENTRY(F f, - Size num_items, - char * /*shmem*/ ) - { - Size tile_base = static_cast(blockIdx.x) * ITEMS_PER_TILE; - Size num_remaining = num_items - tile_base; - Size items_in_tile = static_cast( - num_remaining < ITEMS_PER_TILE ? 
num_remaining : ITEMS_PER_TILE); - - if (items_in_tile == ITEMS_PER_TILE) - { - // full tile - consume_tile(f, tile_base, ITEMS_PER_TILE); - } - else - { - // partial tile - consume_tile(f, tile_base, items_in_tile); - } - } - }; // struct ParallelForEagent - - template - THRUST_RUNTIME_FUNCTION cudaError_t - parallel_for(Size num_items, - F f, - cudaStream_t stream) - { - if (num_items == 0) - return cudaSuccess; - using core::AgentLauncher; - using core::AgentPlan; - - typedef AgentLauncher > parallel_for_agent; - AgentPlan parallel_for_plan = parallel_for_agent::get_plan(stream); - - parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent"); - pfa.launch(f, num_items); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - return cudaSuccess; - } -} // __parallel_for - _CCCL_EXEC_CHECK_DISABLE template &policy, // clang-format off THRUST_CDP_DISPATCH( (cudaStream_t stream = cuda_cub::stream(policy); - cudaError_t status = __parallel_for::parallel_for(count, f, stream); + cudaError_t status = cub::DeviceFor::Bulk(count, f, stream); cuda_cub::throw_on_error(status, "parallel_for failed"); status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");), @@ -181,7 +78,7 @@ parallel_for(execution_policy &policy, // clang-format on } -} // namespace cuda_cub +} // namespace cuda_cub THRUST_NAMESPACE_END #endif
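With __parallel_for retired, every CUDA-backend entry point that used it — for_each, transform, their async variants, and parallel_for itself — now funnels into the same cub::DeviceFor dispatch. Code that previously reached for the internal agent gets equivalent stream-ordered behavior from the public API; a sketch of the form the async paths above rely on (the fill operator is illustrative):

#include <cub/device/device_for.cuh>

#include <cuda_runtime.h>

struct fill_iota_t
{
  int* d_out;

  __device__ void operator()(int i)
  {
    d_out[i] = i;
  }
};

cudaError_t launch_fill(int* d_out, int n, cudaStream_t stream)
{
  // Same contract the removed __parallel_for::parallel_for(n, f, stream)
  // offered: stream-ordered, no temporary storage, error code returned.
  return cub::DeviceFor::Bulk(n, fill_iota_t{d_out}, stream);
}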