diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index 1c3102d0d71..3932fffea6f 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -106,7 +106,7 @@ function(add_bench_dir bench_dir) add_bench(bench_target ${tuning_name} "${bench_src}") # for convenience, make tuning variant buildable by default file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n") - target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}") + target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}") else() # benchmarking register_cccl_benchmark("${bench_name}" "") diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/babelstream.h new file mode 100644 index 00000000000..0f482d59e2f --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream.h @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include + +#include + +#include + +template +#if TUNE_BASE +using policy_hub_t = cub::detail::transform::policy_hub>; +#else +struct policy_hub_t +{ + struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy> + { + static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); + using algo_policy = + ::cuda::std::_If>; + }; +}; +#endif + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = + nvbench::type_list; +#endif + +// BabelStream uses 2^25, H200 can fit 2^31 int128s +// 2^20 chars / 2^16 int128 saturate V100 (min_bif = 12 * SM count = 80) +// 2^21 chars / 2^17 int128 saturate A100 (min_bif = 16 * SM count = 108) +// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif = 32 or 48 * SM count = 132) +// inline auto array_size_powers = std::vector{28}; +inline auto array_size_powers = nvbench::range(16, 28, 4); + +template +void bench_transform( + nvbench::state& state, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + OffsetT num_items, + TransformOp transform_op, + ExecTag exec_tag = nvbench::exec_tag::no_batch) +{ + state.exec(exec_tag, [&](const nvbench::launch& launch) { + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_hub_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); + }); +} + +// Modified from BabelStream to also work for integers +inline constexpr auto startA = 1; // BabelStream: 0.1 +inline constexpr auto startB = 2; // BabelStream: 0.2 +inline constexpr auto startC = 3; // BabelStream: 0.1 +inline constexpr auto startScalar = 4; // BabelStream: 0.4 + +// TODO(bgruber): we should put those somewhere into libcu++: +// from C++ GSL +struct narrowing_error : std::runtime_error +{ + narrowing_error() + : std::runtime_error("Narrowing error") + {} +}; + +// from C++ GSL +// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow +template ::value, int> = 0> +constexpr DstT narrow(SrcT value) +{ + constexpr bool is_different_signedness = ::cuda::std::is_signed::value != ::cuda::std::is_signed::value; + const auto converted = static_cast(value); + if (static_cast(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{})))) + { + throw narrowing_error{}; + } + return converted; +} diff
--git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu new file mode 100644 index 00000000000..87abdfef6ff --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void mul(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + const T scalar = startScalar; + bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) { + return ci * scalar; + }); +} + +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("mul") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu new file mode 100644 index 00000000000..c8fa017b788 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void add(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform( + state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + return ai + bi; + }); +} + +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("add") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + +template +static void triad(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + const T scalar = startScalar; + bench_transform( + state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) { + return bi + scalar * ci; + }); +} + +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("triad") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu new file mode 100644 index 00000000000..db541554210 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void nstream(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + const auto overwrite = static_cast(state.get_int64("OverwriteInput")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + // The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes + // the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to + // choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input. + thrust::device_vector d; + if (!overwrite) + { + d.resize(n); + } + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + const T scalar = startScalar; + bench_transform( + state, + ::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, + overwrite ? a.begin() : d.begin(), + n, + [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { + return ai + bi + scalar * ci; + }, + nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when + // OverwriteInput=0, but doesn't hurt +} + +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("nstream") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers) + .add_int64_axis("OverwriteInput", {1}); diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index f02ae6c0024..2c4d6dd5f4e 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -76,6 +76,7 @@ #include #include #include +#include // Grid // #include diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh new file mode 100644 index 00000000000..984109692f6 --- /dev/null +++ b/cub/cub/device/device_transform.cuh @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN + +//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input +//! sequences into an output sequence. +struct DeviceTransform +{ + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! 
input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many + //! :end-before: example-end transform-many + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); + return detail::transform:: + dispatch_t, RandomAccessIteratorOut, TransformOp>:: + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return Transform( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. 
Default is stream\ :sub:`0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + return Transform( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return Transform( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many-stable + //! :end-before: example-end transform-many-stable + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
+ template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); + return detail::transform:: + dispatch_t, RandomAccessIteratorOut, TransformOp>:: + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return TransformStableArgumentAddresses( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
+ template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + return TransformStableArgumentAddresses( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return TransformStableArgumentAddresses( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh new file mode 100644 index 00000000000..8fb596da075 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -0,0 +1,866 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 +_CCCL_NV_DIAG_SUPPRESS(186) +# include +// we cannot re-enable the warning here, because it is triggered outside the translation unit +// see also: https://godbolt.org/z/1x8b4hn3G +#endif // defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +CUB_NAMESPACE_BEGIN + +// the ublkcp kernel needs PTX features that are only available and understood by CTK 12 and later +#if _CCCL_CUDACC_VER_MAJOR >= 12 +# define _CUB_HAS_TRANSFORM_UBLKCP +#endif // _CCCL_CUDACC_VER_MAJOR >= 12 + +namespace detail +{ +namespace transform +{ +_CCCL_HOST_DEVICE constexpr int sum() +{ + return 0; +} + +// TODO(bgruber): remove with C++17 +template +_CCCL_HOST_DEVICE constexpr int sum(int head, Ts... tail) +{ + return head + sum(tail...); +} + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return (int{sizeof(value_t)} + ... 
+ 0); +} +#else // ^^^ C++17 ^^^ / vvv C++11 vvv +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return sum(int{sizeof(value_t)}...); +} +#endif // _CCCL_STD_VER >= 2017 + +enum class Algorithm +{ + fallback_for, +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + ublkcp, +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// this kernel replicates the behavior of cub::DeviceFor::Bulk +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int /* items_per_thread */, + F transform_op, + RandomAccessIteratorOut out, + RandomAccessIteratorsIn... ins) +{ + auto op = [&](Offset i) { + out[i] = transform_op(ins[i]...); + }; + using OpT = decltype(op); + + // TODO(bgruber): verbatim copy from for_each's static_kernel below: + using agent_t = for_each::agent_block_striped_t; + + constexpr auto block_threads = ForPolicy::block_threads; + constexpr auto items_per_tile = ForPolicy::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +template +struct async_copy_policy_t +{ + static constexpr int block_threads = BlockThreads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + static constexpr int min_items_per_thread = 1; + static constexpr int max_items_per_thread = 32; +}; + +// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
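// [Editor's note, not part of this patch] poor_apply below is a C++11-compatible stand-in for ::cuda::std::apply:
// given a hypothetical callable f and a tuple t = ::cuda::std::tuple<int, float>{1, 2.f}, the call
//   poor_apply(f, t)
// expands through the index_sequence overload into f(::cuda::std::get<0>(t), ::cuda::std::get<1>(t)).
// The UBLKCP kernel further down uses it to pass one operand per input stream (fetched from shared memory) to the
// user's transform_op, after first materializing the fetches into a tuple to guarantee left-to-right evaluation.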
+template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) +{ + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); +} + +template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) + -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) +{ + return poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); +} + +// mult must be a power of 2 +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t>(mult)), ""); +#endif // _CCCL_STD_VER > 2011 + return (x + mult - 1) & ~(mult - 1); +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), ""); +#endif // _CCCL_STD_VER > 2011 + return reinterpret_cast( + reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1}); +} + +// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding +// +// For performance considerations of memcpy_async: +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async +// +// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we +// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower +// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to +// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we +// have to align to 128 bytes instead of 16. +// +// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts: +// * According to the CUDA programming guide +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable +// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is +// always aligned to at least 256 bytes." +// * Memory protection is usually done at memory page granularity, which is even larger than 256 bytes for CUDA and 4 KiB on +// Intel x86 and at least 4 KiB on ARM. Head and tail padding thus never leave the memory page of the input buffer. +// * This should hold for device memory, but also for device-accessible memory living on the host. +// * The base pointer alignment and size rounding also never exceed the size of a cache line. +// +// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is +// still technically undefined behavior in C++. Also, compute-sanitizer flags at least those reads that go past the end of a +// buffer. Therefore, we err on the side of caution and protect against out-of-bounds reads at the beginning and end.
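// [Editor's note, not part of this patch] A small numeric illustration of the padding scheme described above, using
// the aligned_base_ptr helper defined just below. Assume a hypothetical input buffer of 12-byte elements starting at
// address 0x1004, and a tile that begins at element 10:
//   element address       = 0x1004 + 10 * 12     = 0x107C
//   rounded-down address  = 0x107C & ~(16 - 1)   = 0x1070   (16-byte memcpy_async case; 128 for bulk copy)
//   head_padding          = 0x107C - 0x1070      = 12 bytes
//   bytes to copy         = round_up_to_po2_multiple(head_padding + tile bytes, 16)
// so the copy may start a few bytes before the tile and end a few bytes after it, which is exactly the out-of-bounds
// concern discussed above.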
+ +// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some +// conditions. +// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring +// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of +// the element type. + +// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels. +// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes +// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes +template // Cannot add alignment to signature, because we need a uniform kernel template instantiation +struct aligned_base_ptr +{ + using value_type = T; + + const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to + // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0 + int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127]. + + _CCCL_HOST_DEVICE const T* ptr_to_elements() const + { + return reinterpret_cast(ptr + head_padding); + } + + _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b) + { + return a.ptr == b.ptr && a.head_padding == b.head_padding; + } +}; + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr +{ + const char* base_ptr = round_down_ptr(ptr, alignment); + return aligned_base_ptr{base_ptr, static_cast(reinterpret_cast(ptr) - base_ptr)}; +} + +constexpr int bulk_copy_alignment = 128; +constexpr int bulk_copy_size_multiple = 16; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one() +{ + const ::cuda::std::uint32_t membermask = ~0; + ::cuda::std::uint32_t is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(is_elected) + : "r"(membermask) + :); + return threadIdx.x < 32 && static_cast(is_elected); +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE void bulk_copy_tile( + ::cuda::std::uint64_t& bar, + int tile_stride, + char* smem, + int& smem_offset, + ::cuda::std::uint32_t& total_bytes_bulk_copied, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + static_assert(alignof(T) <= bulk_copy_alignment, ""); + + const char* src = aligned_ptr.ptr + global_offset * sizeof(T); + char* dst = smem + smem_offset; + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % bulk_copy_alignment == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % bulk_copy_alignment == 0, ""); + + // TODO(bgruber): we could precompute bytes_to_copy on the host + const int bytes_to_copy = round_up_to_po2_multiple( + aligned_ptr.head_padding + static_cast(sizeof(T)) * tile_stride, bulk_copy_size_multiple); + + ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar); + total_bytes_bulk_copied += bytes_to_copy; + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +template +_CCCL_DEVICE void bulk_copy_tile_fallback( + int tile_size, + int tile_stride, + char* smem, + int& smem_offset, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + const T* src = 
aligned_ptr.ptr_to_elements() + global_offset; + T* dst = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % alignof(T) == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % alignof(T) == 0, ""); + + const int bytes_to_copy = static_cast(sizeof(T)) * tile_size; + cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE _CCCL_FORCEINLINE const T& +fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr& aligned_ptr) +{ + const T* smem_operand_tile_base = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment; + return smem_operand_tile_base[smem_idx]; +} + +template +_CCCL_DEVICE void transform_kernel_ublkcp( + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr... aligned_ptrs) +{ + __shared__ uint64_t bar; + extern __shared__ char __align__(bulk_copy_alignment) smem[]; + + namespace ptx = ::cuda::ptx; + + constexpr int block_dim = BulkCopyPolicy::block_threads; + const int tile_stride = block_dim * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + + const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; + if (inner_blocks) + { + // use one thread to setup the entire bulk copy + if (elect_one()) + { + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); + + int smem_offset = 0; + ::cuda::std::uint32_t total_copied = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)..., + 0}; + (void) dummy; + + // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + } + + // all threads wait for bulk copy + __syncthreads(); + while (!ptx::mbarrier_try_wait_parity(&bar, 0)) + ; + } + else + { + // use all threads to schedule an async_memcpy + int smem_offset = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0}; + (void) dummy; + + cooperative_groups::wait(cooperative_groups::this_thread_block()); + } + + // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below + out += offset; + + // note: I tried expressing the UBLKCP_AGENT as a function object but it adds a lot of code to handle the variadics + // TODO(bgruber): use a polymorphic lambda in C++14 +# define UBLKCP_AGENT(full_tile) \ + _Pragma("unroll 1") /* Unroll 1 tends to improve performance, especially for smaller data types (confirmed by \ + benchmark) */ \ + for (int j = 0; j < num_elem_per_thread; ++j) \ + { \ + const int idx = j * block_dim + threadIdx.x; \ + if (full_tile || idx < tile_size) \ + { \ + int smem_offset = 0; \ + /* need to expand into a tuple for guaranteed order of evaluation*/ \ 
+ out[idx] = poor_apply( \ + [&](const InTs&... values) { \ + return f(values...); \ + }, \ + ::cuda::std::tuple{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...}); \ + } \ + } + if (tile_stride == tile_size) + { + UBLKCP_AGENT(true); + } + else + { + UBLKCP_AGENT(false); + } +# undef UBLKCP_AGENT +} + +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + aligned_base_ptr... aligned_ptrs) +{ + // only call the real kernel for sm90 and later + NV_IF_TARGET(NV_PROVIDES_SM_90, + (transform_kernel_ublkcp(num_items, num_elem_per_thread, f, out, aligned_ptrs...);)); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template +union kernel_arg +{ + aligned_base_ptr> aligned_ptr; + It iterator; + + _CCCL_HOST_DEVICE kernel_arg() {} // in case It is not default-constructible +}; + +template +_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg +{ + kernel_arg arg; + arg.iterator = it; + return arg; +} + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg +{ + kernel_arg arg; + arg.aligned_ptr = make_aligned_base_ptr(ptr, alignment); + return arg; +} + +// TODO(bgruber): make a variable template in C++14 +template +using needs_aligned_ptr_t = + ::cuda::std::bool_constant; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( + ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +{ + return ::cuda::std::move(arg.aligned_ptr); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) -> It&& +{ + return ::cuda::std::move(arg.iterator); +} + +// There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated +// with the same arguments for each algorithm. Only the device compiler will then select the implementation. This +// saves some compile-time and binary size. +template +__launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) + CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + kernel_arg... ins) +{ + constexpr auto alg = ::cuda::std::integral_constant{}; + transform_kernel_impl( + alg, + num_items, + num_elem_per_thread, + ::cuda::std::move(f), + ::cuda::std::move(out), + select_kernel_arg(alg, ::cuda::std::move(ins))...); +} + +constexpr int arch_to_min_bytes_in_flight(int sm_arch) +{ + // TODO(bgruber): use if-else in C++14 for better readability + return sm_arch >= 900 ? 48 * 1024 // 32 for H100, 48 for H200 + : sm_arch >= 800 ? 
16 * 1024 // A100 + : 12 * 1024; // V100 and below +} + +template +_CCCL_HOST_DEVICE constexpr auto bulk_copy_smem_for_tile_size(int tile_size) -> int +{ + return round_up_to_po2_multiple(int{sizeof(int64_t)}, bulk_copy_alignment) /* bar */ + // 128 bytes of padding for each input tile (handles before + after) + + tile_size * loaded_bytes_per_iteration() + + sizeof...(RandomAccessIteratorsIn) * bulk_copy_alignment; +} + +using fallback_for_policy = for_each::policy_hub_t::policy_350_t::for_policy_t; + +template +struct policy_hub +{ + static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple"); +}; + +template +struct policy_hub> +{ + static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0; + static constexpr bool all_contiguous = + ::cuda::std::conjunction...>::value; + static constexpr bool all_values_trivially_reloc = + ::cuda::std::conjunction>...>::value; + + static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc; + + // TODO(bgruber): consider a separate kernel for just filling + + struct policy300 : ChainedPolicy<300, policy300, policy300> + { + static constexpr int min_bif = arch_to_min_bytes_in_flight(300); + // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy + static constexpr auto algorithm = Algorithm::fallback_for; + using algo_policy = fallback_for_policy; + }; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + // H100 and H200 + struct policy900 : ChainedPolicy<900, policy900, policy300> + { + static constexpr int min_bif = arch_to_min_bytes_in_flight(900); + using async_policy = async_copy_policy_t<256>; + static constexpr bool exhaust_smem = + bulk_copy_smem_for_tile_size( + async_policy::block_threads * async_policy::min_items_per_thread) + > 48 * 1024; + static constexpr bool any_type_is_overalinged = +# if _CCCL_STD_VER >= 2017 + ((alignof(value_t) > bulk_copy_alignment) || ...); +# else + sum((alignof(value_t) > bulk_copy_alignment)...) > 0; +# endif + + static constexpr bool use_fallback = + RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged; + static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp; + using algo_policy = ::cuda::std::_If; + }; + + using max_policy = policy900; +#else // _CUB_HAS_TRANSFORM_UBLKCP + using max_policy = policy300; +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// TODO(bgruber): replace by ::cuda::std::expected in C++14 +template +struct PoorExpected +{ + alignas(T) char storage[sizeof(T)]; + cudaError_t error; + + _CCCL_HOST_DEVICE PoorExpected(T value) + : error(cudaSuccess) + { + new (storage) T(::cuda::std::move(value)); + } + + _CCCL_HOST_DEVICE PoorExpected(cudaError_t error) + : error(error) + {} + + _CCCL_HOST_DEVICE explicit operator bool() const + { + return error == cudaSuccess; + } + + _CCCL_HOST_DEVICE T& operator*() + { + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing") + return reinterpret_cast(storage); + _CCCL_DIAG_POP + } + + _CCCL_HOST_DEVICE const T& operator*() const + { + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing") + return reinterpret_cast(storage); + _CCCL_DIAG_POP + } + + _CCCL_HOST_DEVICE T* operator->() + { + return &**this; + } + + _CCCL_HOST_DEVICE const T* operator->() const + { + return &**this; + } +}; + +// TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this. 
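// [Editor's note, not part of this patch] PoorExpected carries either a value or a cudaError_t. The host/device code
// below uses it in the following sketched pattern (get_max_shared_memory is defined next; the int payload is an
// assumption for illustration):
//   PoorExpected<int> smem = get_max_shared_memory();
//   if (!smem)
//   {
//     return smem.error; // propagate the CUDA error
//   }
//   use(*smem);          // otherwise dereference the contained value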
+_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory() +{ + // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (where the kernels + // will later be launched on) belongs to the currently active device. So we can just query the active device here. + int device = 0; + auto error = CubDebug(cudaGetDevice(&device)); + if (error != cudaSuccess) + { + return error; + } + + int max_smem = 0; + error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device)); + if (error != cudaSuccess) + { + return error; + } + + return max_smem; +} + +struct elem_counts +{ + int elem_per_thread; + int tile_size; + int smem_size; +}; + +template > +struct dispatch_t; + +template +struct dispatch_t, + RandomAccessIteratorOut, + TransformOp, + PolicyHub> +{ + static_assert(::cuda::std::is_same::value + || ::cuda::std::is_same::value, + "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); + + ::cuda::std::tuple in; + RandomAccessIteratorOut out; + Offset num_items; + TransformOp op; + cudaStream_t stream; + +#define CUB_DETAIL_TRANSFORM_KERNEL_PTR \ + &transform_kernel...> + + static constexpr int loaded_bytes_per_iter = loaded_bytes_per_iteration(); + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + // TODO(bgruber): I want to write tests for this but those are highly depending on the architecture we are running + // on? + template + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() + -> PoorExpected< + ::cuda::std:: + tuple> + { + using policy_t = typename ActivePolicy::algo_policy; + constexpr int block_dim = policy_t::block_threads; + static_assert(block_dim % bulk_copy_alignment == 0, + "block_threads needs to be a multiple of bulk_copy_alignment (128)"); // then tile_size is a multiple + // of 128-byte + + auto determine_element_counts = [&]() -> PoorExpected { + const auto max_smem = get_max_shared_memory(); + if (!max_smem) + { + return max_smem.error; + } + + elem_counts last_counts{}; + // Increase the number of output elements per thread until we reach the required bytes in flight. 
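// [Editor's note, not part of this patch] In the loop below, each candidate elem_per_thread yields a tile_size and an
// smem_size; a candidate is accepted once max_occupancy * tile_size * loaded_bytes_per_iter reaches
// ActivePolicy::min_bif, or earlier if a single tile already covers num_items. If the required shared memory exceeds
// the device limit, the last configuration that still fit is returned instead.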
+ static_assert(policy_t::min_items_per_thread <= policy_t::max_items_per_thread, ""); // ensures the loop below + // runs at least once + for (int elem_per_thread = +policy_t::min_items_per_thread; elem_per_thread < +policy_t::max_items_per_thread; + ++elem_per_thread) + { + const int tile_size = block_dim * elem_per_thread; + const int smem_size = bulk_copy_smem_for_tile_size(tile_size); + if (smem_size > *max_smem) + { +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + // assert should be prevented by smem check in policy + assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + return last_counts; + } + + if (tile_size >= num_items) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + int max_occupancy = 0; + const auto error = + CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, smem_size)); + if (error != cudaSuccess) + { + return error; + } + + const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter; + if (ActivePolicy::min_bif <= bytes_in_flight_SM) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + last_counts = elem_counts{elem_per_thread, tile_size, smem_size}; + } + return last_counts; + }; + PoorExpected config = [&]() { + NV_IF_TARGET( + NV_IS_HOST, + ( + // this static variable exists for each template instantiation of the surrounding function and class, on which + // the chosen element count solely depends (assuming max SMEM is constant during a program execution) + static auto cached_config = determine_element_counts(); return cached_config;), + ( + // we cannot cache the determined element count in device code + return determine_element_counts();)); + }(); + if (!config) + { + return config.error; + } +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + assert(config->elem_per_thread > 0); + assert(config->tile_size > 0); + assert(config->tile_size % bulk_copy_alignment == 0); + assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); + return ::cuda::std::make_tuple( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream), + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + config->elem_per_thread); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + auto ret = configure_ublkcp_kernel(); + if (!ret) + { + return ret.error; + } + // TODO(bgruber): use a structured binding in C++17 + // auto [launcher, kernel, elem_per_thread] = *ret; + + return ::cuda::std::get<0>(*ret).doit( + ::cuda::std::get<1>(*ret), + num_items, + ::cuda::std::get<2>(*ret), + op, + out, + make_aligned_base_ptr_kernel_arg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), bulk_copy_alignment)...); + } +#endif // _CUB_HAS_TRANSFORM_UBLKCP + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + constexpr int block_threads = ActivePolicy::algo_policy::block_threads; + constexpr int items_per_thread = ActivePolicy::algo_policy::items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, 
Offset{tile_size})); + return CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_threads, 0, stream) + .doit( + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + num_items, + items_per_thread, + op, + out, + make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + // // TODO(bgruber): replace the overload set by if constexpr in C++17 + return invoke_algorithm(::cuda::std::index_sequence_for{}, + ::cuda::std::integral_constant{}); + } + + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + TransformOp op, + cudaStream_t stream) + { + if (num_items == 0) + { + return cudaSuccess; + } + + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream}; + return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + } + +#undef CUB_DETAIL_TRANSFORM_KERNEL_PTR +}; +} // namespace transform +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 48a0142801a..3ec8c94eef1 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,6 +261,11 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") + target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) + endif() + target_link_libraries(${test_target} PRIVATE ${cub_target} ${config_c2h_target} diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu new file mode 100644 index 00000000000..50f253ef5c3 --- /dev/null +++ b/cub/test/catch2_test_device_transform.cu @@ -0,0 +1,556 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "c2h/custom_type.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include "test/test_util_vec.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +using cub::detail::transform::Algorithm; + +template +struct policy_hub_for_alg +{ + struct max_policy : cub::ChainedPolicy<300, max_policy, max_policy> + { + static constexpr int min_bif = 64 * 1024; + static constexpr Algorithm algorithm = Alg; + using algo_policy = + ::cuda::std::_If>; + }; +}; + +template +CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + Offset num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) +{ + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + constexpr bool RequiresStableAddress = false; + return cub::detail::transform::dispatch_t, + RandomAccessIteratorOut, + TransformOp, + policy_hub_for_alg>{} + .dispatch(inputs, output, num_items, transform_op, stream); +} + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::TransformStableArgumentAddresses, transform_many_stable); +DECLARE_TMPL_LAUNCH_WRAPPER(transform_many_with_alg_entry_point, + transform_many_with_alg, + ESCAPE_LIST(Algorithm Alg, typename Offset), + ESCAPE_LIST(Alg, Offset)); + +using algorithms = + c2h::enum_type_list; + +using offset_types = c2h::type_list; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP \ + if (alg == Algorithm::ublkcp && ptx_version < 900) \ + { \ + return; \ + } +#else // _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +#define FILTER_UNSUPPORTED_ALGS \ + int ptx_version = 0; \ + REQUIRE(cub::PtxVersion(ptx_version) == cudaSuccess); \ + _CCCL_DIAG_PUSH \ + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ \ + FILTER_UBLKCP \ + _CCCL_DIAG_POP + +CUB_TEST("DeviceTransform::Transform BabelStream add", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + const int num_items = GENERATE(0, 1, 15, 16, 17, 127, 128, 129, 4095, 4096, 4097); // edge cases around 16 and 128 + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg, num_items); + + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +template +struct alignas(Alignment) overaligned_addable_t +{ + int value; + + overaligned_addable_t() = default; + + _CCCL_HOST_DEVICE overaligned_addable_t(int val) + : value{val} + {} + + _CCCL_HOST_DEVICE static 
void check(const overaligned_addable_t& obj) + { + if (reinterpret_cast(&obj) % Alignment != 0) + { + printf("Error: object not aligned to %d: %p\n", Alignment, &obj); + ::cuda::std::terminate(); + } + } + + _CCCL_HOST_DEVICE friend auto operator==(const overaligned_addable_t& a, const overaligned_addable_t& b) -> bool + { + check(a); + check(b); + return a.value == b.value; + } + + _CCCL_HOST_DEVICE friend auto + operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + { + check(a); + check(b); + return overaligned_addable_t{a.value + b.value}; + } + + _CCCL_HOST friend auto operator<<(std::ostream& os, const overaligned_addable_t& obj) -> std::ostream& + { + check(obj); + return os << "over{" << obj.value << "}"; + } +}; + +using overaligned_types = + c2h::type_list +#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned + , + overaligned_addable_t<256> +#endif // _CCCL_COMPILER_MSVC + >; + +// test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively) +CUB_TEST("DeviceTransform::Transform overaligned type", "[device][device_transform]", overaligned_types) +{ + using type = c2h::get<0, TestType>; + CAPTURE(c2h::demangle(typeid(type).name())); + + const int num_items = GENERATE(0, 1, 100, 1000); + c2h::device_vector a(num_items, 3); // put some integers at the front, so SMEM has to handle different alignments + c2h::device_vector b(num_items, 4); + + c2h::device_vector result(num_items); + // we need raw pointers here to halfen the conversion sequence from device_reference -> int -> type when calling + // plus(...), which is too long to compile + transform_many(::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())), + result.begin(), + num_items, + ::cuda::std::plus{}); + + REQUIRE(result == c2h::device_vector(num_items, 7)); +} + +CUB_TEST("DeviceTransform::Transform huge type", "[device][device_transform]") +{ + using huge_t = c2h::custom_type_t::type>; + static_assert(alignof(huge_t) == 8, "Need a large type with alignment < 16"); + CAPTURE(c2h::demangle(typeid(huge_t).name())); + + const int num_items = GENERATE(0, 1, 100, 1000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(result == reference_h); +} + +struct times_seven +{ + _CCCL_HOST_DEVICE auto operator()(unsigned char v) const -> char + { + return static_cast(v * 7); + } +}; + +CUB_TEST("DeviceTransform::Transform with large input", "[device][device_transform]", algorithms) +try +{ + using type = unsigned char; + using offset_t = cuda::std::int64_t; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(alg); + + constexpr offset_t num_items = (offset_t{1} << 32) + 123456; // a few thread blocks beyond 4GiB + c2h::device_vector input(num_items); + c2h::gen(CUB_SEED(1), input); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, times_seven{}); + + // compute reference and verify + c2h::host_vector 
input_h = input; + c2h::host_vector reference_h(num_items); + std::transform(input_h.begin(), input_h.end(), reference_h.begin(), times_seven{}); + REQUIRE((reference_h == result)); +} +catch (const std::bad_alloc&) +{ + // allocation failure is not a test failure, so we can run tests on smaller GPUs +} + +template +struct nstream_kernel +{ + static constexpr T scalar = 42; + + _CCCL_HOST_DEVICE T operator()(const T& ai, const T& bi, const T& ci) const + { + return ai + bi + scalar * ci; + } +}; + +// overwrites one input stream +CUB_TEST("DeviceTransform::Transform BabelStream nstream", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg); + + const int num_items = GENERATE(0, 1, 100, 1000, 10000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::device_vector c(num_items); + c2h::gen(CUB_SEED(1), a, type{10}, type{100}); + c2h::gen(CUB_SEED(1), b, type{10}, type{100}); + c2h::gen(CUB_SEED(1), c, type{10}, type{100}); + + // copy to host before changing + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector c_h = c; + + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); + + // compute reference and verify + auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); + std::transform(z, z + num_items, a_h.begin(), thrust::make_zip_function(nstream_kernel{})); + REQUIRE(a_h == a); +} + +struct sum_five +{ + __device__ auto operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transform]", algorithms) +{ + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 1); + c2h::device_vector b(num_items, 2); + c2h::device_vector c(num_items, 3); + c2h::device_vector d(num_items, 4); + c2h::device_vector e(num_items, 5); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), + result.begin(), + num_items, + sum_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + REQUIRE(reference == result); +} + +struct give_me_five +{ + __device__ auto operator()() const -> int + { + return 5; + } +}; + +CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector result(num_items); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, give_me_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 5); + REQUIRE(reference == result); +} + +CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + thrust::counting_iterator b{10}; + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + 
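+ // (descriptive note, assumed from the setup above: counting_iterator works on the host as well, so the same
+ // iterators a and b can drive the host-side reference below; with a starting at 0 and b at 10, the expected
+ // value at index i is i + (10 + i) = 2 * i + 10)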
c2h::host_vector reference_h(num_items); + std::transform(a, a + num_items, b, reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][device_transform]", algorithms) +{ + using type = int; + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 13); + c2h::device_vector b(num_items, 35); + c2h::device_vector result(num_items); + + using thrust::placeholders::_1; + auto out = thrust::make_transform_output_iterator(result.begin(), _1 + 4); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), out, num_items, ::cuda::std::plus{}); + REQUIRE(result == c2h::device_vector(num_items, (13 + 35) + 4)); +} + +CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + c2h::device_vector b(num_items, 10); + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +struct plus_needs_stable_address +{ + int* a; + int* b; + + _CCCL_HOST_DEVICE int operator()(const int& v) const + { + const auto i = &v - a; + return v + b[i]; + } +}; + +CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + thrust::sequence(a.begin(), a.end()); + thrust::sequence(b.begin(), b.end(), 42); + + c2h::device_vector result(num_items); + transform_many_stable( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())), + result.begin(), + num_items, + plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())}); + + // compute reference and verify + c2h::device_vector a_h = a; + c2h::device_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +// Non-trivially-copyable/relocatable type which cannot be copied using std::memcpy or cudaMemcpy +struct non_trivial +{ + int data; + + non_trivial() = default; + + _CCCL_HOST_DEVICE explicit non_trivial(int data) + : data(data) + {} + + _CCCL_HOST_DEVICE non_trivial(const non_trivial& nt) + : data(nt.data) + {} + + _CCCL_HOST_DEVICE auto operator=(const non_trivial& nt) -> non_trivial& + { + data = nt.data; + return *this; + } + + _CCCL_HOST_DEVICE auto operator-() const -> non_trivial + { + return non_trivial{-data}; + } + + friend _CCCL_HOST_DEVICE auto operator==(non_trivial a, non_trivial b) -> bool + { + return a.data == b.data; + } +}; +static_assert(!::cuda::std::is_trivially_copyable::value, ""); // as required by the standard +static_assert(!thrust::is_trivially_relocatable::value, ""); // CUB uses this check internally + +// Note(bgruber): I gave up on writing a test that checks whether the copy ctor/assignment operator is actually called +// (e.g. by tracking/counting invocations of those), since C++ allows (but does not guarantee) elision of these operations. 
+// Also thrust algorithms perform a lot of copies in-between, so the test needs to use only raw allocations and +// iteration for setup and checking. +CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector input(num_items, non_trivial{42}); + c2h::device_vector result(num_items); + transform_many( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(input.data())), result.begin(), num_items, ::cuda::std::negate<>{}); + + const auto reference = c2h::device_vector(num_items, non_trivial{-42}); + REQUIRE((reference == result)); +} + +CUB_TEST("DeviceTransform::Transform buffer start alignment", + "[device][device_transform]", + c2h::type_list) +{ + using type = typename c2h::get<0, TestType>; + + constexpr int num_items = 1000; + const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned + CAPTURE(c2h::demangle(typeid(type).name()), offset); + c2h::device_vector input(num_items); + thrust::sequence(input.begin(), input.end()); + c2h::device_vector result(num_items); + using thrust::placeholders::_1; + transform_many(::cuda::std::make_tuple(input.begin() + offset), + result.begin() + offset, + num_items - offset, + _1 * 10); // FIXME(bgruber): does not work on negative + + c2h::device_vector reference(num_items); + thrust::tabulate(reference.begin() + offset, reference.end(), (_1 + offset) * 10); + REQUIRE(reference == result); +} + +namespace Catch +{ +template +struct StringMaker> +{ + static auto convert(cub::detail::transform::aligned_base_ptr abp) -> std::string + { + std::stringstream ss; + ss << "{ptr: " << abp.ptr << ", head_padding: " << abp.head_padding << "}"; + return ss.str(); + } +}; +} // namespace Catch + +// TODO(bgruber): rewrite this example using int3 +CUB_TEST("DeviceTransform::Transform aligned_base_ptr", "[device][device_transform]") +{ + alignas(128) int arr[256]; + using namespace cub::detail::transform; + CHECK(make_aligned_base_ptr(&arr[0], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 0}); + CHECK(make_aligned_base_ptr(&arr[1], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 4}); + CHECK(make_aligned_base_ptr(&arr[5], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 20}); + CHECK(make_aligned_base_ptr(&arr[31], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 124}); + CHECK(make_aligned_base_ptr(&arr[32], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 0}); + CHECK(make_aligned_base_ptr(&arr[33], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 4}); + CHECK(make_aligned_base_ptr(&arr[127], 128) == aligned_base_ptr{reinterpret_cast(&arr[96]), 124}); + CHECK(make_aligned_base_ptr(&arr[128], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 0}); + CHECK(make_aligned_base_ptr(&arr[129], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 4}); +} diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu new file mode 100644 index 00000000000..46388ed6b23 --- /dev/null +++ b/cub/test/catch2_test_device_transform_api.cu @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include + +#include "catch2_test_helper.h" + +// need a separate function because the ext. 
lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_api() +{ + // example-begin transform-many + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{5.2f, 3.1f, -1.1f, 3.0f}; + auto input3 = thrust::counting_iterator{100}; + auto op = [] __device__(int a, float b, int c) { + return (a + b) * c; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{520, 111, 397, 618}; + // example-end transform-many + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::Transform API example", "[device][device_transform]") +{ + test_transform_api(); +} + +// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_stable_api() +{ + // example-begin transform-many-stable + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{52, 31, -11, 30}; + + auto* input1_ptr = thrust::raw_pointer_cast(input1.data()); + auto* input2_ptr = thrust::raw_pointer_cast(input2.data()); + + auto op = [input1_ptr, input2_ptr] __device__(const int& a) -> int { + const auto i = &a - input1_ptr; // we depend on the address of a + return a + input2_ptr[i]; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::TransformStableArgumentAddresses( + ::cuda::std::make_tuple(input1_ptr), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{52, 29, -6, 33}; + // example-end transform-many-stable + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::TransformStableArgumentAddresses API example", "[device][device_transform]") +{ + test_transform_stable_api(); +} diff --git a/cub/test/catch2_test_launch_helper.h b/cub/test/catch2_test_launch_helper.h index 11da1a32e3b..4add1d15d11 100644 --- a/cub/test/catch2_test_launch_helper.h +++ b/cub/test/catch2_test_launch_helper.h @@ -74,7 +74,7 @@ //! Consult with `test/catch2_test_cdp_wrapper.cu` for more usage examples. #if !defined(TEST_LAUNCH) -# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1 +# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1:2 #endif #define DECLARE_INVOCABLE(API, WRAPPED_API_NAME, TMPL_HEAD_OPT, TMPL_ARGS_OPT) \