diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
index 575fcf63b8eaa..5136af4724736 100644
--- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -4,6 +4,6 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
-  data_d_ = cms::cuda::make_device_unique<Data>(stream);
+  data_d_ = cms::cuda::make_device_unique<Data>();
   cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
 }
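The hunk above is the first consumer of the stream-less allocation introduced later in this patch: the device-side Data block is now owned by the BeamSpotCUDA object itself rather than tied to the stream. A minimal sketch of the two flavors, assuming the interfaces added below (names taken from this patch):

    // stream-ordered: once freed, the block becomes reusable only after all
    // work queued in `stream` before the free has completed
    auto a = cms::cuda::make_device_unique<float[]>(1024, stream);
    // stream-less: no device-side ownership, reusable immediately on free
    auto b = cms::cuda::make_device_unique<float[]>(1024);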
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 6fec0026dfaa1..4a22cdd23a43b 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -56,29 +56,19 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = cms::cuda::device::unique_ptr<T>;
 
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_host_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_device_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
   };
 
@@ -86,24 +76,19 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = cms::cuda::host::unique_ptr<T>;
 
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_host_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
    }
 
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_device_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
   };
 
@@ -111,26 +96,46 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = std::unique_ptr<T>;
 
+    template <typename T>
+    static auto make_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
+    template <typename T>
+    static auto make_unique(size_t size) {
+      return std::make_unique<T>(size);
+    }
     template <typename T>
     static auto make_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
     }
 
+    template <typename T>
+    static auto make_host_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_host_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
 
+    template <typename T>
+    static auto make_device_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_device_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
+    template <typename T>
+    static auto make_device_unique(size_t size) {
+      return std::make_unique<T>(size);
+    }
     template <typename T>
     static auto make_device_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
@@ -146,13 +151,12 @@ class HeterogeneousSoAImpl {
   template <typename V>
   using unique_ptr = typename Traits::template unique_ptr<V>;
 
-  HeterogeneousSoAImpl() = default;  // make root happy
+  HeterogeneousSoAImpl();
   ~HeterogeneousSoAImpl() = default;
   HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
   HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
 
   explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
 
   T const *get() const { return m_ptr.get(); }
 
@@ -165,8 +169,8 @@ class HeterogeneousSoAImpl {
 };
 
 template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
+HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl() {
+  m_ptr = Traits::template make_unique<T>();
 }
 
 // in reality valid only for GPU version...
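The variadic rewrite above collapses the per-arity trait overloads into single perfect-forwarding entry points, so both the stream-ordered and the new stream-less allocations are reachable through the same trait. A short sketch of the call sites this enables (Histo is a placeholder type; n and stream are assumed in scope):

    using Traits = cudaCompat::GPUTraits;
    auto h = Traits::make_device_unique<Histo>();             // scalar, stream-less
    auto v = Traits::make_device_unique<float[]>(n);          // array, stream-less
    auto w = Traits::make_device_unique<float[]>(n, stream);  // array, stream-ordered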
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
index 7bef9d0d8a52f..978b1ae776fa4 100644
--- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
+++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
@@ -5,17 +5,19 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
 SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) {
-  moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
-  clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
+  moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
+  clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
+  moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
+  clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
   view->moduleStart_ = moduleStart_d.get();
   view->clusInModule_ = clusInModule_d.get();
   view->moduleId_ = moduleId_d.get();
   view->clusModuleStart_ = clusModuleStart_d.get();
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
+  view_d = cms::cuda::make_device_unique<DeviceConstView>();
   cms::cuda::copyAsync(view_d, view, stream);
 }
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
index ffef71092f6c9..abf5124f80db4 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -9,11 +9,13 @@
 SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream)
     : formatterErrors_h(std::move(errors)) {
-  error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
-  data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
+  error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>();
+  data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords);
 
   cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   error_h = cms::cuda::make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
   GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
   assert(error_h->empty());
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
index 664364b6ff25a..656d69d4da5d8 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -5,15 +5,17 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
-  xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream);
+  xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords);
 
-  pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
-  rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
+  pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
+  rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
   view->xx_ = xx_d.get();
   view->yy_ = yy_d.get();
@@ -21,7 +23,7 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
   view->moduleInd_ = moduleInd_d.get();
   view->clus_ = clus_d.get();
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
+  view_d = cms::cuda::make_device_unique<DeviceConstView>();
   cms::cuda::copyAsync(view_d, view, stream);
 }
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 955f97ca6bd54..1221db3219ef8 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -73,11 +73,13 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
                                                                      uint32_t const* hitsModuleStart,
                                                                      cudaStream_t stream)
     : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
 
   view->m_nHits = nHits;
-  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
-  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>();
+  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>();
   view->m_averageGeometry = m_AverageGeometryStore.get();
   view->m_cpeParams = cpeParams;
   view->m_hitsModuleStart = hitsModuleStart;
@@ -101,9 +103,9 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   // if ordering is relevant they may have to be stored phi-ordered by layer or so
   // this will break 1to1 correspondence with cluster and module locality
   // so unless proven VERY inefficient we keep it ordered as generated
-  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
-  m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);
+  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16);
+  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11);
+  m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>();
 
   auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
   auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
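The four data formats above all follow the same staging pattern, shown here in isolation (View stands in for the respective DeviceConstView/SOA view type):

    auto view = cms::cuda::make_host_unique<View>(stream);  // stream-ordered pinned host block
    // ... fill *view with device pointers ...
    view_d = cms::cuda::make_device_unique<View>();         // stream-less, owned by the data format
    cms::cuda::copyAsync(view_d, view, stream);
    // `view` may be destroyed right away: its stream association defers reuse
    // of the pinned block until the queued copy has finished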
diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
index 9c271fc2fbff1..1cc014978ecb1 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
@@ -6,6 +6,14 @@ namespace cms {
   namespace cuda {
     // Allocate device memory
+    // This variant does not create device-side ownership
+    void *allocate_device(int dev, size_t nbytes);
+
+    // Allocate device memory
+    // This variant creates device-side ownership. When freed, all work
+    // in the stream up to the freeing point must be finished for the
+    // memory block to be considered free (except for new allocation in
+    // the same stream)
     void *allocate_device(int dev, size_t nbytes, cudaStream_t stream);
 
     // Free device memory (to be called from unique_ptr)
diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
index 1bba4580028d3..7e6f87bd9b04e 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
@@ -6,6 +6,14 @@ namespace cms {
   namespace cuda {
     // Allocate pinned host memory (to be called from unique_ptr)
+    // This variant does not create device-side ownership
+    void *allocate_host(size_t nbytes);
+
+    // Allocate pinned host memory (to be called from unique_ptr)
+    // This variant creates device-side ownership. When freed, all work
+    // in the stream up to the freeing point must be finished for the
+    // memory block to be considered free (except for new allocation in
+    // the same stream)
     void *allocate_host(size_t nbytes, cudaStream_t stream);
 
     // Free pinned host memory (to be called from unique_ptr)
diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
index eb86c05be465c..756991c323de0 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
@@ -48,6 +48,16 @@ namespace cms {
       }  // namespace impl
     }    // namespace device
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the device memory is not supported");
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, sizeof(T));
+      return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                              device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
       static_assert(std::is_trivially_constructible<T>::value,
@@ -58,6 +68,17 @@ namespace cms {
     }
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the device memory is not supported");
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, n * sizeof(element_type));
+      return typename device::impl::make_device_unique_selector<T>::unbounded_array{
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
                                                                                               cudaStream_t stream) {
@@ -74,6 +95,14 @@ namespace cms {
     typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;
 
     // No check for the trivial constructor, make it clear in the interface
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized() {
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, sizeof(T));
+      return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                              device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
         cudaStream_t stream) {
@@ -83,6 +112,15 @@ namespace cms {
     }
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, n * sizeof(element_type));
+      return typename device::impl::make_device_unique_selector<T>::unbounded_array{
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
         size_t n, cudaStream_t stream) {
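The new stream-less make_device_unique overloads mirror the stream-ordered ones except for calling the two-argument allocate_device; the selector still dispatches scalars, unbounded arrays, and (deleted) bounded arrays. A usage sketch under those rules:

    auto s = cms::cuda::make_device_unique<double>();      // non_array overload
    auto a = cms::cuda::make_device_unique<double[]>(16);  // unbounded_array overload
    // make_device_unique<double[16]>() stays deleted (bounded_array), and a type
    // with a non-trivial constructor still trips the static_assert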
diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
index a64b7c9869d6c..d408fd7081c4d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
@@ -37,6 +37,14 @@ namespace cms {
     }  // namespace host
 
     // Allocate pinned host memory
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      void *mem = allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
       static_assert(std::is_trivially_constructible<T>::value,
@@ -45,6 +53,15 @@ namespace cms {
     }
 
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      void *mem = allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
@@ -58,12 +75,25 @@ namespace cms {
     typename host::impl::make_host_unique_selector<T>::bounded_array make_host_unique(Args &&...) = delete;
 
     // No check for the trivial constructor, make it clear in the interface
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized() {
+      void *mem = allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized(cudaStream_t stream) {
       void *mem = allocate_host(sizeof(T), stream);
       return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
     }
 
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique_uninitialized(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      void *mem = allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique_uninitialized(
         size_t n, cudaStream_t stream) {
diff --git a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
index 075d568f21039..e884ae2039cff 100644
--- a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
@@ -126,19 +126,29 @@ namespace notcub {
       int device;                      // device ordinal
       cudaStream_t associated_stream;  // Associated associated_stream
       cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
+      bool has_stream;  // Whether or not the block has an associated stream (CMS: support for no stream association)
 
       // Constructor (suitable for searching maps for a specific block, given its pointer and device)
+      // CMS: add initialization of has_stream
       BlockDescriptor(void *d_ptr, int device)
-          : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {}
+          : d_ptr(d_ptr),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(nullptr),
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
+      // CMS: add initialization of has_stream
       BlockDescriptor(int device)
           : d_ptr(nullptr),
             bytes(0),
             bin(INVALID_BIN),
             device(device),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Comparison functor for comparing device pointers
       static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
@@ -321,9 +331,10 @@ namespace notcub {
      * streams when all prior work submitted to \p active_stream has completed.
      */
     cudaError_t DeviceAllocate(
-        int device,    ///< [in] Device on which to place the allocation
-        void **d_ptr,  ///< [out] Reference to pointer to the allocation
-        size_t bytes,  ///< [in] Minimum number of bytes for the allocation
+        int device,       ///< [in] Device on which to place the allocation
+        void **d_ptr,     ///< [out] Reference to pointer to the allocation
+        size_t bytes,     ///< [in] Minimum number of bytes for the allocation
+        bool has_stream,  ///< [in] Whether or not to associate the stream with this allocation (CMS: support for no stream association)
         cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
       *d_ptr = nullptr;
@@ -339,6 +350,7 @@ namespace notcub {
       // Create a block descriptor for the requested allocation
       bool found = false;
       BlockDescriptor search_key(device);
+      search_key.has_stream = has_stream;  // CMS: support for no stream association
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
 
@@ -364,12 +376,14 @@ namespace notcub {
              (block_itr->bin == search_key.bin)) {
         // To prevent races with reusing blocks returned by the host but still
         // in use by the device, only consider cached blocks that are
-        // either (from the active stream) or (from an idle stream)
-        if ((active_stream == block_itr->associated_stream) ||
+        // either (not associated to a stream) or (from the active stream) or (from an idle stream)
+        // CMS: do stream association check only if the block is associated to a stream
+        if ((!block_itr->has_stream) || (has_stream && active_stream == block_itr->associated_stream) ||
             (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
           // Reuse existing cache block. Insert into live blocks.
           found = true;
           search_key = *block_itr;
+          search_key.has_stream = has_stream;  // CMS: support for no stream association
           search_key.associated_stream = active_stream;
           live_blocks.insert(search_key);
 
@@ -377,18 +391,35 @@ namespace notcub {
           cached_bytes[device].free -= search_key.bytes;
           cached_bytes[device].live += search_key.bytes;
 
-          if (debug)
+          if (debug) {
             // CMS: improved debug message
-            _CubLog(
-                "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
-                "associated with stream %lld, event %lld).\n",
-                device,
-                search_key.d_ptr,
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.ready_event,
-                (long long)block_itr->associated_stream,
-                (long long)block_itr->ready_event);
+            // CMS: support for no stream association
+            if (search_key.has_stream) {
+              _CubLog(
+                  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
+                  "associated %d with stream %lld, event %lld).\n",
+                  device,
+                  search_key.d_ptr,
+                  (long long)search_key.bytes,
+                  (long long)search_key.associated_stream,
+                  (long long)search_key.ready_event,
+                  block_itr->has_stream,
+                  (long long)block_itr->associated_stream,
+                  (long long)block_itr->ready_event);
+            } else {
+              _CubLog(
+                  "\tDevice %d reused cached block at %p (%lld bytes) without stream association, event %lld "
+                  "(previously "
+                  "associated %d with stream %lld, event %lld).\n",
+                  device,
+                  search_key.d_ptr,
+                  (long long)search_key.bytes,
+                  (long long)search_key.ready_event,
+                  block_itr->has_stream,
+                  (long long)block_itr->associated_stream,
+                  (long long)block_itr->ready_event);
+            }
+          }
 
           cached_blocks.erase(block_itr);
 
@@ -414,12 +445,23 @@ namespace notcub {
       // Attempt to allocate
       if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
         // The allocation attempt failed: free all cached blocks on device and retry
-        if (debug)
-          _CubLog(
-              "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-              device,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream);
+        if (debug) {
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached "
+                "allocations",
+                device,
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream);
+          } else {
+            _CubLog(
+                "\tDevice %d failed to allocate %lld bytes without stream association, retrying after freeing cached "
+                "allocations",
+                device,
+                (long long)search_key.bytes);
+          }
+        }
 
         error = cudaSuccess;  // Reset the error we will return
         cudaGetLastError();   // Reset CUDART's error
@@ -483,15 +525,26 @@ namespace notcub {
       cached_bytes[device].live += search_key.bytes;
       mutex.Unlock();
 
-      if (debug)
+      if (debug) {
         // CMS: improved debug message
-        _CubLog(
-            "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
-            device,
-            search_key.d_ptr,
-            (long long)search_key.bytes,
-            (long long)search_key.associated_stream,
-            (long long)search_key.ready_event);
+        // CMS: support for no stream association
+        if (search_key.has_stream) {
+          _CubLog(
+              "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
+              device,
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.associated_stream,
+              (long long)search_key.ready_event);
+        } else {
+          _CubLog(
+              "\tDevice %d allocated new device block at %p (%lld bytes without stream association, event %lld).\n",
+              device,
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.ready_event);
+        }
+      }
 
       // Attempt to revert back to previous device if necessary
       if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
@@ -565,20 +618,35 @@ namespace notcub {
         cached_blocks.insert(search_key);
         cached_bytes[device].free += search_key.bytes;
 
-        if (debug)
+        if (debug) {
           // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-              device,
-              (long long)search_key.bytes,
-              d_ptr,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[device].live);
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          } else {
+            _CubLog(
+                "\tDevice %d returned %lld bytes at %p without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          }
+        }
       }
     }
 
@@ -590,7 +658,8 @@ namespace notcub {
         return error;
       }
 
-      if (recached) {
+      // CMS: support for no stream association
+      if (recached && search_key.has_stream) {
         // Insert the ready event in the associated stream (must have current device set properly)
         if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
           return error;
@@ -606,20 +675,35 @@ namespace notcub {
         if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
          return error;
 
-        if (debug)
+        if (debug) {
          // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              device,
-              (long long)search_key.bytes,
-              d_ptr,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[device].live);
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          } else {
+            _CubLog(
+                "\tDevice %d freed %lld bytes at %p without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          }
+        }
       }
 
       // Reset device
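The reuse rule that DeviceAllocate applies after this change can be summarized as follows (a paraphrase of the condition in the hunk above, not code from the patch):

    bool reusable = !block_itr->has_stream  // block was never stream-associated
                    || (has_stream && active_stream == block_itr->associated_stream)  // same stream
                    || cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady;   // stream gone idle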
diff --git a/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
index 53901e1f1cb27..9f5695e6b555f 100644
--- a/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
@@ -127,6 +127,7 @@ namespace notcub {
       int device;                      // device ordinal
       cudaStream_t associated_stream;  // Associated associated_stream
       cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
+      bool has_stream;  // Whether or not the block has an associated stream
 
       // Constructor (suitable for searching maps for a specific block, given its pointer)
       BlockDescriptor(void *d_ptr)
@@ -135,7 +136,8 @@ namespace notcub {
             bin(INVALID_BIN),
             device(INVALID_DEVICE_ORDINAL),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Constructor (suitable for searching maps for a range of suitable blocks)
       BlockDescriptor()
@@ -144,7 +146,8 @@ namespace notcub {
             bin(INVALID_BIN),
             device(INVALID_DEVICE_ORDINAL),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Comparison functor for comparing host pointers
       static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
@@ -312,6 +315,7 @@ namespace notcub {
     cudaError_t HostAllocate(
         void **d_ptr,     ///< [out] Reference to pointer to the allocation
         size_t bytes,     ///< [in] Minimum number of bytes for the allocation
+        bool has_stream,  ///< [in] Whether or not to associate the stream with this allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
       *d_ptr = nullptr;
@@ -325,6 +329,7 @@ namespace notcub {
       bool found = false;
       BlockDescriptor search_key;
       search_key.device = device;
+      search_key.has_stream = has_stream;
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
 
@@ -347,12 +352,15 @@ namespace notcub {
       // Iterate through the range of cached blocks in the same bin
       CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
       while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
-        // To prevent races with reusing blocks returned by the host but still
-        // in use for transfers, only consider cached blocks that are from an idle stream
-        if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
+        // To prevent races with reusing blocks returned by the host
+        // but still in use for transfers, only consider cached
+        // blocks that are from an idle stream or have no stream
+        // association
+        if ((!block_itr->has_stream) or cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
           // Reuse existing cache block. Insert into live blocks.
           found = true;
           search_key = *block_itr;
+          search_key.has_stream = has_stream;
           search_key.associated_stream = active_stream;
           if (search_key.device != device) {
             // If "associated" device changes, need to re-create the event on the right device
@@ -373,17 +381,29 @@ namespace notcub {
         cached_bytes.free -= search_key.bytes;
         cached_bytes.live += search_key.bytes;
 
-        if (debug)
-          _CubLog(
-              "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
-              "(previously associated with stream %lld, event %lld).\n",
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)block_itr->associated_stream,
-              (long long)block_itr->ready_event);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
+                "(previously associated with stream %lld, event %lld).\n",
+                search_key.d_ptr,
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)block_itr->associated_stream,
+                (long long)block_itr->ready_event);
+          } else {
+            _CubLog(
+                "\tHost reused cached block at %p (%lld bytes) without stream association "
+                "(previously associated %d with stream %lld, event %lld).\n",
+                search_key.d_ptr,
+                (long long)search_key.bytes,
+                block_itr->has_stream,
+                (long long)block_itr->associated_stream,
+                (long long)block_itr->ready_event);
+          }
+        }
 
         cached_blocks.erase(block_itr);
 
@@ -403,13 +423,21 @@ namespace notcub {
       if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
           cudaErrorMemoryAllocation) {
         // The allocation attempt failed: free all cached blocks on device and retry
-        if (debug)
-          _CubLog(
-              "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
-              "allocations",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.device);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
+                "allocations",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.device);
+          } else {
+            _CubLog(
+                "\tHost failed to allocate %lld bytes without stream association, retrying after freeing cached "
+                "allocations",
+                (long long)search_key.bytes);
+          }
+        }
 
         error = cudaSuccess;  // Reset the error we will return
         cudaGetLastError();   // Reset CUDART's error
@@ -471,15 +499,22 @@ namespace notcub {
       cached_bytes.live += search_key.bytes;
       mutex.Unlock();
 
-      if (debug)
-        _CubLog(
-            "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
-            "%lld).\n",
-            search_key.d_ptr,
-            (long long)search_key.bytes,
-            (long long)search_key.associated_stream,
-            (long long)search_key.ready_event,
-            (long long)search_key.device);
+      if (debug) {
+        if (search_key.has_stream) {
+          _CubLog(
+              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
+              "%lld).\n",
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.associated_stream,
+              (long long)search_key.ready_event,
+              (long long)search_key.device);
+        } else {
+          _CubLog("\tHost allocated new host block at %p (%lld bytes without stream association).\n",
+                  search_key.d_ptr,
+                  (long long)search_key.bytes);
+        }
+      }
     }
 
     // Copy host pointer to output parameter
@@ -524,18 +559,30 @@ namespace notcub {
         cached_blocks.insert(search_key);
         cached_bytes.free += search_key.bytes;
 
-        if (debug)
-          _CubLog(
-              "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
-              "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
+                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          } else {
+            _CubLog(
+                "\tHost returned %lld bytes without stream association.\n\t\t %lld "
+                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                (long long)search_key.bytes,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          }
+        }
       }
     }
 
@@ -546,7 +593,7 @@ namespace notcub {
         return error;
       }
 
-      if (recached) {
+      if (recached && search_key.has_stream) {
         // Insert the ready event in the associated stream (must have current device set properly)
         if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
           return error;
@@ -562,18 +609,30 @@ namespace notcub {
        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
          return error;
 
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          } else {
+            _CubLog(
+                "\tHost freed %lld bytes without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long)search_key.bytes,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          }
+        }
       }
 
       // Reset device
diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
index c3a33fcee3553..68870f2bdb91f 100644
--- a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
@@ -13,6 +13,21 @@ namespace {
 }
 
 namespace cms::cuda {
+  void *allocate_device(int dev, size_t nbytes) {
+    void *ptr = nullptr;
+    if constexpr (allocator::useCaching) {
+      if (UNLIKELY(nbytes > maxAllocationSize)) {
+        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
+                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
+      }
+      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, false));
+    } else {
+      ScopedSetDevice setDeviceForThisScope(dev);
+      cudaCheck(cudaMalloc(&ptr, nbytes));
+    }
+    return ptr;
+  }
+
   void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) {
     void *ptr = nullptr;
     if constexpr (allocator::useCaching) {
@@ -20,7 +35,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
+      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, true, stream));
     } else {
       ScopedSetDevice setDeviceForThisScope(dev);
       cudaCheck(cudaMalloc(&ptr, nbytes));
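With caching disabled both variants reduce to a plain cudaMalloc, so the has_stream flag only matters for the caching allocator. At the call level the mapping is:

    void *p1 = cms::cuda::allocate_device(dev, nbytes);          // DeviceAllocate(dev, &p1, nbytes, false)
    void *p2 = cms::cuda::allocate_device(dev, nbytes, stream);  // DeviceAllocate(dev, &p2, nbytes, true, stream)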
diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
index 1e8c8f9cd33c5..746e998296865 100644
--- a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
@@ -12,6 +12,20 @@ namespace {
 }
 
 namespace cms::cuda {
+  void *allocate_host(size_t nbytes) {
+    void *ptr = nullptr;
+    if constexpr (allocator::useCaching) {
+      if (UNLIKELY(nbytes > maxAllocationSize)) {
+        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
+                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
+      }
+      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, false));
+    } else {
+      cudaCheck(cudaMallocHost(&ptr, nbytes));
+    }
+    return ptr;
+  }
+
   void *allocate_host(size_t nbytes, cudaStream_t stream) {
     void *ptr = nullptr;
     if constexpr (allocator::useCaching) {
@@ -19,7 +33,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
+      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, true, stream));
     } else {
       cudaCheck(cudaMallocHost(&ptr, nbytes));
     }
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 53af26ac7527d..dd43c04753fbb 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -549,7 +549,7 @@ namespace pixelgpudetails {
     }
 
     clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream);
-    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
+    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2);
 
     if (wordCounter)  // protect in case of empty event....
     {
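The rule of thumb implied by the updated call sites: keep the stream argument for buffers that may be freed while stream-ordered work on them is still pending, and drop it for buffers owned by an object that outlives that work. nModules_Clusters_h falls in the second category, assuming (not shown in this patch) that the surrounding code synchronizes the stream before reading it. A hedged sketch:

    // owned by the event data product, freed only after the stream's work is done
    auto dev_buf = cms::cuda::make_device_unique<uint32_t[]>(n);
    // short-lived staging buffer, may be freed as soon as the copy is queued
    auto host_buf = cms::cuda::make_host_unique<uint32_t[]>(n, stream);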