From 4f44db0d667a2c576dd6247090ca391aa4ed1518 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 4 Nov 2020 02:57:28 +0100
Subject: [PATCH 1/4] Add GenericCachingAllocator, use it for device memory

---
 .../CUDACore/GenericCachingAllocator.h        | 426 ++++++++++++++++++
 src/cudadev/CUDACore/allocate_device.cc       |   7 +-
 src/cudadev/CUDACore/allocate_host.cc         |   3 +-
 src/cudadev/CUDACore/deviceAllocatorStatus.cc |   2 +-
 src/cudadev/CUDACore/deviceAllocatorStatus.h  |   7 +-
 .../CUDACore/getCachingDeviceAllocator.h      |  73 ++-
 6 files changed, 498 insertions(+), 20 deletions(-)
 create mode 100644 src/cudadev/CUDACore/GenericCachingAllocator.h
diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h
new file mode 100644
index 000000000..32584f055
--- /dev/null
+++ b/src/cudadev/CUDACore/GenericCachingAllocator.h
@@ -0,0 +1,426 @@
+#ifndef CUDACore_GenericCachingAllocator_h
+#define CUDACore_GenericCachingAllocator_h
+
+#include <exception>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include "deviceAllocatorStatus.h"
+
+// Inspired by cub::CachingDeviceAllocator
+
+/*
+struct CUDAHostTraits {
+  using DeviceType = int;
+  using QueueType = cudaStream_t;
+  using EventType = cudaEvent_t;
+  struct Dummy {};
+
+  constexpr DeviceType kInvalidDevice = -1;
+
+  static DeviceType currentDevice() {
+    return cms::cuda::currentDevice();
+  }
+
+  static Dummy setDevice(DeviceType device) {
+    return {};
+  }
+
+  static bool canReuseInDevice(DeviceType a, DeviceType b) {
+    // Pinned host memory can be reused in any device, but in case of
+    // changes the event must be re-created
+    return true;
+  }
+
+  static bool canReuseInQueue(QueueType a, QueueType b) {
+    // For pinned host memory a freed block without completed event
+    // can not be re-used even for operations in the same queue
+    return false;
+  }
+
+  static bool eventWorkHasCompleted(EventType e) {
+    return cms::cuda::eventWorkHasCompleted(e);
+  }
+
+  static EventType createEvent() {
+    EventType e;
+    cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+    return e;
+  }
+
+  static void destroyEvent(EventType e) {
+    cudaCheck(cudaEventDestroy(e));
+  }
+
+  static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
+    cudaCheck(cudaSetDevice(prev));
+    destroyEvent(e);
+    cudaCheck(cudaSetDevice(next));
+    return createEvent();
+  }
+
+  static EventType recordEvent(EventType e, QueueType queue) {
+    cudaCheck(cudaEventRecord(e, queue));
+  }
+
+  static std::ostream& printDevice(std::ostream& os, DeviceType dev) {
+    os << "Host";
+    return os;
+  }
+
+  static void* allocate(size_t bytes) {
+    void* ptr;
+    cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
+    return ptr;
+  }
+
+  static void* tryAllocate(size_t bytes) {
+    void* ptr;
+    auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault);
+    if (error == cudaErrorMemoryAllocation) {
+      return nullptr;
+    }
+    cudaCheck(error);
+    return ptr;
+  }
+
+  static void free(void* ptr) {
+    cudaCheck(cudaFreeHost(ptr));
+  }
+}
+*/
+
+namespace allocator {
+  inline unsigned int intPow(unsigned int base, unsigned int exp) {
+    unsigned int ret = 1;
+    while (exp > 0) {
+      if (exp & 1) {
+        ret = ret * base;
+      }
+      base = base * base;
+      exp = exp >> 1;
+    }
+    return ret;
+  }
+
+  // return (power, roundedBytes)
+  inline std::tuple<unsigned int, size_t> nearestPowerOf(unsigned int base, size_t value) {
+    unsigned int power = 0;
+    size_t roundedBytes = 1;
+    if (value * base < value) {
+      // Overflow
+      power = sizeof(size_t) * 8;
+      roundedBytes = size_t(0) - 1;
+    } else {
+      while (roundedBytes < value) {
+        roundedBytes *= base;
+        ++power;
+      }
+    }
+
+    return std::tuple(power, roundedBytes);
+  }
+}  // namespace allocator
+
+template <typename Traits>
+class GenericCachingAllocator {
+public:
+  using DeviceType = typename Traits::DeviceType;
+  using QueueType = typename Traits::QueueType;
+  using EventType = typename Traits::EventType;
+
+  using TotalBytes = cms::cuda::allocator::TotalBytes;
+  using DeviceCachedBytes = std::map<DeviceType, TotalBytes>;
+
+  explicit GenericCachingAllocator(
+      unsigned int binGrowth, unsigned int minBin, unsigned int maxBin, size_t maxCachedBytes, bool debug)
+      : cachedBlocks_(&BlockDescriptor::SizeCompare),
+        liveBlocks_(&BlockDescriptor::PtrCompare),
+        minBinBytes_(allocator::intPow(binGrowth, minBin)),
+        maxBinBytes_(allocator::intPow(binGrowth, maxBin)),
+        maxCachedBytes_(maxCachedBytes),
+        binGrowth_(binGrowth),
+        minBin_(minBin),
+        maxBin_(maxBin),
+        debug_(debug) {}
+  ~GenericCachingAllocator() { freeAllCached(); }
+
+  // Cache allocation status (for monitoring purposes)
+  DeviceCachedBytes cacheStatus() const {
+    std::scoped_lock lock(mutex_);
+    return cachedBytes_;
+  }
+
+  // Allocate given number of bytes on the current device associated to queue
+  void* allocate(DeviceType device, size_t bytes, QueueType queue) {
+    if (bytes > maxBinBytes_) {
+      throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
+                               " bytes is too large for the caching allocator with maximum bin " +
+                               std::to_string(maxBinBytes_) +
+                               " bytes. You might want to increase the maximum bin size");
+    }
+
+    // Create a block descriptor for the requested allocation
+    BlockDescriptor searchKey;
+    searchKey.bytesRequested = bytes;
+    searchKey.device = device;
+    searchKey.associatedQueue = queue;
+    if (bytes < minBinBytes_) {
+      searchKey.bin = minBin_;
+      searchKey.bytes = minBinBytes_;
+    } else {
+      std::tie(searchKey.bin, searchKey.bytes) = allocator::nearestPowerOf(binGrowth_, bytes);
+    }
+
+    // Try to re-use cached block
+    searchKey.ptr = tryReuseCachedBlock(searchKey);
+
+    // allocate if necessary
+    if (searchKey.ptr == nullptr) {
+      auto scopedSetDevice = Traits::setDevice(device);
+
+      searchKey.ptr = Traits::tryAllocate(searchKey.bytes);
+      if (searchKey.ptr == nullptr) {
+        // The allocation attempt failed: free all cached blocks on device and retry
+        if (debug_) {
+          std::cout << "\t";
+          Traits::printDevice(std::cout, device)
+              << " failed to allocate " << searchKey.bytes << " bytes for queue " << searchKey.associatedQueue
+              << ", retrying after freeing cached allocations" << std::endl;
+        }
+
+        freeCachedBlocksOnDevice(device);
+
+        searchKey.ptr = Traits::allocate(searchKey.bytes);
+      }
+
+      searchKey.readyEvent = Traits::createEvent();
+
+      {
+        std::scoped_lock lock(mutex_);
+        liveBlocks_.insert(searchKey);
+        cachedBytes_[device].live += searchKey.bytes;
+        cachedBytes_[device].liveRequested += searchKey.bytesRequested;
+      }
+
+      if (debug_) {
+        std::cout << "\t";
+        Traits::printDevice(std::cout, device)
+            << " allocated new block at " << searchKey.ptr << " (" << searchKey.bytes << " bytes associated with queue "
+            << searchKey.associatedQueue << ", event " << searchKey.readyEvent << "." << std::endl;
+      }
+    }
+
+    if (debug_) {
+      std::cout << "\t\t" << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free
+                << " bytes), " << liveBlocks_.size() << " live blocks outstanding (" << cachedBytes_[device].live
+                << " bytes)." << std::endl;
+    }
+
+    return searchKey.ptr;
+  }
+
+  // Frees an allocation on a given device
+  void free(DeviceType device, void* ptr) {
+    bool recache = false;
+    BlockDescriptor searchKey;
+    searchKey.device = device;
+    searchKey.ptr = ptr;
+
+    auto scopedSetDevice = Traits::setDevice(device);
+
+    {
+      std::scoped_lock lock(mutex_);
+
+      auto iBlock = liveBlocks_.find(searchKey);
+      if (iBlock == liveBlocks_.end()) {
+        std::stringstream ss;
+        ss << "Trying to free a non-live block at " << ptr;
+        throw std::runtime_error(ss.str());
+      }
+      searchKey = *iBlock;
+      liveBlocks_.erase(iBlock);
+      cachedBytes_[device].live -= searchKey.bytes;
+      cachedBytes_[device].liveRequested -= searchKey.bytesRequested;
+
+      recache = (cachedBytes_[device].free + searchKey.bytes <= maxCachedBytes_);
+      if (recache) {
+        cachedBlocks_.insert(searchKey);
+        cachedBytes_[device].free += searchKey.bytes;
+
+        if (debug_) {
+          std::cout << "\t";
+          Traits::printDevice(std::cout, device)
+              << " returned " << searchKey.bytes << " bytes at " << ptr << " from associated queue "
+              << searchKey.associatedQueue << " , event " << searchKey.readyEvent << " .\n\t\t " << cachedBlocks_.size()
+              << " available "
+                 "blocks cached ("
+              << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. ("
+              << cachedBytes_[device].live << " bytes)" << std::endl;
+        }
+      }
+
+      if (recache) {
+        Traits::recordEvent(searchKey.readyEvent, searchKey.associatedQueue);
+      }
+    }
+
+    if (not recache) {
+      Traits::free(ptr);
+      Traits::destroyEvent(searchKey.readyEvent);
+      if (debug_) {
+        std::cout << "\t";
+        Traits::printDevice(std::cout, device)
+            << " freed " << searchKey.bytes << " bytes at " << ptr << " from associated queue "
+            << searchKey.associatedQueue << ", event " << searchKey.readyEvent << ".\n\t\t  " << cachedBlocks_.size()
+            << " available "
+               "blocks cached ("
+            << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
+            << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
+      }
+    }
+  }
+
+private:
+  struct BlockDescriptor {
+    void* ptr = nullptr;
+    size_t bytes = 0;
+    size_t bytesRequested = 0;  // for monitoring only
+    unsigned int bin = 0;
+    DeviceType device = Traits::kInvalidDevice;
+    QueueType associatedQueue;
+    EventType readyEvent;
+
+    static bool PtrCompare(BlockDescriptor const& a, BlockDescriptor const& b) {
+      if (a.device == b.device)
+        return a.ptr < b.ptr;
+      return a.device < b.device;
+    }
+
+    static bool SizeCompare(BlockDescriptor const& a, BlockDescriptor const& b) {
+      if (a.device == b.device)
+        return a.bytes < b.bytes;
+      return a.device < b.device;
+    }
+  };
+
+  void* tryReuseCachedBlock(BlockDescriptor& searchKey) {
+    std::scoped_lock lock(mutex_);
+
+    // Iterate through the range of cached blocks on the same device in the same bin
+    for (auto iBlock = cachedBlocks_.lower_bound(searchKey);
+         iBlock != cachedBlocks_.end() and Traits::canReuseInDevice(searchKey.device, iBlock->device) and
+         iBlock->bin == searchKey.bin;
+         ++iBlock) {
+      if (Traits::canReuseInQueue(searchKey.associatedQueue, iBlock->associatedQueue) or
+          Traits::eventWorkHasCompleted(iBlock->readyEvent)) {
+        // Reuse existing cache block. Insert into live blocks.
+        auto device = searchKey.device;
+        auto queue = searchKey.associatedQueue;
+        searchKey = *iBlock;
+        searchKey.associatedQueue = queue;
+
+        if (searchKey.device != device) {
+          searchKey.readyEvent = Traits::recreateEvent(searchKey.readyEvent, searchKey.device, device);
+          searchKey.device = device;
+        }
+
+        liveBlocks_.insert(searchKey);
+
+        cachedBytes_[device].free -= searchKey.bytes;
+        cachedBytes_[device].live += searchKey.bytes;
+        cachedBytes_[device].live += searchKey.bytesRequested;
+
+        if (debug_) {
+          std::cout << "\t";
+          Traits::printDevice(std::cout, device)
+              << " reused cached block at " << searchKey.ptr << " (" << searchKey.bytes << "bytes) for queue "
+              << searchKey.associatedQueue << ", event " << searchKey.readyEvent
+              << " (previously "
+                 "associated with stream "
+              << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl;
+        }
+
+        cachedBlocks_.erase(iBlock);
+        return searchKey.ptr;
+      }
+    }
+
+    return nullptr;
+  }
+
+  void freeCachedBlocksOnDevice(DeviceType device) {
+    std::scoped_lock lock(mutex_);
+
+    BlockDescriptor freeKey;
+    freeKey.device = device;
+    for (auto iBlock = cachedBlocks_.lower_bound(freeKey);
+         iBlock != cachedBlocks_.end() and iBlock->device == device;) {
+      Traits::free(iBlock->ptr);
+      Traits::destroyEvent(iBlock->readyEvent);
+      cachedBytes_[device].free -= iBlock->bytes;
+
+      if (debug_) {
+        std::cout << "\t";
+        Traits::printDevice(std::cout, device)
+            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << cachedBlocks_.size() << " available blocks cached ("
+            << cachedBytes_[device].free << " bytes), " << liveBlocks_.size()
+            << " live blocks "
+               "("
+            << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
+      }
+
+      iBlock = cachedBlocks_.erase(iBlock);
+    }
+  }
+
+  void freeAllCached() {
+    std::scoped_lock lock(mutex_);
+
+    while (not cachedBlocks_.empty()) {
+      auto iBlock = cachedBlocks_.begin();
+      auto scopedSetDevice = Traits::setDevice(iBlock->device);
+      Traits::free(iBlock->ptr);
+      Traits::destroyEvent(iBlock->readyEvent);
+      cachedBytes_[iBlock->device].free -= iBlock->bytes;
+
+      if (debug_) {
+        std::cout << "\t";
+        Traits::printDevice(std::cout, iBlock->device)
+            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << (cachedBlocks_.size()-1) << " available blocks cached ("
+            << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
+            << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl;
+      }
+
+      cachedBlocks_.erase(iBlock);
+    }
+  }
+
+  using Compare = typename std::add_pointer<bool(BlockDescriptor const&, BlockDescriptor const&)>::type;
+  using CachedBlocks = std::multiset<BlockDescriptor, Compare>;  // ordered by size
+  using BusyBlocks = std::multiset<BlockDescriptor, Compare>;    // ordered by ptr
+
+  mutable std::mutex mutex_;
+
+  DeviceCachedBytes cachedBytes_;
+  CachedBlocks cachedBlocks_;  // Set of cached device allocations available for reuse
+  BusyBlocks liveBlocks_;      // Set of live device allocations currently in use
+
+  size_t const minBinBytes_;
+  size_t const maxBinBytes_;
+  size_t const maxCachedBytes_;  // Maximum aggregate cached bytes per device
+
+  unsigned int const binGrowth_;  // Geometric growth factor for bin-sizes
+  unsigned int const minBin_;
+  unsigned int const maxBin_;
+
+  bool const debug_;
+};
+
+#endif
diff --git a/src/cudadev/CUDACore/allocate_device.cc b/src/cudadev/CUDACore/allocate_device.cc
index 2e4a6fab9..221a25c43 100644
--- a/src/cudadev/CUDACore/allocate_device.cc
+++ b/src/cudadev/CUDACore/allocate_device.cc
@@ -10,8 +10,7 @@
 #include "getCachingDeviceAllocator.h"
 
 namespace {
-  const size_t maxAllocationSize =
-      notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
+  const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
 }
 
 namespace cms::cuda {
@@ -22,7 +21,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
+      ptr = allocator::getCachingDeviceAllocator().allocate(dev, nbytes, stream);
 #if CUDA_VERSION >= 11020
     } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
       ScopedSetDevice setDeviceForThisScope(dev);
@@ -37,7 +36,7 @@ namespace cms::cuda {
 
   void free_device(int device, void *ptr, cudaStream_t stream) {
     if constexpr (allocator::policy == allocator::Policy::Caching) {
-      cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
+      allocator::getCachingDeviceAllocator().free(device, ptr);
 #if CUDA_VERSION >= 11020
     } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
       ScopedSetDevice setDeviceForThisScope(device);
diff --git a/src/cudadev/CUDACore/allocate_host.cc b/src/cudadev/CUDACore/allocate_host.cc
index 0f972caf6..a03121918 100644
--- a/src/cudadev/CUDACore/allocate_host.cc
+++ b/src/cudadev/CUDACore/allocate_host.cc
@@ -6,8 +6,7 @@
 #include "getCachingHostAllocator.h"
 
 namespace {
-  const size_t maxAllocationSize =
-      notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
+  const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
 }
 
 namespace cms::cuda {
diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.cc b/src/cudadev/CUDACore/deviceAllocatorStatus.cc
index 5d4a0ca09..fa7f64023 100644
--- a/src/cudadev/CUDACore/deviceAllocatorStatus.cc
+++ b/src/cudadev/CUDACore/deviceAllocatorStatus.cc
@@ -3,5 +3,5 @@
 #include "getCachingDeviceAllocator.h"
 
 namespace cms::cuda {
-  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().cacheStatus(); }
 }  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.h b/src/cudadev/CUDACore/deviceAllocatorStatus.h
index 92f9f87e8..32be5af07 100644
--- a/src/cudadev/CUDACore/deviceAllocatorStatus.h
+++ b/src/cudadev/CUDACore/deviceAllocatorStatus.h
@@ -7,10 +7,9 @@ namespace cms {
   namespace cuda {
     namespace allocator {
       struct TotalBytes {
-        size_t free;
-        size_t live;
-        size_t liveRequested;  // CMS: monitor also requested amount
-        TotalBytes() { free = live = liveRequested = 0; }
+        size_t free = 0;
+        size_t live = 0;
+        size_t liveRequested = 0;
       };
       /// Map type of device ordinals to the number of cached bytes cached by each device
       using GpuCachedBytes = std::map<int, TotalBytes>;
diff --git a/src/cudadev/CUDACore/getCachingDeviceAllocator.h b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
index 95f4c90b1..190d0991a 100644
--- a/src/cudadev/CUDACore/getCachingDeviceAllocator.h
+++ b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
@@ -7,8 +7,11 @@
 #include <cuda_runtime.h>
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/currentDevice.h"
 #include "CUDACore/deviceCount.h"
-#include "CachingDeviceAllocator.h"
+#include "CUDACore/eventWorkHasCompleted.h"
+#include "CUDACore/GenericCachingAllocator.h"
+#include "CUDACore/ScopedSetDevice.h"
 
 namespace cms::cuda::allocator {
   // Use caching or not
@@ -50,7 +53,64 @@ namespace cms::cuda::allocator {
     return ret;
   }
 
-  inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
+  struct DeviceTraits {
+    using DeviceType = int;
+    using QueueType = cudaStream_t;
+    using EventType = cudaEvent_t;
+
+    static constexpr DeviceType kInvalidDevice = -1;
+
+    static DeviceType currentDevice() { return cms::cuda::currentDevice(); }
+
+    static cms::cuda::ScopedSetDevice setDevice(DeviceType device) { return cms::cuda::ScopedSetDevice(device); }
+
+    static bool canReuseInDevice(DeviceType a, DeviceType b) { return a == b; }
+
+    static bool canReuseInQueue(QueueType a, QueueType b) { return a == b; }
+
+    static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); }
+
+    static EventType createEvent() {
+      EventType e;
+      cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+      return e;
+    }
+
+    static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); }
+
+    static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
+      throw std::runtime_error("CUDADeviceTraits::recreateEvent() should never be called");
+    }
+
+    static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); }
+
+    static std::ostream& printDevice(std::ostream& os, DeviceType device) {
+      os << "Device " << device;
+      return os;
+    }
+
+    static void* allocate(size_t bytes) {
+      void* ptr;
+      cudaCheck(cudaMalloc(&ptr, bytes));
+      return ptr;
+    }
+
+    static void* tryAllocate(size_t bytes) {
+      void* ptr;
+      auto error = cudaMalloc(&ptr, bytes);
+      if (error == cudaErrorMemoryAllocation) {
+        return nullptr;
+      }
+      cudaCheck(error);
+      return ptr;
+    }
+
+    static void free(void* ptr) { cudaCheck(cudaFree(ptr)); }
+  };
+
+  using CachingDeviceAllocator = GenericCachingAllocator<DeviceTraits>;
+
+  inline CachingDeviceAllocator& getCachingDeviceAllocator() {
     if (debug) {
       std::cout << "cub::CachingDeviceAllocator settings\n"
                 << "  bin growth " << binGrowth << "\n"
@@ -58,7 +118,7 @@ namespace cms::cuda::allocator {
                 << "  max bin    " << maxBin << "\n"
                 << "  resulting bins:\n";
       for (auto bin = minBin; bin <= maxBin; ++bin) {
-        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        auto binSize = ::allocator::intPow(binGrowth, bin);
         if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
           std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
         } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
@@ -73,12 +133,7 @@ namespace cms::cuda::allocator {
     }
 
     // the public interface is thread safe
-    static notcub::CachingDeviceAllocator allocator{binGrowth,
-                                                    minBin,
-                                                    maxBin,
-                                                    minCachedBytes(),
-                                                    false,  // do not skip cleanup
-                                                    debug};
+    static CachingDeviceAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug};
     return allocator;
   }
 }  // namespace cms::cuda::allocator

From 8529091fff37c2ecb30bee06623e89e3187ab350 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 4 Nov 2020 23:19:10 +0100
Subject: [PATCH 2/4] Use GenericCachingAllocator for pinned host memory

---
 .../CUDACore/GenericCachingAllocator.h        | 93 ++-----------------
 src/cudadev/CUDACore/allocate_host.cc         |  4 +-
 .../CUDACore/getCachingHostAllocator.h        | 85 +++++++++++++++--
 3 files changed, 84 insertions(+), 98 deletions(-)

diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h
index 32584f055..a90a8acad 100644
--- a/src/cudadev/CUDACore/GenericCachingAllocator.h
+++ b/src/cudadev/CUDACore/GenericCachingAllocator.h
@@ -15,87 +15,6 @@
 
 // Inspired by cub::CachingDeviceAllocator
 
-/*
-struct CUDAHostTraits {
-  using DeviceType = int;
-  using QueueType = cudaStream_t;
-  using EventType = cudaEvent_t;
-  struct Dummy {};
-
-  constexpr DeviceType kInvalidDevice = -1;
-
-  static DeviceType currentDevice() {
-    return cms::cuda::currentDevice();
-  }
-
-  static Dummy setDevice(DeviceType device) {
-    return {};
-  }
-
-  static bool canReuseInDevice(DeviceType a, DeviceType b) {
-    // Pinned host memory can be reused in any device, but in case of
-    // changes the event must be re-created
-    return true;
-  }
-
-  static bool canReuseInQueue(QueueType a, QueueType b) {
-    // For pinned host memory a freed block without completed event
-    // can not be re-used even for operations in the same queue
-    return false;
-  }
-
-  static bool eventWorkHasCompleted(EventType e) {
-    return cms::cuda::eventWorkHasCompleted(e);
-  }
-
-  static EventType createEvent() {
-    EventType e;
-    cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
-    return e;
-  }
-
-  static void destroyEvent(EventType e) {
-    cudaCheck(cudaEventDestroy(e));
-  }
-
-  static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
-    cudaCheck(cudaSetDevice(prev));
-    destroyEvent(e);
-    cudaCheck(cudaSetDevice(next));
-    return createEvent();
-  }
-
-  static EventType recordEvent(EventType e, QueueType queue) {
-    cudaCheck(cudaEventRecord(e, queue));
-  }
-
-  static std::ostream& printDevice(std::ostream& os, DeviceType dev) {
-    os << "Host";
-    return os;
-  }
-
-  static void* allocate(size_t bytes) {
-    void* ptr;
-    cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
-    return ptr;
-  }
-
-  static void* tryAllocate(size_t bytes) {
-    void* ptr;
-    auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault);
-    if (error == cudaErrorMemoryAllocation) {
-      return nullptr;
-    }
-    cudaCheck(error);
-    return ptr;
-  }
-
-  static void free(void* ptr) {
-    cudaCheck(cudaFreeHost(ptr));
-  }
-}
-*/
-
 namespace allocator {
   inline unsigned int intPow(unsigned int base, unsigned int exp) {
     unsigned int ret = 1;
@@ -183,7 +102,7 @@ class GenericCachingAllocator {
 
     // allocate if necessary
     if (searchKey.ptr == nullptr) {
-      auto scopedSetDevice = Traits::setDevice(device);
+      [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device);
 
       searchKey.ptr = Traits::tryAllocate(searchKey.bytes);
       if (searchKey.ptr == nullptr) {
@@ -233,7 +152,7 @@ class GenericCachingAllocator {
     searchKey.device = device;
     searchKey.ptr = ptr;
 
-    auto scopedSetDevice = Traits::setDevice(device);
+    [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device);
 
     {
       std::scoped_lock lock(mutex_);
@@ -385,7 +304,7 @@ class GenericCachingAllocator {
 
     while (not cachedBlocks_.empty()) {
       auto iBlock = cachedBlocks_.begin();
-      auto scopedSetDevice = Traits::setDevice(iBlock->device);
+      [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(iBlock->device);
       Traits::free(iBlock->ptr);
       Traits::destroyEvent(iBlock->readyEvent);
       cachedBytes_[iBlock->device].free -= iBlock->bytes;
@@ -393,9 +312,9 @@ class GenericCachingAllocator {
       if (debug_) {
         std::cout << "\t";
         Traits::printDevice(std::cout, iBlock->device)
-            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << (cachedBlocks_.size()-1) << " available blocks cached ("
-            << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
-            << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl;
+            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1)
+            << " available blocks cached (" << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size()
+            << " live blocks (" << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl;
       }
 
       cachedBlocks_.erase(iBlock);
diff --git a/src/cudadev/CUDACore/allocate_host.cc b/src/cudadev/CUDACore/allocate_host.cc
index a03121918..ff58e98a2 100644
--- a/src/cudadev/CUDACore/allocate_host.cc
+++ b/src/cudadev/CUDACore/allocate_host.cc
@@ -17,7 +17,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
+      ptr = allocator::getCachingHostAllocator().allocate(allocator::HostTraits::kHostDevice, nbytes, stream);
     } else {
       cudaCheck(cudaMallocHost(&ptr, nbytes));
     }
@@ -26,7 +26,7 @@ namespace cms::cuda {
 
   void free_host(void *ptr) {
     if constexpr (allocator::policy == allocator::Policy::Caching) {
-      cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr));
+      allocator::getCachingHostAllocator().free(allocator::HostTraits::kHostDevice, ptr);
     } else {
       cudaCheck(cudaFreeHost(ptr));
     }
diff --git a/src/cudadev/CUDACore/getCachingHostAllocator.h b/src/cudadev/CUDACore/getCachingHostAllocator.h
index d29080795..a6524bedd 100644
--- a/src/cudadev/CUDACore/getCachingHostAllocator.h
+++ b/src/cudadev/CUDACore/getCachingHostAllocator.h
@@ -5,12 +5,84 @@
 #include <iostream>
 
 #include "CUDACore/cudaCheck.h"
-#include "CachingHostAllocator.h"
 
 #include "getCachingDeviceAllocator.h"
 
 namespace cms::cuda::allocator {
-  inline notcub::CachingHostAllocator& getCachingHostAllocator() {
+  struct HostTraits {
+    using DeviceType = int;
+    using QueueType = cudaStream_t;
+    using EventType = cudaEvent_t;
+    struct Dummy {};
+
+    static constexpr DeviceType kInvalidDevice = -1;
+    static constexpr DeviceType kHostDevice = 0;
+
+    static DeviceType currentDevice() { return cms::cuda::currentDevice(); }
+
+    static Dummy setDevice(DeviceType device) { return {}; }
+
+    static bool canReuseInDevice(DeviceType a, DeviceType b) {
+      // Pinned host memory can be reused in any device, but in case of
+      // changes the event must be re-created
+      return true;
+    }
+
+    static bool canReuseInQueue(QueueType a, QueueType b) {
+      // For pinned host memory a freed block without completed event
+      // can not be re-used even for operations in the same queue
+      return false;
+    }
+
+    static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); }
+
+    static EventType createEvent() {
+      EventType e;
+      cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+      return e;
+    }
+
+    static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); }
+
+    static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
+      cudaCheck(cudaSetDevice(prev));
+      destroyEvent(e);
+      cudaCheck(cudaSetDevice(next));
+      return createEvent();
+    }
+
+    static EventType recordEvent(EventType e, QueueType queue) {
+      cudaCheck(cudaEventRecord(e, queue));
+      return e;
+    }
+
+    static std::ostream& printDevice(std::ostream& os, DeviceType dev) {
+      os << "Host";
+      return os;
+    }
+
+    static void* allocate(size_t bytes) {
+      void* ptr;
+      cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
+      return ptr;
+    }
+
+    static void* tryAllocate(size_t bytes) {
+      void* ptr;
+      auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault);
+      if (error == cudaErrorMemoryAllocation) {
+        return nullptr;
+      }
+      cudaCheck(error);
+      return ptr;
+    }
+
+    static void free(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
+  };
+
+  using CachingHostAllocator = GenericCachingAllocator<HostTraits>;
+
+  inline CachingHostAllocator& getCachingHostAllocator() {
     if (debug) {
       std::cout << "cub::CachingHostAllocator settings\n"
                 << "  bin growth " << binGrowth << "\n"
@@ -18,7 +90,7 @@ namespace cms::cuda::allocator {
                 << "  max bin    " << maxBin << "\n"
                 << "  resulting bins:\n";
       for (auto bin = minBin; bin <= maxBin; ++bin) {
-        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        auto binSize = ::allocator::intPow(binGrowth, bin);
         if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
           std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
         } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
@@ -33,12 +105,7 @@ namespace cms::cuda::allocator {
     }
 
     // the public interface is thread safe
-    static notcub::CachingHostAllocator allocator{binGrowth,
-                                                  minBin,
-                                                  maxBin,
-                                                  minCachedBytes(),
-                                                  false,  // do not skip cleanup
-                                                  debug};
+    static CachingHostAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug};
     return allocator;
   }
 }  // namespace cms::cuda::allocator

From 03e690b865b4b4ba066648f2fa9e5cf26e4a52dc Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 1 Sep 2021 09:13:13 -0700
Subject: [PATCH 3/4] Fix comment

---
 src/cudadev/CUDACore/GenericCachingAllocator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h
index a90a8acad..36a931d47 100644
--- a/src/cudadev/CUDACore/GenericCachingAllocator.h
+++ b/src/cudadev/CUDACore/GenericCachingAllocator.h
@@ -76,7 +76,7 @@ class GenericCachingAllocator {
     return cachedBytes_;
   }
 
-  // Allocate given number of bytes on the current device associated to queue
+  // Allocate given number of bytes on the given device associated to given queue
   void* allocate(DeviceType device, size_t bytes, QueueType queue) {
     if (bytes > maxBinBytes_) {
       throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +

From fd96d4e20fd5c3211c2a43b70112bdb177e92e21 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 1 Sep 2021 10:10:50 -0700
Subject: [PATCH 4/4] Improve device information printing

---
 .../CUDACore/GenericCachingAllocator.h        | 79 +++++++++----------
 .../CUDACore/getCachingDeviceAllocator.h      | 16 +++-
 .../CUDACore/getCachingHostAllocator.h        | 14 +++-
 3 files changed, 58 insertions(+), 51 deletions(-)

diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h
index 36a931d47..607472485 100644
--- a/src/cudadev/CUDACore/GenericCachingAllocator.h
+++ b/src/cudadev/CUDACore/GenericCachingAllocator.h
@@ -108,10 +108,9 @@ class GenericCachingAllocator {
       if (searchKey.ptr == nullptr) {
         // The allocation attempt failed: free all cached blocks on device and retry
         if (debug_) {
-          std::cout << "\t";
-          Traits::printDevice(std::cout, device)
-              << " failed to allocate " << searchKey.bytes << " bytes for queue " << searchKey.associatedQueue
-              << ", retrying after freeing cached allocations" << std::endl;
+          std::cout << "\t" << Traits::printDevice(device) << " failed to allocate " << searchKey.bytes
+                    << " bytes for queue " << searchKey.associatedQueue << ", retrying after freeing cached allocations"
+                    << std::endl;
         }
 
         freeCachedBlocksOnDevice(device);
@@ -129,10 +128,9 @@ class GenericCachingAllocator {
       }
 
       if (debug_) {
-        std::cout << "\t";
-        Traits::printDevice(std::cout, device)
-            << " allocated new block at " << searchKey.ptr << " (" << searchKey.bytes << " bytes associated with queue "
-            << searchKey.associatedQueue << ", event " << searchKey.readyEvent << "." << std::endl;
+        std::cout << "\t" << Traits::printDevice(device) << " allocated new block at " << searchKey.ptr << " ("
+                  << searchKey.bytes << " bytes associated with queue " << searchKey.associatedQueue << ", event "
+                  << searchKey.readyEvent << "." << std::endl;
       }
     }
 
@@ -174,14 +172,13 @@ class GenericCachingAllocator {
         cachedBytes_[device].free += searchKey.bytes;
 
         if (debug_) {
-          std::cout << "\t";
-          Traits::printDevice(std::cout, device)
-              << " returned " << searchKey.bytes << " bytes at " << ptr << " from associated queue "
-              << searchKey.associatedQueue << " , event " << searchKey.readyEvent << " .\n\t\t " << cachedBlocks_.size()
-              << " available "
-                 "blocks cached ("
-              << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. ("
-              << cachedBytes_[device].live << " bytes)" << std::endl;
+          std::cout << "\t" << Traits::printDevice(device) << " returned " << searchKey.bytes << " bytes at " << ptr
+                    << " from associated queue " << searchKey.associatedQueue << " , event " << searchKey.readyEvent
+                    << " .\n\t\t " << cachedBlocks_.size()
+                    << " available "
+                       "blocks cached ("
+                    << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. ("
+                    << cachedBytes_[device].live << " bytes)" << std::endl;
         }
       }
 
@@ -194,14 +191,13 @@ class GenericCachingAllocator {
       Traits::free(ptr);
       Traits::destroyEvent(searchKey.readyEvent);
       if (debug_) {
-        std::cout << "\t";
-        Traits::printDevice(std::cout, device)
-            << " freed " << searchKey.bytes << " bytes at " << ptr << " from associated queue "
-            << searchKey.associatedQueue << ", event " << searchKey.readyEvent << ".\n\t\t  " << cachedBlocks_.size()
-            << " available "
-               "blocks cached ("
-            << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
-            << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
+        std::cout << "\t" << Traits::printDevice(device) << " freed " << searchKey.bytes << " bytes at " << ptr
+                  << " from associated queue " << searchKey.associatedQueue << ", event " << searchKey.readyEvent
+                  << ".\n\t\t  " << cachedBlocks_.size()
+                  << " available "
+                     "blocks cached ("
+                  << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
+                  << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
       }
     }
   }
@@ -257,13 +253,12 @@ class GenericCachingAllocator {
         cachedBytes_[device].live += searchKey.bytesRequested;
 
         if (debug_) {
-          std::cout << "\t";
-          Traits::printDevice(std::cout, device)
-              << " reused cached block at " << searchKey.ptr << " (" << searchKey.bytes << "bytes) for queue "
-              << searchKey.associatedQueue << ", event " << searchKey.readyEvent
-              << " (previously "
-                 "associated with stream "
-              << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl;
+          std::cout << "\t" << Traits::printDevice(device) << " reused cached block at " << searchKey.ptr << " ("
+                    << searchKey.bytes << "bytes) for queue " << searchKey.associatedQueue << ", event "
+                    << searchKey.readyEvent
+                    << " (previously "
+                       "associated with stream "
+                    << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl;
         }
 
         cachedBlocks_.erase(iBlock);
@@ -286,13 +281,12 @@ class GenericCachingAllocator {
       cachedBytes_[device].free -= iBlock->bytes;
 
       if (debug_) {
-        std::cout << "\t";
-        Traits::printDevice(std::cout, device)
-            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << cachedBlocks_.size() << " available blocks cached ("
-            << cachedBytes_[device].free << " bytes), " << liveBlocks_.size()
-            << " live blocks "
-               "("
-            << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
+        std::cout << "\t" << Traits::printDevice(device) << " freed " << iBlock->bytes << " bytes.\n\t\t  "
+                  << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free << " bytes), "
+                  << liveBlocks_.size()
+                  << " live blocks "
+                     "("
+                  << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
       }
 
       iBlock = cachedBlocks_.erase(iBlock);
@@ -310,11 +304,10 @@ class GenericCachingAllocator {
       cachedBytes_[iBlock->device].free -= iBlock->bytes;
 
       if (debug_) {
-        std::cout << "\t";
-        Traits::printDevice(std::cout, iBlock->device)
-            << " freed " << iBlock->bytes << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1)
-            << " available blocks cached (" << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size()
-            << " live blocks (" << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl;
+        std::cout << "\t" << Traits::printDevice(iBlock->device) << " freed " << iBlock->bytes << " bytes.\n\t\t  "
+                  << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_[iBlock->device].free
+                  << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_[iBlock->device].live
+                  << " bytes) outstanding." << std::endl;
       }
 
       cachedBlocks_.erase(iBlock);
diff --git a/src/cudadev/CUDACore/getCachingDeviceAllocator.h b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
index 190d0991a..62471b453 100644
--- a/src/cudadev/CUDACore/getCachingDeviceAllocator.h
+++ b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
@@ -84,10 +84,13 @@ namespace cms::cuda::allocator {
 
     static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); }
 
-    static std::ostream& printDevice(std::ostream& os, DeviceType device) {
-      os << "Device " << device;
-      return os;
-    }
+    struct DevicePrinter {
+      DevicePrinter(DeviceType device) : device_(device) {}
+      void write(std::ostream& os) const { os << "Device " << device_; }
+      DeviceType device_;
+    };
+
+    static DevicePrinter printDevice(DeviceType device) { return DevicePrinter(device); }
 
     static void* allocate(size_t bytes) {
       void* ptr;
@@ -108,6 +111,11 @@ namespace cms::cuda::allocator {
     static void free(void* ptr) { cudaCheck(cudaFree(ptr)); }
   };
 
+  inline std::ostream& operator<<(std::ostream& os, DeviceTraits::DevicePrinter const& pr) {
+    pr.write(os);
+    return os;
+  }
+
   using CachingDeviceAllocator = GenericCachingAllocator<DeviceTraits>;
 
   inline CachingDeviceAllocator& getCachingDeviceAllocator() {
diff --git a/src/cudadev/CUDACore/getCachingHostAllocator.h b/src/cudadev/CUDACore/getCachingHostAllocator.h
index a6524bedd..0692c1686 100644
--- a/src/cudadev/CUDACore/getCachingHostAllocator.h
+++ b/src/cudadev/CUDACore/getCachingHostAllocator.h
@@ -56,10 +56,11 @@ namespace cms::cuda::allocator {
       return e;
     }
 
-    static std::ostream& printDevice(std::ostream& os, DeviceType dev) {
-      os << "Host";
-      return os;
-    }
+    struct DevicePrinter {
+      static void write(std::ostream& os) { os << "Host"; }
+    };
+
+    static DevicePrinter printDevice(DeviceType dev) { return DevicePrinter(); }
 
     static void* allocate(size_t bytes) {
       void* ptr;
@@ -80,6 +81,11 @@ namespace cms::cuda::allocator {
     static void free(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
   };
 
+  inline std::ostream& operator<<(std::ostream& os, HostTraits::DevicePrinter const& pr) {
+    pr.write(os);
+    return os;
+  }
+
   using CachingHostAllocator = GenericCachingAllocator<HostTraits>;
 
   inline CachingHostAllocator& getCachingHostAllocator() {