diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h
new file mode 100644
index 000000000..607472485
--- /dev/null
+++ b/src/cudadev/CUDACore/GenericCachingAllocator.h
@@ -0,0 +1,338 @@
+#ifndef CUDACore_GenericCachingAllocator_h
+#define CUDACore_GenericCachingAllocator_h
+
+#include <cstddef>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include "deviceAllocatorStatus.h"
+
+// Inspired by cub::CachingDeviceAllocator
+
+namespace allocator {
+  // Integer exponentiation by squaring: returns base**exp.
+  inline unsigned int intPow(unsigned int base, unsigned int exp) {
+    unsigned int ret = 1;
+    while (exp > 0) {
+      if (exp & 1) {
+        ret = ret * base;
+      }
+      base = base * base;
+      exp = exp >> 1;
+    }
+    return ret;
+  }
+
+  // Returns (power, roundedBytes) such that roundedBytes == base**power and
+  // roundedBytes is the smallest such value >= value.
+  inline std::tuple<unsigned int, size_t> nearestPowerOf(unsigned int base, size_t value) {
+    unsigned int power = 0;
+    size_t roundedBytes = 1;
+    if (value * base < value) {
+      // Overflow
+      power = sizeof(size_t) * 8;
+      roundedBytes = size_t(0) - 1;
+    } else {
+      while (roundedBytes < value) {
+        roundedBytes *= base;
+        ++power;
+      }
+    }
+
+    return std::tuple(power, roundedBytes);
+  }
+}  // namespace allocator
+
+// Caching allocator with geometrically-sized bins, generic over a Traits
+// type that supplies the device/queue/event primitives (see DeviceTraits and
+// HostTraits for the CUDA implementations). Freed blocks are cached and
+// handed out again once the recorded "ready" event has completed. The public
+// interface (allocate/free/cacheStatus) is thread safe.
+template <typename Traits>
+class GenericCachingAllocator {
+public:
+  using DeviceType = typename Traits::DeviceType;
+  using QueueType = typename Traits::QueueType;
+  using EventType = typename Traits::EventType;
+
+  using TotalBytes = cms::cuda::allocator::TotalBytes;
+  using DeviceCachedBytes = std::map<DeviceType, TotalBytes>;
+
+  explicit GenericCachingAllocator(
+      unsigned int binGrowth, unsigned int minBin, unsigned int maxBin, size_t maxCachedBytes, bool debug)
+      : cachedBlocks_(&BlockDescriptor::SizeCompare),
+        liveBlocks_(&BlockDescriptor::PtrCompare),
+        minBinBytes_(allocator::intPow(binGrowth, minBin)),
+        maxBinBytes_(allocator::intPow(binGrowth, maxBin)),
+        maxCachedBytes_(maxCachedBytes),
+        binGrowth_(binGrowth),
+        minBin_(minBin),
+        maxBin_(maxBin),
+        debug_(debug) {}
+  ~GenericCachingAllocator() { freeAllCached(); }
+
+  // Cache allocation status (for monitoring purposes)
+  DeviceCachedBytes cacheStatus() const {
+    std::scoped_lock lock(mutex_);
+    return cachedBytes_;
+  }
+
+  // Allocate given number of bytes on the given device associated to given queue
+  void* allocate(DeviceType device, size_t bytes, QueueType queue) {
+    if (bytes > maxBinBytes_) {
+      throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
+                               " bytes is too large for the caching allocator with maximum bin " +
+                               std::to_string(maxBinBytes_) +
+                               " bytes. You might want to increase the maximum bin size");
+    }
+
+    // Create a block descriptor for the requested allocation
+    BlockDescriptor searchKey;
+    searchKey.bytesRequested = bytes;
+    searchKey.device = device;
+    searchKey.associatedQueue = queue;
+    if (bytes < minBinBytes_) {
+      searchKey.bin = minBin_;
+      searchKey.bytes = minBinBytes_;
+    } else {
+      std::tie(searchKey.bin, searchKey.bytes) = allocator::nearestPowerOf(binGrowth_, bytes);
+    }
+
+    // Try to re-use cached block
+    searchKey.ptr = tryReuseCachedBlock(searchKey);
+
+    // allocate if necessary
+    if (searchKey.ptr == nullptr) {
+      [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device);
+
+      searchKey.ptr = Traits::tryAllocate(searchKey.bytes);
+      if (searchKey.ptr == nullptr) {
+        // The allocation attempt failed: free all cached blocks on device and retry
+        if (debug_) {
+          std::cout << "\t" << Traits::printDevice(device) << " failed to allocate " << searchKey.bytes
+                    << " bytes for queue " << searchKey.associatedQueue
+                    << ", retrying after freeing cached allocations" << std::endl;
+        }
+
+        freeCachedBlocksOnDevice(device);
+
+        // Second attempt: let a failure here propagate as an exception
+        searchKey.ptr = Traits::allocate(searchKey.bytes);
+      }
+
+      searchKey.readyEvent = Traits::createEvent();
+
+      {
+        std::scoped_lock lock(mutex_);
+        liveBlocks_.insert(searchKey);
+        cachedBytes_[device].live += searchKey.bytes;
+        cachedBytes_[device].liveRequested += searchKey.bytesRequested;
+      }
+
+      if (debug_) {
+        std::cout << "\t" << Traits::printDevice(device) << " allocated new block at " << searchKey.ptr << " ("
+                  << searchKey.bytes << " bytes associated with queue " << searchKey.associatedQueue << ", event "
+                  << searchKey.readyEvent << ")." << std::endl;
+      }
+    }
+
+    if (debug_) {
+      // NOTE(review): read outside mutex_, values may be slightly stale; debug output only
+      std::cout << "\t\t" << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free
+                << " bytes), " << liveBlocks_.size() << " live blocks outstanding (" << cachedBytes_[device].live
+                << " bytes)." << std::endl;
+    }
+
+    return searchKey.ptr;
+  }
+
+  // Frees an allocation on a given device. The block is returned to the
+  // cache unless that would exceed maxCachedBytes_, in which case it is
+  // released immediately.
+  void free(DeviceType device, void* ptr) {
+    bool recache = false;
+    BlockDescriptor searchKey;
+    searchKey.device = device;
+    searchKey.ptr = ptr;
+
+    [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device);
+
+    {
+      std::scoped_lock lock(mutex_);
+
+      auto iBlock = liveBlocks_.find(searchKey);
+      if (iBlock == liveBlocks_.end()) {
+        std::stringstream ss;
+        ss << "Trying to free a non-live block at " << ptr;
+        throw std::runtime_error(ss.str());
+      }
+      searchKey = *iBlock;
+      liveBlocks_.erase(iBlock);
+      cachedBytes_[device].live -= searchKey.bytes;
+      cachedBytes_[device].liveRequested -= searchKey.bytesRequested;
+
+      recache = (cachedBytes_[device].free + searchKey.bytes <= maxCachedBytes_);
+      if (recache) {
+        cachedBlocks_.insert(searchKey);
+        cachedBytes_[device].free += searchKey.bytes;
+        // Record the event so the block is handed out again only after all
+        // work queued so far on this queue has completed.
+        Traits::recordEvent(searchKey.readyEvent, searchKey.associatedQueue);
+
+        if (debug_) {
+          std::cout << "\t" << Traits::printDevice(device) << " returned " << searchKey.bytes << " bytes at " << ptr
+                    << " from associated queue " << searchKey.associatedQueue << " , event " << searchKey.readyEvent
+                    << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
+                    << cachedBytes_[device].free << " bytes), " << liveBlocks_.size()
+                    << " live blocks outstanding. (" << cachedBytes_[device].live << " bytes)" << std::endl;
+        }
+      }
+    }
+
+    if (not recache) {
+      Traits::free(ptr);
+      Traits::destroyEvent(searchKey.readyEvent);
+      if (debug_) {
+        std::cout << "\t" << Traits::printDevice(device) << " freed " << searchKey.bytes << " bytes at " << ptr
+                  << " from associated queue " << searchKey.associatedQueue << ", event " << searchKey.readyEvent
+                  << ".\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
+                  << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks ("
+                  << cachedBytes_[device].live << " bytes) outstanding." << std::endl;
+      }
+    }
+  }
+
+private:
+  struct BlockDescriptor {
+    void* ptr = nullptr;
+    size_t bytes = 0;           // rounded (bin) size actually allocated
+    size_t bytesRequested = 0;  // for monitoring only
+    unsigned int bin = 0;
+    DeviceType device = Traits::kInvalidDevice;
+    QueueType associatedQueue;
+    EventType readyEvent;
+
+    static bool PtrCompare(BlockDescriptor const& a, BlockDescriptor const& b) {
+      if (a.device == b.device)
+        return a.ptr < b.ptr;
+      return a.device < b.device;
+    }
+
+    static bool SizeCompare(BlockDescriptor const& a, BlockDescriptor const& b) {
+      if (a.device == b.device)
+        return a.bytes < b.bytes;
+      return a.device < b.device;
+    }
+  };
+
+  // Looks for a cached block of the right bin whose pending work has
+  // completed; on success moves it to the live set and returns its pointer,
+  // otherwise returns nullptr.
+  void* tryReuseCachedBlock(BlockDescriptor& searchKey) {
+    std::scoped_lock lock(mutex_);
+
+    // Iterate through the range of cached blocks on the same device in the same bin
+    for (auto iBlock = cachedBlocks_.lower_bound(searchKey);
+         iBlock != cachedBlocks_.end() and Traits::canReuseInDevice(searchKey.device, iBlock->device) and
+         iBlock->bin == searchKey.bin;
+         ++iBlock) {
+      if (Traits::canReuseInQueue(searchKey.associatedQueue, iBlock->associatedQueue) or
+          Traits::eventWorkHasCompleted(iBlock->readyEvent)) {
+        // Reuse existing cache block. Insert into live blocks.
+        auto device = searchKey.device;
+        auto queue = searchKey.associatedQueue;
+        // Preserve the size of *this* request: *iBlock carries the stale
+        // bytesRequested of the request that originally allocated the block.
+        auto bytesRequested = searchKey.bytesRequested;
+        searchKey = *iBlock;
+        searchKey.associatedQueue = queue;
+        searchKey.bytesRequested = bytesRequested;
+
+        if (searchKey.device != device) {
+          searchKey.readyEvent = Traits::recreateEvent(searchKey.readyEvent, searchKey.device, device);
+          searchKey.device = device;
+        }
+
+        liveBlocks_.insert(searchKey);
+
+        cachedBytes_[device].free -= searchKey.bytes;
+        cachedBytes_[device].live += searchKey.bytes;
+        // Fixed: was "live += bytesRequested", which double-counted the
+        // request into .live and never updated .liveRequested.
+        cachedBytes_[device].liveRequested += searchKey.bytesRequested;
+
+        if (debug_) {
+          std::cout << "\t" << Traits::printDevice(device) << " reused cached block at " << searchKey.ptr << " ("
+                    << searchKey.bytes << " bytes) for queue " << searchKey.associatedQueue << ", event "
+                    << searchKey.readyEvent << " (previously associated with stream " << iBlock->associatedQueue
+                    << " , event " << iBlock->readyEvent << ")." << std::endl;
+        }
+
+        cachedBlocks_.erase(iBlock);
+        return searchKey.ptr;
+      }
+    }
+
+    return nullptr;
+  }
+
+  // Releases every cached (free) block of the given device. Live blocks are
+  // not touched. Caller is responsible for having set the current device.
+  void freeCachedBlocksOnDevice(DeviceType device) {
+    std::scoped_lock lock(mutex_);
+
+    BlockDescriptor freeKey;
+    freeKey.device = device;
+    for (auto iBlock = cachedBlocks_.lower_bound(freeKey);
+         iBlock != cachedBlocks_.end() and iBlock->device == device;) {
+      Traits::free(iBlock->ptr);
+      Traits::destroyEvent(iBlock->readyEvent);
+      cachedBytes_[device].free -= iBlock->bytes;
+
+      if (debug_) {
+        std::cout << "\t" << Traits::printDevice(device) << " freed " << iBlock->bytes << " bytes.\n\t\t "
+                  << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free
+                  << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_[device].live
+                  << " bytes) outstanding." << std::endl;
+      }
+
+      iBlock = cachedBlocks_.erase(iBlock);
+    }
+  }
+
+  // Releases all cached blocks on all devices; called from the destructor.
+  void freeAllCached() {
+    std::scoped_lock lock(mutex_);
+
+    while (not cachedBlocks_.empty()) {
+      auto iBlock = cachedBlocks_.begin();
+      [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(iBlock->device);
+      Traits::free(iBlock->ptr);
+      Traits::destroyEvent(iBlock->readyEvent);
+      cachedBytes_[iBlock->device].free -= iBlock->bytes;
+
+      if (debug_) {
+        std::cout << "\t" << Traits::printDevice(iBlock->device) << " freed " << iBlock->bytes << " bytes.\n\t\t "
+                  << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_[iBlock->device].free
+                  << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_[iBlock->device].live
+                  << " bytes) outstanding." << std::endl;
+      }
+
+      cachedBlocks_.erase(iBlock);
+    }
+  }
+
+  using Compare = typename std::add_pointer<bool(BlockDescriptor const&, BlockDescriptor const&)>::type;
+  using CachedBlocks = std::multiset<BlockDescriptor, Compare>;  // ordered by size
+  using BusyBlocks = std::multiset<BlockDescriptor, Compare>;    // ordered by ptr
+
+  mutable std::mutex mutex_;
+
+  DeviceCachedBytes cachedBytes_;
+  CachedBlocks cachedBlocks_;  // Set of cached device allocations available for reuse
+  BusyBlocks liveBlocks_;      // Set of live device allocations currently in use
+
+  size_t const minBinBytes_;
+  size_t const maxBinBytes_;
+  size_t const maxCachedBytes_;  // Maximum aggregate cached bytes per device
+
+  unsigned int const binGrowth_;  // Geometric growth factor for bin-sizes
+  unsigned int const minBin_;
+  unsigned int const maxBin_;
+
+  bool const debug_;
+};
+
+#endif
diff --git a/src/cudadev/CUDACore/allocate_device.cc b/src/cudadev/CUDACore/allocate_device.cc
index 2e4a6fab9..221a25c43 100644
--- a/src/cudadev/CUDACore/allocate_device.cc
+++ b/src/cudadev/CUDACore/allocate_device.cc
@@ -10,8 +10,7 @@
 #include "getCachingDeviceAllocator.h"
 
 namespace {
-  const size_t maxAllocationSize =
-      notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
+  const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
 }
 
 namespace cms::cuda {
@@ -22,7 +21,7 @@ namespace cms::cuda {
       throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
     }
-    cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
+    ptr = allocator::getCachingDeviceAllocator().allocate(dev, nbytes, stream);
 #if CUDA_VERSION >= 11020
   } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
     ScopedSetDevice setDeviceForThisScope(dev);
@@ -37,7 +36,7 @@ namespace cms::cuda {
 
   void free_device(int device, void *ptr, cudaStream_t stream) {
     if constexpr (allocator::policy == allocator::Policy::Caching) {
-      cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
+      allocator::getCachingDeviceAllocator().free(device, ptr);
 #if CUDA_VERSION >= 11020
     } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
       ScopedSetDevice setDeviceForThisScope(device);
diff --git a/src/cudadev/CUDACore/allocate_host.cc b/src/cudadev/CUDACore/allocate_host.cc
index 0f972caf6..ff58e98a2 100644
--- a/src/cudadev/CUDACore/allocate_host.cc
+++ b/src/cudadev/CUDACore/allocate_host.cc
@@ -6,8 +6,7 @@
 #include "getCachingHostAllocator.h"
 
 namespace {
-  const size_t maxAllocationSize =
-      notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
+  const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin);
 }
 
 namespace cms::cuda {
@@ -18,7 +17,7 @@ namespace cms::cuda {
       throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
     }
-    cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
+    ptr = allocator::getCachingHostAllocator().allocate(allocator::HostTraits::kHostDevice, nbytes, stream);
   } else {
     cudaCheck(cudaMallocHost(&ptr, nbytes));
   }
@@ -27,7 +26,7 @@ namespace cms::cuda {
 
   void free_host(void *ptr) {
     if constexpr (allocator::policy == allocator::Policy::Caching) {
-      cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr));
+      allocator::getCachingHostAllocator().free(allocator::HostTraits::kHostDevice, ptr);
     } else {
       cudaCheck(cudaFreeHost(ptr));
     }
diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.cc b/src/cudadev/CUDACore/deviceAllocatorStatus.cc
index 5d4a0ca09..fa7f64023 100644
--- a/src/cudadev/CUDACore/deviceAllocatorStatus.cc
+++ b/src/cudadev/CUDACore/deviceAllocatorStatus.cc
@@ -3,5 +3,5 @@
 #include "getCachingDeviceAllocator.h"
 
 namespace cms::cuda {
-  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().cacheStatus(); }
 }  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.h b/src/cudadev/CUDACore/deviceAllocatorStatus.h
index 92f9f87e8..32be5af07 100644
--- a/src/cudadev/CUDACore/deviceAllocatorStatus.h
+++ b/src/cudadev/CUDACore/deviceAllocatorStatus.h
@@ -7,10 +7,9 @@ namespace cms {
   namespace cuda {
     namespace allocator {
       struct TotalBytes {
-        size_t free;
-        size_t live;
-        size_t liveRequested;  // CMS: monitor also requested amount
-        TotalBytes() { free = live = liveRequested = 0; }
+        size_t free = 0;
+        size_t live = 0;
+        size_t liveRequested = 0;
       };
       /// Map type of device ordinals to the number of cached bytes cached by each device
       using GpuCachedBytes = std::map<int, TotalBytes>;
diff --git a/src/cudadev/CUDACore/getCachingDeviceAllocator.h b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
index 95f4c90b1..62471b453 100644
--- a/src/cudadev/CUDACore/getCachingDeviceAllocator.h
+++ b/src/cudadev/CUDACore/getCachingDeviceAllocator.h
@@ -7,8 +7,11 @@
 #include <iostream>
 
 #include "CUDACore/cudaCheck.h"
+#include "CUDACore/currentDevice.h"
 #include "CUDACore/deviceCount.h"
-#include "CachingDeviceAllocator.h"
+#include "CUDACore/eventWorkHasCompleted.h"
+#include "CUDACore/GenericCachingAllocator.h"
+#include "CUDACore/ScopedSetDevice.h"
 
 namespace cms::cuda::allocator {
   // Use caching or not
@@ -50,7 +53,72 @@ namespace cms::cuda::allocator {
     return ret;
   }
 
-  inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
+  // Adapts GenericCachingAllocator to CUDA device memory.
+  struct DeviceTraits {
+    using DeviceType = int;
+    using QueueType = cudaStream_t;
+    using EventType = cudaEvent_t;
+
+    static constexpr DeviceType kInvalidDevice = -1;
+
+    static DeviceType currentDevice() { return cms::cuda::currentDevice(); }
+
+    static cms::cuda::ScopedSetDevice setDevice(DeviceType device) { return cms::cuda::ScopedSetDevice(device); }
+
+    // Device memory can be re-used only on the device it was allocated on
+    static bool canReuseInDevice(DeviceType a, DeviceType b) { return a == b; }
+
+    // A freed block can be re-used immediately in the same stream
+    static bool canReuseInQueue(QueueType a, QueueType b) { return a == b; }
+
+    static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); }
+
+    static EventType createEvent() {
+      EventType e;
+      cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+      return e;
+    }
+
+    static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); }
+
+    // Never needed: canReuseInDevice() forbids cross-device re-use
+    static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
+      throw std::runtime_error("CUDADeviceTraits::recreateEvent() should never be called");
+    }
+
+    static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); }
+
+    struct DevicePrinter {
+      DevicePrinter(DeviceType device) : device_(device) {}
+      void write(std::ostream& os) const { os << "Device " << device_; }
+      DeviceType device_;
+    };
+
+    static DevicePrinter printDevice(DeviceType device) { return DevicePrinter(device); }
+
+    static void* allocate(size_t bytes) {
+      void* ptr;
+      cudaCheck(cudaMalloc(&ptr, bytes));
+      return ptr;
+    }
+
+    // Returns nullptr (instead of throwing) on a failed allocation
+    static void* tryAllocate(size_t bytes) {
+      void* ptr;
+      auto error = cudaMalloc(&ptr, bytes);
+      if (error == cudaErrorMemoryAllocation) {
+        return nullptr;
+      }
+      cudaCheck(error);
+      return ptr;
+    }
+
+    static void free(void* ptr) { cudaCheck(cudaFree(ptr)); }
+  };
+
+  inline std::ostream& operator<<(std::ostream& os, DeviceTraits::DevicePrinter const& pr) {
+    pr.write(os);
+    return os;
+  }
+
+  using CachingDeviceAllocator = GenericCachingAllocator<DeviceTraits>;
+
+  inline CachingDeviceAllocator& getCachingDeviceAllocator() {
     if (debug) {
       std::cout << "cub::CachingDeviceAllocator settings\n"
                 << "  bin growth " << binGrowth << "\n"
@@ -58,7 +126,7 @@ namespace cms::cuda::allocator {
                 << "  max bin    " << maxBin << "\n"
                 << "  resulting bins:\n";
       for (auto bin = minBin; bin <= maxBin; ++bin) {
-        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        auto binSize = ::allocator::intPow(binGrowth, bin);
         if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
           std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
         } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
@@ -73,12 +141,7 @@ namespace cms::cuda::allocator {
     }
 
     // the public interface is thread safe
-    static notcub::CachingDeviceAllocator allocator{binGrowth,
-                                                    minBin,
-                                                    maxBin,
-                                                    minCachedBytes(),
-                                                    false,  // do not skip cleanup
-                                                    debug};
+    static CachingDeviceAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug};
     return allocator;
   }
 }  // namespace cms::cuda::allocator
diff --git a/src/cudadev/CUDACore/getCachingHostAllocator.h b/src/cudadev/CUDACore/getCachingHostAllocator.h
index d29080795..0692c1686 100644
--- a/src/cudadev/CUDACore/getCachingHostAllocator.h
+++ b/src/cudadev/CUDACore/getCachingHostAllocator.h
@@ -5,12 +5,90 @@
 #include <iostream>
 
 #include "CUDACore/cudaCheck.h"
-#include "CachingHostAllocator.h"
 #include "getCachingDeviceAllocator.h"
 
 namespace cms::cuda::allocator {
-  inline notcub::CachingHostAllocator& getCachingHostAllocator() {
+  // Adapts GenericCachingAllocator to CUDA pinned host memory.
+  struct HostTraits {
+    using DeviceType = int;
+    using QueueType = cudaStream_t;
+    using EventType = cudaEvent_t;
+    struct Dummy {};
+
+    static constexpr DeviceType kInvalidDevice = -1;
+    static constexpr DeviceType kHostDevice = 0;
+
+    static DeviceType currentDevice() { return cms::cuda::currentDevice(); }
+
+    // Host allocations are device-independent; nothing to set
+    static Dummy setDevice(DeviceType device) { return {}; }
+
+    static bool canReuseInDevice(DeviceType a, DeviceType b) {
+      // Pinned host memory can be reused in any device, but in case of
+      // changes the event must be re-created
+      return true;
+    }
+
+    static bool canReuseInQueue(QueueType a, QueueType b) {
+      // For pinned host memory a freed block without completed event
+      // can not be re-used even for operations in the same queue
+      return false;
+    }
+
+    static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); }
+
+    static EventType createEvent() {
+      EventType e;
+      cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+      return e;
+    }
+
+    static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); }
+
+    // Destroy the event on the device it was created on, create a new one on 'next'
+    static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) {
+      cudaCheck(cudaSetDevice(prev));
+      destroyEvent(e);
+      cudaCheck(cudaSetDevice(next));
+      return createEvent();
+    }
+
+    static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); }
+
+    struct DevicePrinter {
+      static void write(std::ostream& os) { os << "Host"; }
+    };
+
+    static DevicePrinter printDevice(DeviceType dev) { return DevicePrinter(); }
+
+    static void* allocate(size_t bytes) {
+      void* ptr;
+      cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
+      return ptr;
+    }
+
+    // Returns nullptr (instead of throwing) on a failed allocation
+    static void* tryAllocate(size_t bytes) {
+      void* ptr;
+      auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault);
+      if (error == cudaErrorMemoryAllocation) {
+        return nullptr;
+      }
+      cudaCheck(error);
+      return ptr;
+    }
+
+    static void free(void* ptr) { cudaCheck(cudaFreeHost(ptr)); }
+  };
+
+  inline std::ostream& operator<<(std::ostream& os, HostTraits::DevicePrinter const& pr) {
+    pr.write(os);
+    return os;
+  }
+
+  using CachingHostAllocator = GenericCachingAllocator<HostTraits>;
+
+  inline CachingHostAllocator& getCachingHostAllocator() {
     if (debug) {
       std::cout << "cub::CachingHostAllocator settings\n"
                 << "  bin growth " << binGrowth << "\n"
@@ -18,7 +96,7 @@ namespace cms::cuda::allocator {
                 << "  max bin    " << maxBin << "\n"
                 << "  resulting bins:\n";
       for (auto bin = minBin; bin <= maxBin; ++bin) {
-        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
+        auto binSize = ::allocator::intPow(binGrowth, bin);
         if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
           std::cout << "    " << std::setw(8) << (binSize >> 30) << " GB\n";
         } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
@@ -33,12 +111,7 @@ namespace cms::cuda::allocator {
     }
 
     // the public interface is thread safe
-    static notcub::CachingHostAllocator allocator{binGrowth,
-                                                  minBin,
-                                                  maxBin,
-                                                  minCachedBytes(),
-                                                  false,  // do not skip cleanup
-                                                  debug};
+    static CachingHostAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug};
     return allocator;
   }
 }  // namespace cms::cuda::allocator