From 4f44db0d667a2c576dd6247090ca391aa4ed1518 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 4 Nov 2020 02:57:28 +0100 Subject: [PATCH 1/4] Add GenericCachingAllocator, use it for device memory --- .../CUDACore/GenericCachingAllocator.h | 426 ++++++++++++++++++ src/cudadev/CUDACore/allocate_device.cc | 7 +- src/cudadev/CUDACore/allocate_host.cc | 3 +- src/cudadev/CUDACore/deviceAllocatorStatus.cc | 2 +- src/cudadev/CUDACore/deviceAllocatorStatus.h | 7 +- .../CUDACore/getCachingDeviceAllocator.h | 73 ++- 6 files changed, 498 insertions(+), 20 deletions(-) create mode 100644 src/cudadev/CUDACore/GenericCachingAllocator.h diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h new file mode 100644 index 000000000..32584f055 --- /dev/null +++ b/src/cudadev/CUDACore/GenericCachingAllocator.h @@ -0,0 +1,426 @@ +#ifndef CUDACore_GenericCachingAllocator_h +#define CUDACore_GenericCachingAllocator_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deviceAllocatorStatus.h" + +// Inspired by cub::CachingDeviceAllocator + +/* +struct CUDAHostTraits { + using DeviceType = int; + using QueueType = cudaStream_t; + using EventType = cudaEvent_t; + struct Dummy {}; + + constexpr DeviceType kInvalidDevice = -1; + + static DeviceType currentDevice() { + return cms::cuda::currentDevice(); + } + + static Dummy setDevice(DeviceType device) { + return {}; + } + + static bool canReuseInDevice(DeviceType a, DeviceType b) { + // Pinned host memory can be reused in any device, but in case of + // changes the event must be re-created + return true; + } + + static bool canReuseInQueue(QueueType a, QueueType b) { + // For pinned host memory a freed block without completed event + // can not be re-used even for operations in the same queue + return false; + } + + static bool eventWorkHasCompleted(EventType e) { + return cms::cuda::eventWorkHasCompleted(e); + } + + static EventType createEvent() { + EventType e; + cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + return e; + } + + static void destroyEvent(EventType e) { + cudaCheck(cudaEventDestroy(e)); + } + + static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) { + cudaCheck(cudaSetDevice(prev)); + destroyEvent(e); + cudaCheck(cudaSetDevice(next)); + return createEvent(); + } + + static EventType recordEvent(EventType e, QueueType queue) { + cudaCheck(cudaEventRecord(e, queue)); + } + + static std::ostream& printDevice(std::ostream& os, DeviceType dev) { + os << "Host"; + return os; + } + + static void* allocate(size_t bytes) { + void* ptr; + cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault)); + return ptr; + } + + static void* tryAllocate(size_t bytes) { + void* ptr; + auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault); + if (error == cudaErrorMemoryAllocation) { + return nullptr; + } + cudaCheck(error); + return ptr; + } + + static void free(void* ptr) { + cudaCheck(cudaFreeHost(ptr)); + } +} +*/ + +namespace allocator { + inline unsigned int intPow(unsigned int base, unsigned int exp) { + unsigned int ret = 1; + while (exp > 0) { + if (exp & 1) { + ret = ret * base; + } + base = base * base; + exp = exp >> 1; + } + return ret; + } + + // return (power, roundedBytes) + inline std::tuple nearestPowerOf(unsigned int base, size_t value) { + unsigned int power = 0; + size_t roundedBytes = 1; + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + roundedBytes = size_t(0) - 1; + } else { + while (roundedBytes < value) { + roundedBytes *= base; + ++power; + } + } + + return std::tuple(power, roundedBytes); + } +} // namespace allocator + +template +class GenericCachingAllocator { +public: + using DeviceType = typename Traits::DeviceType; + using QueueType = typename Traits::QueueType; + using EventType = typename Traits::EventType; + + using TotalBytes = cms::cuda::allocator::TotalBytes; + using DeviceCachedBytes = std::map; + + explicit GenericCachingAllocator( + unsigned int binGrowth, unsigned int minBin, unsigned int maxBin, size_t maxCachedBytes, bool debug) + : cachedBlocks_(&BlockDescriptor::SizeCompare), + liveBlocks_(&BlockDescriptor::PtrCompare), + minBinBytes_(allocator::intPow(binGrowth, minBin)), + maxBinBytes_(allocator::intPow(binGrowth, maxBin)), + maxCachedBytes_(maxCachedBytes), + binGrowth_(binGrowth), + minBin_(minBin), + maxBin_(maxBin), + debug_(debug) {} + ~GenericCachingAllocator() { freeAllCached(); } + + // Cache allocation status (for monitoring purposes) + DeviceCachedBytes cacheStatus() const { + std::scoped_lock lock(mutex_); + return cachedBytes_; + } + + // Allocate given number of bytes on the current device associated to queue + void* allocate(DeviceType device, size_t bytes, QueueType queue) { + if (bytes > maxBinBytes_) { + throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + + " bytes is too large for the caching allocator with maximum bin " + + std::to_string(maxBinBytes_) + + " bytes. You might want to increase the maximum bin size"); + } + + // Create a block descriptor for the requested allocation + BlockDescriptor searchKey; + searchKey.bytesRequested = bytes; + searchKey.device = device; + searchKey.associatedQueue = queue; + if (bytes < minBinBytes_) { + searchKey.bin = minBin_; + searchKey.bytes = minBinBytes_; + } else { + std::tie(searchKey.bin, searchKey.bytes) = allocator::nearestPowerOf(binGrowth_, bytes); + } + + // Try to re-use cached block + searchKey.ptr = tryReuseCachedBlock(searchKey); + + // allocate if necessary + if (searchKey.ptr == nullptr) { + auto scopedSetDevice = Traits::setDevice(device); + + searchKey.ptr = Traits::tryAllocate(searchKey.bytes); + if (searchKey.ptr == nullptr) { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " failed to allocate " << searchKey.bytes << " bytes for queue " << searchKey.associatedQueue + << ", retrying after freeing cached allocations" << std::endl; + } + + freeCachedBlocksOnDevice(device); + + searchKey.ptr = Traits::allocate(searchKey.bytes); + } + + searchKey.readyEvent = Traits::createEvent(); + + { + std::scoped_lock lock(mutex_); + liveBlocks_.insert(searchKey); + cachedBytes_[device].live += searchKey.bytes; + cachedBytes_[device].liveRequested += searchKey.bytesRequested; + } + + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " allocated new block at " << searchKey.ptr << " (" << searchKey.bytes << " bytes associated with queue " + << searchKey.associatedQueue << ", event " << searchKey.readyEvent << "." << std::endl; + } + } + + if (debug_) { + std::cout << "\t\t" << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free + << " bytes), " << liveBlocks_.size() << " live blocks outstanding (" << cachedBytes_[device].live + << " bytes)." << std::endl; + } + + return searchKey.ptr; + } + + // Frees an allocation on a given device + void free(DeviceType device, void* ptr) { + bool recache = false; + BlockDescriptor searchKey; + searchKey.device = device; + searchKey.ptr = ptr; + + auto scopedSetDevice = Traits::setDevice(device); + + { + std::scoped_lock lock(mutex_); + + auto iBlock = liveBlocks_.find(searchKey); + if (iBlock == liveBlocks_.end()) { + std::stringstream ss; + ss << "Trying to free a non-live block at " << ptr; + throw std::runtime_error(ss.str()); + } + searchKey = *iBlock; + liveBlocks_.erase(iBlock); + cachedBytes_[device].live -= searchKey.bytes; + cachedBytes_[device].liveRequested -= searchKey.bytesRequested; + + recache = (cachedBytes_[device].free + searchKey.bytes <= maxCachedBytes_); + if (recache) { + cachedBlocks_.insert(searchKey); + cachedBytes_[device].free += searchKey.bytes; + + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " returned " << searchKey.bytes << " bytes at " << ptr << " from associated queue " + << searchKey.associatedQueue << " , event " << searchKey.readyEvent << " .\n\t\t " << cachedBlocks_.size() + << " available " + "blocks cached (" + << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. (" + << cachedBytes_[device].live << " bytes)" << std::endl; + } + } + + if (recache) { + Traits::recordEvent(searchKey.readyEvent, searchKey.associatedQueue); + } + } + + if (not recache) { + Traits::free(ptr); + Traits::destroyEvent(searchKey.readyEvent); + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " freed " << searchKey.bytes << " bytes at " << ptr << " from associated queue " + << searchKey.associatedQueue << ", event " << searchKey.readyEvent << ".\n\t\t " << cachedBlocks_.size() + << " available " + "blocks cached (" + << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks (" + << cachedBytes_[device].live << " bytes) outstanding." << std::endl; + } + } + } + +private: + struct BlockDescriptor { + void* ptr = nullptr; + size_t bytes = 0; + size_t bytesRequested = 0; // for monitoring only + unsigned int bin = 0; + DeviceType device = Traits::kInvalidDevice; + QueueType associatedQueue; + EventType readyEvent; + + static bool PtrCompare(BlockDescriptor const& a, BlockDescriptor const& b) { + if (a.device == b.device) + return a.ptr < b.ptr; + return a.device < b.device; + } + + static bool SizeCompare(BlockDescriptor const& a, BlockDescriptor const& b) { + if (a.device == b.device) + return a.bytes < b.bytes; + return a.device < b.device; + } + }; + + void* tryReuseCachedBlock(BlockDescriptor& searchKey) { + std::scoped_lock lock(mutex_); + + // Iterate through the range of cached blocks on the same device in the same bin + for (auto iBlock = cachedBlocks_.lower_bound(searchKey); + iBlock != cachedBlocks_.end() and Traits::canReuseInDevice(searchKey.device, iBlock->device) and + iBlock->bin == searchKey.bin; + ++iBlock) { + if (Traits::canReuseInQueue(searchKey.associatedQueue, iBlock->associatedQueue) or + Traits::eventWorkHasCompleted(iBlock->readyEvent)) { + // Reuse existing cache block. Insert into live blocks. + auto device = searchKey.device; + auto queue = searchKey.associatedQueue; + searchKey = *iBlock; + searchKey.associatedQueue = queue; + + if (searchKey.device != device) { + searchKey.readyEvent = Traits::recreateEvent(searchKey.readyEvent, searchKey.device, device); + searchKey.device = device; + } + + liveBlocks_.insert(searchKey); + + cachedBytes_[device].free -= searchKey.bytes; + cachedBytes_[device].live += searchKey.bytes; + cachedBytes_[device].live += searchKey.bytesRequested; + + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " reused cached block at " << searchKey.ptr << " (" << searchKey.bytes << "bytes) for queue " + << searchKey.associatedQueue << ", event " << searchKey.readyEvent + << " (previously " + "associated with stream " + << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl; + } + + cachedBlocks_.erase(iBlock); + return searchKey.ptr; + } + } + + return nullptr; + } + + void freeCachedBlocksOnDevice(DeviceType device) { + std::scoped_lock lock(mutex_); + + BlockDescriptor freeKey; + freeKey.device = device; + for (auto iBlock = cachedBlocks_.lower_bound(freeKey); + iBlock != cachedBlocks_.end() and iBlock->device == device;) { + Traits::free(iBlock->ptr); + Traits::destroyEvent(iBlock->readyEvent); + cachedBytes_[device].free -= iBlock->bytes; + + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, device) + << " freed " << iBlock->bytes << " bytes.\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() + << " live blocks " + "(" + << cachedBytes_[device].live << " bytes) outstanding." << std::endl; + } + + iBlock = cachedBlocks_.erase(iBlock); + } + } + + void freeAllCached() { + std::scoped_lock lock(mutex_); + + while (not cachedBlocks_.empty()) { + auto iBlock = cachedBlocks_.begin(); + auto scopedSetDevice = Traits::setDevice(iBlock->device); + Traits::free(iBlock->ptr); + Traits::destroyEvent(iBlock->readyEvent); + cachedBytes_[iBlock->device].free -= iBlock->bytes; + + if (debug_) { + std::cout << "\t"; + Traits::printDevice(std::cout, iBlock->device) + << " freed " << iBlock->bytes << " bytes.\n\t\t " << (cachedBlocks_.size()-1) << " available blocks cached (" + << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() << " live blocks (" + << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl; + } + + cachedBlocks_.erase(iBlock); + } + } + + using Compare = typename std::add_pointer::type; + using CachedBlocks = std::multiset; // ordered by size + using BusyBlocks = std::multiset; // ordered by ptr + + mutable std::mutex mutex_; + + DeviceCachedBytes cachedBytes_; + CachedBlocks cachedBlocks_; // Set of cached device allocations available for reuse + BusyBlocks liveBlocks_; // Set of live device allocations currently in use + + size_t const minBinBytes_; + size_t const maxBinBytes_; + size_t const maxCachedBytes_; // Maximum aggregate cached bytes per device + + unsigned int const binGrowth_; // Geometric growth factor for bin-sizes + unsigned int const minBin_; + unsigned int const maxBin_; + + bool const debug_; +}; + +#endif diff --git a/src/cudadev/CUDACore/allocate_device.cc b/src/cudadev/CUDACore/allocate_device.cc index 2e4a6fab9..221a25c43 100644 --- a/src/cudadev/CUDACore/allocate_device.cc +++ b/src/cudadev/CUDACore/allocate_device.cc @@ -10,8 +10,7 @@ #include "getCachingDeviceAllocator.h" namespace { - const size_t maxAllocationSize = - notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); + const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); } namespace cms::cuda { @@ -22,7 +21,7 @@ namespace cms::cuda { throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); } - cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream)); + ptr = allocator::getCachingDeviceAllocator().allocate(dev, nbytes, stream); #if CUDA_VERSION >= 11020 } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) { ScopedSetDevice setDeviceForThisScope(dev); @@ -37,7 +36,7 @@ namespace cms::cuda { void free_device(int device, void *ptr, cudaStream_t stream) { if constexpr (allocator::policy == allocator::Policy::Caching) { - cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr)); + allocator::getCachingDeviceAllocator().free(device, ptr); #if CUDA_VERSION >= 11020 } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) { ScopedSetDevice setDeviceForThisScope(device); diff --git a/src/cudadev/CUDACore/allocate_host.cc b/src/cudadev/CUDACore/allocate_host.cc index 0f972caf6..a03121918 100644 --- a/src/cudadev/CUDACore/allocate_host.cc +++ b/src/cudadev/CUDACore/allocate_host.cc @@ -6,8 +6,7 @@ #include "getCachingHostAllocator.h" namespace { - const size_t maxAllocationSize = - notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); + const size_t maxAllocationSize = allocator::intPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); } namespace cms::cuda { diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.cc b/src/cudadev/CUDACore/deviceAllocatorStatus.cc index 5d4a0ca09..fa7f64023 100644 --- a/src/cudadev/CUDACore/deviceAllocatorStatus.cc +++ b/src/cudadev/CUDACore/deviceAllocatorStatus.cc @@ -3,5 +3,5 @@ #include "getCachingDeviceAllocator.h" namespace cms::cuda { - allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); } + allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().cacheStatus(); } } // namespace cms::cuda diff --git a/src/cudadev/CUDACore/deviceAllocatorStatus.h b/src/cudadev/CUDACore/deviceAllocatorStatus.h index 92f9f87e8..32be5af07 100644 --- a/src/cudadev/CUDACore/deviceAllocatorStatus.h +++ b/src/cudadev/CUDACore/deviceAllocatorStatus.h @@ -7,10 +7,9 @@ namespace cms { namespace cuda { namespace allocator { struct TotalBytes { - size_t free; - size_t live; - size_t liveRequested; // CMS: monitor also requested amount - TotalBytes() { free = live = liveRequested = 0; } + size_t free = 0; + size_t live = 0; + size_t liveRequested = 0; }; /// Map type of device ordinals to the number of cached bytes cached by each device using GpuCachedBytes = std::map; diff --git a/src/cudadev/CUDACore/getCachingDeviceAllocator.h b/src/cudadev/CUDACore/getCachingDeviceAllocator.h index 95f4c90b1..190d0991a 100644 --- a/src/cudadev/CUDACore/getCachingDeviceAllocator.h +++ b/src/cudadev/CUDACore/getCachingDeviceAllocator.h @@ -7,8 +7,11 @@ #include #include "CUDACore/cudaCheck.h" +#include "CUDACore/currentDevice.h" #include "CUDACore/deviceCount.h" -#include "CachingDeviceAllocator.h" +#include "CUDACore/eventWorkHasCompleted.h" +#include "CUDACore/GenericCachingAllocator.h" +#include "CUDACore/ScopedSetDevice.h" namespace cms::cuda::allocator { // Use caching or not @@ -50,7 +53,64 @@ namespace cms::cuda::allocator { return ret; } - inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + struct DeviceTraits { + using DeviceType = int; + using QueueType = cudaStream_t; + using EventType = cudaEvent_t; + + static constexpr DeviceType kInvalidDevice = -1; + + static DeviceType currentDevice() { return cms::cuda::currentDevice(); } + + static cms::cuda::ScopedSetDevice setDevice(DeviceType device) { return cms::cuda::ScopedSetDevice(device); } + + static bool canReuseInDevice(DeviceType a, DeviceType b) { return a == b; } + + static bool canReuseInQueue(QueueType a, QueueType b) { return a == b; } + + static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); } + + static EventType createEvent() { + EventType e; + cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + return e; + } + + static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); } + + static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) { + throw std::runtime_error("CUDADeviceTraits::recreateEvent() should never be called"); + } + + static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); } + + static std::ostream& printDevice(std::ostream& os, DeviceType device) { + os << "Device " << device; + return os; + } + + static void* allocate(size_t bytes) { + void* ptr; + cudaCheck(cudaMalloc(&ptr, bytes)); + return ptr; + } + + static void* tryAllocate(size_t bytes) { + void* ptr; + auto error = cudaMalloc(&ptr, bytes); + if (error == cudaErrorMemoryAllocation) { + return nullptr; + } + cudaCheck(error); + return ptr; + } + + static void free(void* ptr) { cudaCheck(cudaFree(ptr)); } + }; + + using CachingDeviceAllocator = GenericCachingAllocator; + + inline CachingDeviceAllocator& getCachingDeviceAllocator() { if (debug) { std::cout << "cub::CachingDeviceAllocator settings\n" << " bin growth " << binGrowth << "\n" @@ -58,7 +118,7 @@ namespace cms::cuda::allocator { << " max bin " << maxBin << "\n" << " resulting bins:\n"; for (auto bin = minBin; bin <= maxBin; ++bin) { - auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + auto binSize = ::allocator::intPow(binGrowth, bin); if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { @@ -73,12 +133,7 @@ namespace cms::cuda::allocator { } // the public interface is thread safe - static notcub::CachingDeviceAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + static CachingDeviceAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug}; return allocator; } } // namespace cms::cuda::allocator From 8529091fff37c2ecb30bee06623e89e3187ab350 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 4 Nov 2020 23:19:10 +0100 Subject: [PATCH 2/4] Use GenericCachingAllocator for pinned host memory --- .../CUDACore/GenericCachingAllocator.h | 93 ++----------------- src/cudadev/CUDACore/allocate_host.cc | 4 +- .../CUDACore/getCachingHostAllocator.h | 85 +++++++++++++++-- 3 files changed, 84 insertions(+), 98 deletions(-) diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h index 32584f055..a90a8acad 100644 --- a/src/cudadev/CUDACore/GenericCachingAllocator.h +++ b/src/cudadev/CUDACore/GenericCachingAllocator.h @@ -15,87 +15,6 @@ // Inspired by cub::CachingDeviceAllocator -/* -struct CUDAHostTraits { - using DeviceType = int; - using QueueType = cudaStream_t; - using EventType = cudaEvent_t; - struct Dummy {}; - - constexpr DeviceType kInvalidDevice = -1; - - static DeviceType currentDevice() { - return cms::cuda::currentDevice(); - } - - static Dummy setDevice(DeviceType device) { - return {}; - } - - static bool canReuseInDevice(DeviceType a, DeviceType b) { - // Pinned host memory can be reused in any device, but in case of - // changes the event must be re-created - return true; - } - - static bool canReuseInQueue(QueueType a, QueueType b) { - // For pinned host memory a freed block without completed event - // can not be re-used even for operations in the same queue - return false; - } - - static bool eventWorkHasCompleted(EventType e) { - return cms::cuda::eventWorkHasCompleted(e); - } - - static EventType createEvent() { - EventType e; - cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); - return e; - } - - static void destroyEvent(EventType e) { - cudaCheck(cudaEventDestroy(e)); - } - - static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) { - cudaCheck(cudaSetDevice(prev)); - destroyEvent(e); - cudaCheck(cudaSetDevice(next)); - return createEvent(); - } - - static EventType recordEvent(EventType e, QueueType queue) { - cudaCheck(cudaEventRecord(e, queue)); - } - - static std::ostream& printDevice(std::ostream& os, DeviceType dev) { - os << "Host"; - return os; - } - - static void* allocate(size_t bytes) { - void* ptr; - cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault)); - return ptr; - } - - static void* tryAllocate(size_t bytes) { - void* ptr; - auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault); - if (error == cudaErrorMemoryAllocation) { - return nullptr; - } - cudaCheck(error); - return ptr; - } - - static void free(void* ptr) { - cudaCheck(cudaFreeHost(ptr)); - } -} -*/ - namespace allocator { inline unsigned int intPow(unsigned int base, unsigned int exp) { unsigned int ret = 1; @@ -183,7 +102,7 @@ class GenericCachingAllocator { // allocate if necessary if (searchKey.ptr == nullptr) { - auto scopedSetDevice = Traits::setDevice(device); + [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device); searchKey.ptr = Traits::tryAllocate(searchKey.bytes); if (searchKey.ptr == nullptr) { @@ -233,7 +152,7 @@ class GenericCachingAllocator { searchKey.device = device; searchKey.ptr = ptr; - auto scopedSetDevice = Traits::setDevice(device); + [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(device); { std::scoped_lock lock(mutex_); @@ -385,7 +304,7 @@ class GenericCachingAllocator { while (not cachedBlocks_.empty()) { auto iBlock = cachedBlocks_.begin(); - auto scopedSetDevice = Traits::setDevice(iBlock->device); + [[maybe_unused]] auto scopedSetDevice = Traits::setDevice(iBlock->device); Traits::free(iBlock->ptr); Traits::destroyEvent(iBlock->readyEvent); cachedBytes_[iBlock->device].free -= iBlock->bytes; @@ -393,9 +312,9 @@ class GenericCachingAllocator { if (debug_) { std::cout << "\t"; Traits::printDevice(std::cout, iBlock->device) - << " freed " << iBlock->bytes << " bytes.\n\t\t " << (cachedBlocks_.size()-1) << " available blocks cached (" - << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() << " live blocks (" - << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl; + << " freed " << iBlock->bytes << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) + << " available blocks cached (" << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() + << " live blocks (" << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl; } cachedBlocks_.erase(iBlock); diff --git a/src/cudadev/CUDACore/allocate_host.cc b/src/cudadev/CUDACore/allocate_host.cc index a03121918..ff58e98a2 100644 --- a/src/cudadev/CUDACore/allocate_host.cc +++ b/src/cudadev/CUDACore/allocate_host.cc @@ -17,7 +17,7 @@ namespace cms::cuda { throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); } - cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream)); + ptr = allocator::getCachingHostAllocator().allocate(allocator::HostTraits::kHostDevice, nbytes, stream); } else { cudaCheck(cudaMallocHost(&ptr, nbytes)); } @@ -26,7 +26,7 @@ namespace cms::cuda { void free_host(void *ptr) { if constexpr (allocator::policy == allocator::Policy::Caching) { - cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr)); + allocator::getCachingHostAllocator().free(allocator::HostTraits::kHostDevice, ptr); } else { cudaCheck(cudaFreeHost(ptr)); } diff --git a/src/cudadev/CUDACore/getCachingHostAllocator.h b/src/cudadev/CUDACore/getCachingHostAllocator.h index d29080795..a6524bedd 100644 --- a/src/cudadev/CUDACore/getCachingHostAllocator.h +++ b/src/cudadev/CUDACore/getCachingHostAllocator.h @@ -5,12 +5,84 @@ #include #include "CUDACore/cudaCheck.h" -#include "CachingHostAllocator.h" #include "getCachingDeviceAllocator.h" namespace cms::cuda::allocator { - inline notcub::CachingHostAllocator& getCachingHostAllocator() { + struct HostTraits { + using DeviceType = int; + using QueueType = cudaStream_t; + using EventType = cudaEvent_t; + struct Dummy {}; + + static constexpr DeviceType kInvalidDevice = -1; + static constexpr DeviceType kHostDevice = 0; + + static DeviceType currentDevice() { return cms::cuda::currentDevice(); } + + static Dummy setDevice(DeviceType device) { return {}; } + + static bool canReuseInDevice(DeviceType a, DeviceType b) { + // Pinned host memory can be reused in any device, but in case of + // changes the event must be re-created + return true; + } + + static bool canReuseInQueue(QueueType a, QueueType b) { + // For pinned host memory a freed block without completed event + // can not be re-used even for operations in the same queue + return false; + } + + static bool eventWorkHasCompleted(EventType e) { return cms::cuda::eventWorkHasCompleted(e); } + + static EventType createEvent() { + EventType e; + cudaCheck(cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + return e; + } + + static void destroyEvent(EventType e) { cudaCheck(cudaEventDestroy(e)); } + + static EventType recreateEvent(EventType e, DeviceType prev, DeviceType next) { + cudaCheck(cudaSetDevice(prev)); + destroyEvent(e); + cudaCheck(cudaSetDevice(next)); + return createEvent(); + } + + static EventType recordEvent(EventType e, QueueType queue) { + cudaCheck(cudaEventRecord(e, queue)); + return e; + } + + static std::ostream& printDevice(std::ostream& os, DeviceType dev) { + os << "Host"; + return os; + } + + static void* allocate(size_t bytes) { + void* ptr; + cudaCheck(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault)); + return ptr; + } + + static void* tryAllocate(size_t bytes) { + void* ptr; + auto error = cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault); + if (error == cudaErrorMemoryAllocation) { + return nullptr; + } + cudaCheck(error); + return ptr; + } + + static void free(void* ptr) { cudaCheck(cudaFreeHost(ptr)); } + }; + + using CachingHostAllocator = GenericCachingAllocator; + + inline CachingHostAllocator& getCachingHostAllocator() { if (debug) { std::cout << "cub::CachingHostAllocator settings\n" << " bin growth " << binGrowth << "\n" @@ -18,7 +90,7 @@ namespace cms::cuda::allocator { << " max bin " << maxBin << "\n" << " resulting bins:\n"; for (auto bin = minBin; bin <= maxBin; ++bin) { - auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + auto binSize = ::allocator::intPow(binGrowth, bin); if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n"; } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { @@ -33,12 +105,7 @@ namespace cms::cuda::allocator { } // the public interface is thread safe - static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + static CachingHostAllocator allocator{binGrowth, minBin, maxBin, minCachedBytes(), debug}; return allocator; } } // namespace cms::cuda::allocator From 03e690b865b4b4ba066648f2fa9e5cf26e4a52dc Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 1 Sep 2021 09:13:13 -0700 Subject: [PATCH 3/4] Fix comment --- src/cudadev/CUDACore/GenericCachingAllocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h index a90a8acad..36a931d47 100644 --- a/src/cudadev/CUDACore/GenericCachingAllocator.h +++ b/src/cudadev/CUDACore/GenericCachingAllocator.h @@ -76,7 +76,7 @@ class GenericCachingAllocator { return cachedBytes_; } - // Allocate given number of bytes on the current device associated to queue + // Allocate given number of bytes on the given device associated to given queue void* allocate(DeviceType device, size_t bytes, QueueType queue) { if (bytes > maxBinBytes_) { throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + From fd96d4e20fd5c3211c2a43b70112bdb177e92e21 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 1 Sep 2021 10:10:50 -0700 Subject: [PATCH 4/4] Improve device information printing --- .../CUDACore/GenericCachingAllocator.h | 79 +++++++++---------- .../CUDACore/getCachingDeviceAllocator.h | 16 +++- .../CUDACore/getCachingHostAllocator.h | 14 +++- 3 files changed, 58 insertions(+), 51 deletions(-) diff --git a/src/cudadev/CUDACore/GenericCachingAllocator.h b/src/cudadev/CUDACore/GenericCachingAllocator.h index 36a931d47..607472485 100644 --- a/src/cudadev/CUDACore/GenericCachingAllocator.h +++ b/src/cudadev/CUDACore/GenericCachingAllocator.h @@ -108,10 +108,9 @@ class GenericCachingAllocator { if (searchKey.ptr == nullptr) { // The allocation attempt failed: free all cached blocks on device and retry if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " failed to allocate " << searchKey.bytes << " bytes for queue " << searchKey.associatedQueue - << ", retrying after freeing cached allocations" << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " failed to allocate " << searchKey.bytes + << " bytes for queue " << searchKey.associatedQueue << ", retrying after freeing cached allocations" + << std::endl; } freeCachedBlocksOnDevice(device); @@ -129,10 +128,9 @@ class GenericCachingAllocator { } if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " allocated new block at " << searchKey.ptr << " (" << searchKey.bytes << " bytes associated with queue " - << searchKey.associatedQueue << ", event " << searchKey.readyEvent << "." << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " allocated new block at " << searchKey.ptr << " (" + << searchKey.bytes << " bytes associated with queue " << searchKey.associatedQueue << ", event " + << searchKey.readyEvent << "." << std::endl; } } @@ -174,14 +172,13 @@ class GenericCachingAllocator { cachedBytes_[device].free += searchKey.bytes; if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " returned " << searchKey.bytes << " bytes at " << ptr << " from associated queue " - << searchKey.associatedQueue << " , event " << searchKey.readyEvent << " .\n\t\t " << cachedBlocks_.size() - << " available " - "blocks cached (" - << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. (" - << cachedBytes_[device].live << " bytes)" << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " returned " << searchKey.bytes << " bytes at " << ptr + << " from associated queue " << searchKey.associatedQueue << " , event " << searchKey.readyEvent + << " .\n\t\t " << cachedBlocks_.size() + << " available " + "blocks cached (" + << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks outstanding. (" + << cachedBytes_[device].live << " bytes)" << std::endl; } } @@ -194,14 +191,13 @@ class GenericCachingAllocator { Traits::free(ptr); Traits::destroyEvent(searchKey.readyEvent); if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " freed " << searchKey.bytes << " bytes at " << ptr << " from associated queue " - << searchKey.associatedQueue << ", event " << searchKey.readyEvent << ".\n\t\t " << cachedBlocks_.size() - << " available " - "blocks cached (" - << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks (" - << cachedBytes_[device].live << " bytes) outstanding." << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " freed " << searchKey.bytes << " bytes at " << ptr + << " from associated queue " << searchKey.associatedQueue << ", event " << searchKey.readyEvent + << ".\n\t\t " << cachedBlocks_.size() + << " available " + "blocks cached (" + << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() << " live blocks (" + << cachedBytes_[device].live << " bytes) outstanding." << std::endl; } } } @@ -257,13 +253,12 @@ class GenericCachingAllocator { cachedBytes_[device].live += searchKey.bytesRequested; if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " reused cached block at " << searchKey.ptr << " (" << searchKey.bytes << "bytes) for queue " - << searchKey.associatedQueue << ", event " << searchKey.readyEvent - << " (previously " - "associated with stream " - << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " reused cached block at " << searchKey.ptr << " (" + << searchKey.bytes << "bytes) for queue " << searchKey.associatedQueue << ", event " + << searchKey.readyEvent + << " (previously " + "associated with stream " + << iBlock->associatedQueue << " , event " << iBlock->readyEvent << ")." << std::endl; } cachedBlocks_.erase(iBlock); @@ -286,13 +281,12 @@ class GenericCachingAllocator { cachedBytes_[device].free -= iBlock->bytes; if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, device) - << " freed " << iBlock->bytes << " bytes.\n\t\t " << cachedBlocks_.size() << " available blocks cached (" - << cachedBytes_[device].free << " bytes), " << liveBlocks_.size() - << " live blocks " - "(" - << cachedBytes_[device].live << " bytes) outstanding." << std::endl; + std::cout << "\t" << Traits::printDevice(device) << " freed " << iBlock->bytes << " bytes.\n\t\t " + << cachedBlocks_.size() << " available blocks cached (" << cachedBytes_[device].free << " bytes), " + << liveBlocks_.size() + << " live blocks " + "(" + << cachedBytes_[device].live << " bytes) outstanding." << std::endl; } iBlock = cachedBlocks_.erase(iBlock); @@ -310,11 +304,10 @@ class GenericCachingAllocator { cachedBytes_[iBlock->device].free -= iBlock->bytes; if (debug_) { - std::cout << "\t"; - Traits::printDevice(std::cout, iBlock->device) - << " freed " << iBlock->bytes << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) - << " available blocks cached (" << cachedBytes_[iBlock->device].free << " bytes), " << liveBlocks_.size() - << " live blocks (" << cachedBytes_[iBlock->device].live << " bytes) outstanding." << std::endl; + std::cout << "\t" << Traits::printDevice(iBlock->device) << " freed " << iBlock->bytes << " bytes.\n\t\t " + << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_[iBlock->device].free + << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_[iBlock->device].live + << " bytes) outstanding." << std::endl; } cachedBlocks_.erase(iBlock); diff --git a/src/cudadev/CUDACore/getCachingDeviceAllocator.h b/src/cudadev/CUDACore/getCachingDeviceAllocator.h index 190d0991a..62471b453 100644 --- a/src/cudadev/CUDACore/getCachingDeviceAllocator.h +++ b/src/cudadev/CUDACore/getCachingDeviceAllocator.h @@ -84,10 +84,13 @@ namespace cms::cuda::allocator { static void recordEvent(EventType e, QueueType queue) { cudaCheck(cudaEventRecord(e, queue)); } - static std::ostream& printDevice(std::ostream& os, DeviceType device) { - os << "Device " << device; - return os; - } + struct DevicePrinter { + DevicePrinter(DeviceType device) : device_(device) {} + void write(std::ostream& os) const { os << "Device " << device_; } + DeviceType device_; + }; + + static DevicePrinter printDevice(DeviceType device) { return DevicePrinter(device); } static void* allocate(size_t bytes) { void* ptr; @@ -108,6 +111,11 @@ namespace cms::cuda::allocator { static void free(void* ptr) { cudaCheck(cudaFree(ptr)); } }; + inline std::ostream& operator<<(std::ostream& os, DeviceTraits::DevicePrinter const& pr) { + pr.write(os); + return os; + } + using CachingDeviceAllocator = GenericCachingAllocator; inline CachingDeviceAllocator& getCachingDeviceAllocator() { diff --git a/src/cudadev/CUDACore/getCachingHostAllocator.h b/src/cudadev/CUDACore/getCachingHostAllocator.h index a6524bedd..0692c1686 100644 --- a/src/cudadev/CUDACore/getCachingHostAllocator.h +++ b/src/cudadev/CUDACore/getCachingHostAllocator.h @@ -56,10 +56,11 @@ namespace cms::cuda::allocator { return e; } - static std::ostream& printDevice(std::ostream& os, DeviceType dev) { - os << "Host"; - return os; - } + struct DevicePrinter { + static void write(std::ostream& os) { os << "Host"; } + }; + + static DevicePrinter printDevice(DeviceType dev) { return DevicePrinter(); } static void* allocate(size_t bytes) { void* ptr; @@ -80,6 +81,11 @@ namespace cms::cuda::allocator { static void free(void* ptr) { cudaCheck(cudaFreeHost(ptr)); } }; + inline std::ostream& operator<<(std::ostream& os, HostTraits::DevicePrinter const& pr) { + pr.write(os); + return os; + } + using CachingHostAllocator = GenericCachingAllocator; inline CachingHostAllocator& getCachingHostAllocator() {