diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
index 575fcf63b8eaa..5136af4724736 100644
--- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -4,6 +4,6 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 
 BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
-  data_d_ = cms::cuda::make_device_unique<Data>(stream);
+  data_d_ = cms::cuda::make_device_unique<Data>();
   cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
 }
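The hunk above is the first consumer of the stream-less allocation introduced later in this patch: the device-side Data block is now owned by the BeamSpotCUDA object itself rather than tied to the stream. A minimal sketch of the two flavors, assuming the interfaces added below (names taken from this patch):

    // stream-ordered: once freed, the block becomes reusable only after all
    // work queued in `stream` before the free has completed
    auto a = cms::cuda::make_device_unique<float[]>(1024, stream);
    // stream-less: no device-side ownership, reusable immediately on free
    auto b = cms::cuda::make_device_unique<float[]>(1024);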
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 6fec0026dfaa1..4a22cdd23a43b 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -56,29 +56,19 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = cms::cuda::device::unique_ptr<T>;
 
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_host_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_device_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
   };
 
@@ -86,24 +76,19 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = cms::cuda::host::unique_ptr<T>;
 
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
     }
 
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
+    template <typename T, typename... Args>
+    static auto make_host_unique(Args &&... args) {
+      return cms::cuda::make_host_unique<T>(std::forward<Args>(args)...);
    }
 
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
+    template <typename T, typename... Args>
+    static auto make_device_unique(Args &&... args) {
+      return cms::cuda::make_device_unique<T>(std::forward<Args>(args)...);
     }
   };
 
@@ -111,26 +96,46 @@ namespace cudaCompat {
     template <typename T>
     using unique_ptr = std::unique_ptr<T>;
 
+    template <typename T>
+    static auto make_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
+    template <typename T>
+    static auto make_unique(size_t size) {
+      return std::make_unique<T>(size);
+    }
     template <typename T>
     static auto make_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
     }
 
+    template <typename T>
+    static auto make_host_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_host_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
 
+    template <typename T>
+    static auto make_device_unique() {
+      return std::make_unique<T>();
+    }
     template <typename T>
     static auto make_device_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
+    template <typename T>
+    static auto make_device_unique(size_t size) {
+      return std::make_unique<T>(size);
+    }
     template <typename T>
     static auto make_device_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
@@ -146,13 +151,12 @@ class HeterogeneousSoAImpl {
   template <typename V>
   using unique_ptr = typename Traits::template unique_ptr<V>;
 
-  HeterogeneousSoAImpl() = default;  // make root happy
+  HeterogeneousSoAImpl();
   ~HeterogeneousSoAImpl() = default;
   HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
   HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
 
   explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
 
   T const *get() const { return m_ptr.get(); }
 
@@ -165,8 +169,8 @@ class HeterogeneousSoAImpl {
 };
 
 template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
+HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl() {
+  m_ptr = Traits::template make_unique<T>();
 }
 
 // in reality valid only for GPU version...
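The variadic rewrite above collapses the per-arity trait overloads into single perfect-forwarding entry points, so both the stream-ordered and the new stream-less allocations are reachable through the same trait. A short sketch of the call sites this enables (Histo is a placeholder type; n and stream are assumed in scope):

    using Traits = cudaCompat::GPUTraits;
    auto h = Traits::make_device_unique<Histo>();             // scalar, stream-less
    auto v = Traits::make_device_unique<float[]>(n);          // array, stream-less
    auto w = Traits::make_device_unique<float[]>(n, stream);  // array, stream-ordered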
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
index 7bef9d0d8a52f..978b1ae776fa4 100644
--- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
+++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
@@ -5,17 +5,19 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
 SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) {
-  moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
-  clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1, stream);
+  moduleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
+  clusInModule_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
+  moduleId_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters);
+  clusModuleStart_d = cms::cuda::make_device_unique<uint32_t[]>(maxClusters + 1);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
   view->moduleStart_ = moduleStart_d.get();
   view->clusInModule_ = clusInModule_d.get();
   view->moduleId_ = moduleId_d.get();
   view->clusModuleStart_ = clusModuleStart_d.get();
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
+  view_d = cms::cuda::make_device_unique<DeviceConstView>();
   cms::cuda::copyAsync(view_d, view, stream);
 }
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
index ffef71092f6c9..abf5124f80db4 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -9,11 +9,13 @@
 SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream)
     : formatterErrors_h(std::move(errors)) {
-  error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
-  data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
+  error_d = cms::cuda::make_device_unique<GPU::SimpleVector<PixelErrorCompact>>();
+  data_d = cms::cuda::make_device_unique<PixelErrorCompact[]>(maxFedWords);
 
   cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   error_h = cms::cuda::make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
   GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
   assert(error_h->empty());
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
index 664364b6ff25a..656d69d4da5d8 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -5,15 +5,17 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
-  xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream);
-  clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream);
+  xx_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  yy_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  adc_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  moduleInd_d = cms::cuda::make_device_unique<uint16_t[]>(maxFedWords);
+  clus_d = cms::cuda::make_device_unique<int32_t[]>(maxFedWords);
 
-  pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
-  rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream);
+  pdigi_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
+  rawIdArr_d = cms::cuda::make_device_unique<uint32_t[]>(maxFedWords);
 
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
   view->xx_ = xx_d.get();
   view->yy_ = yy_d.get();
@@ -21,7 +23,7 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) {
   view->moduleInd_ = moduleInd_d.get();
   view->clus_ = clus_d.get();
 
-  view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
+  view_d = cms::cuda::make_device_unique<DeviceConstView>();
   cms::cuda::copyAsync(view_d, view, stream);
 }
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 955f97ca6bd54..1221db3219ef8 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -73,11 +73,13 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
                                                                      uint32_t const* hitsModuleStart,
                                                                      cudaStream_t stream)
     : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
+  // device-side ownership to guarantee that the host memory is alive
+  // until the copy finishes
   auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
 
   view->m_nHits = nHits;
-  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
-  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+  m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>();
+  m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>();
   view->m_averageGeometry = m_AverageGeometryStore.get();
   view->m_cpeParams = cpeParams;
   view->m_hitsModuleStart = hitsModuleStart;
@@ -101,9 +103,9 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
   // if ordering is relevant they may have to be stored phi-ordered by layer or so
   // this will break 1to1 correspondence with cluster and module locality
   // so unless proven VERY inefficient we keep it ordered as generated
-  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
-  m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);
+  m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16);
+  m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11);
+  m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>();
 
   auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
   auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
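The four data formats above all follow the same staging pattern, shown here in isolation (View stands in for the respective DeviceConstView/SOA view type):

    auto view = cms::cuda::make_host_unique<View>(stream);  // stream-ordered pinned host block
    // ... fill *view with device pointers ...
    view_d = cms::cuda::make_device_unique<View>();         // stream-less, owned by the data format
    cms::cuda::copyAsync(view_d, view, stream);
    // `view` may be destroyed right away: its stream association defers reuse
    // of the pinned block until the queued copy has finished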
diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
index 9c271fc2fbff1..1cc014978ecb1 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h
@@ -6,6 +6,14 @@ namespace cms {
   namespace cuda {
     // Allocate device memory
+    // This variant does not create device-side ownership
+    void *allocate_device(int dev, size_t nbytes);
+
+    // Allocate device memory
+    // This variant creates device-side ownership. When freed, all work
+    // in the stream up to the freeing point must be finished for the
+    // memory block to be considered free (except for new allocation in
+    // the same stream)
     void *allocate_device(int dev, size_t nbytes, cudaStream_t stream);
 
     // Free device memory (to be called from unique_ptr)
diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
index 1bba4580028d3..7e6f87bd9b04e 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h
@@ -6,6 +6,14 @@ namespace cms {
   namespace cuda {
     // Allocate pinned host memory (to be called from unique_ptr)
+    // This variant does not create device-side ownership
+    void *allocate_host(size_t nbytes);
+
+    // Allocate pinned host memory (to be called from unique_ptr)
+    // This variant creates device-side ownership. When freed, all work
+    // in the stream up to the freeing point must be finished for the
+    // memory block to be considered free (except for new allocation in
+    // the same stream)
     void *allocate_host(size_t nbytes, cudaStream_t stream);
 
     // Free pinned host memory (to be called from unique_ptr)
diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
index eb86c05be465c..756991c323de0 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h
@@ -48,6 +48,16 @@ namespace cms {
       }  // namespace impl
     }    // namespace device
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the device memory is not supported");
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, sizeof(T));
+      return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                              device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
       static_assert(std::is_trivially_constructible<T>::value,
@@ -58,6 +68,17 @@ namespace cms {
     }
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the device memory is not supported");
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, n * sizeof(element_type));
+      return typename device::impl::make_device_unique_selector<T>::unbounded_array{
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
                                                                                               cudaStream_t stream) {
@@ -74,6 +95,14 @@ namespace cms {
     typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;
 
     // No check for the trivial constructor, make it clear in the interface
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized() {
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, sizeof(T));
+      return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
+                                                                              device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
         cudaStream_t stream) {
@@ -83,6 +112,15 @@ namespace cms {
     }
 
+    template <typename T>
+    typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      int dev = currentDevice();
+      void *mem = allocate_device(dev, n * sizeof(element_type));
+      return typename device::impl::make_device_unique_selector<T>::unbounded_array{
+          reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{dev}};
+    }
+
     template <typename T>
     typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
         size_t n, cudaStream_t stream) {
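The new stream-less make_device_unique overloads mirror the stream-ordered ones except for calling the two-argument allocate_device; the selector still dispatches scalars, unbounded arrays, and (deleted) bounded arrays. A usage sketch under those rules:

    auto s = cms::cuda::make_device_unique<double>();      // non_array overload
    auto a = cms::cuda::make_device_unique<double[]>(16);  // unbounded_array overload
    // make_device_unique<double[16]>() stays deleted (bounded_array), and a type
    // with a non-trivial constructor still trips the static_assert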
diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
index a64b7c9869d6c..d408fd7081c4d 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
@@ -37,6 +37,14 @@ namespace cms {
     }  // namespace host
 
     // Allocate pinned host memory
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique() {
+      static_assert(std::is_trivially_constructible<T>::value,
+                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      void *mem = allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
       static_assert(std::is_trivially_constructible<T>::value,
@@ -45,6 +53,15 @@ namespace cms {
     }
 
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      static_assert(std::is_trivially_constructible<element_type>::value,
+                    "Allocating with non-trivial constructor on the pinned host memory is not supported");
+      void *mem = allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
       using element_type = typename std::remove_extent<T>::type;
@@ -58,12 +75,25 @@ namespace cms {
     typename host::impl::make_host_unique_selector<T>::bounded_array make_host_unique(Args &&...) = delete;
 
     // No check for the trivial constructor, make it clear in the interface
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized() {
+      void *mem = allocate_host(sizeof(T));
+      return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::non_array make_host_unique_uninitialized(cudaStream_t stream) {
       void *mem = allocate_host(sizeof(T), stream);
       return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
     }
 
+    template <typename T>
+    typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique_uninitialized(size_t n) {
+      using element_type = typename std::remove_extent<T>::type;
+      void *mem = allocate_host(n * sizeof(element_type));
+      return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
+    }
+
     template <typename T>
     typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique_uninitialized(
         size_t n, cudaStream_t stream) {
diff --git a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
index 075d568f21039..e884ae2039cff 100644
--- a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h
@@ -126,19 +126,29 @@ namespace notcub {
       int device;                      // device ordinal
       cudaStream_t associated_stream;  // Associated associated_stream
       cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
+      bool has_stream;  // Whether or not the block has an associated stream (CMS: support for no stream association)
 
       // Constructor (suitable for searching maps for a specific block, given its pointer and device)
+      // CMS: add initialization of has_stream
       BlockDescriptor(void *d_ptr, int device)
-          : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(nullptr), ready_event(nullptr) {}
+          : d_ptr(d_ptr),
+            bytes(0),
+            bin(INVALID_BIN),
+            device(device),
+            associated_stream(nullptr),
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
+      // CMS: add initialization of has_stream
       BlockDescriptor(int device)
           : d_ptr(nullptr),
             bytes(0),
             bin(INVALID_BIN),
             device(device),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Comparison functor for comparing device pointers
       static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
@@ -321,9 +331,10 @@ namespace notcub {
      * streams when all prior work submitted to \p active_stream has completed.
      */
     cudaError_t DeviceAllocate(
-        int device,    ///< [in] Device on which to place the allocation
-        void **d_ptr,  ///< [out] Reference to pointer to the allocation
-        size_t bytes,  ///< [in] Minimum number of bytes for the allocation
+        int device,       ///< [in] Device on which to place the allocation
+        void **d_ptr,     ///< [out] Reference to pointer to the allocation
+        size_t bytes,     ///< [in] Minimum number of bytes for the allocation
+        bool has_stream,  ///< [in] Whether or not to associate the stream with this allocation (CMS: support for no stream association)
         cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
       *d_ptr = nullptr;
@@ -339,6 +350,7 @@ namespace notcub {
       // Create a block descriptor for the requested allocation
       bool found = false;
       BlockDescriptor search_key(device);
+      search_key.has_stream = has_stream;  // CMS: support for no stream association
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
 
@@ -364,12 +376,14 @@ namespace notcub {
              (block_itr->bin == search_key.bin)) {
         // To prevent races with reusing blocks returned by the host but still
         // in use by the device, only consider cached blocks that are
-        // either (from the active stream) or (from an idle stream)
-        if ((active_stream == block_itr->associated_stream) ||
+        // either (not associated to a stream) or (from the active stream) or (from an idle stream)
+        // CMS: do stream association check only if the block is associated to a stream
+        if ((!block_itr->has_stream) || (has_stream && active_stream == block_itr->associated_stream) ||
             (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
           // Reuse existing cache block. Insert into live blocks.
           found = true;
           search_key = *block_itr;
+          search_key.has_stream = has_stream;  // CMS: support for no stream association
           search_key.associated_stream = active_stream;
           live_blocks.insert(search_key);
 
@@ -377,18 +391,35 @@ namespace notcub {
           cached_bytes[device].free -= search_key.bytes;
           cached_bytes[device].live += search_key.bytes;
 
-          if (debug)
+          if (debug) {
             // CMS: improved debug message
-            _CubLog(
-                "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
-                "associated with stream %lld, event %lld).\n",
-                device,
-                search_key.d_ptr,
-                (long long)search_key.bytes,
-                (long long)search_key.associated_stream,
-                (long long)search_key.ready_event,
-                (long long)block_itr->associated_stream,
-                (long long)block_itr->ready_event);
+            // CMS: support for no stream association
+            if (search_key.has_stream) {
+              _CubLog(
+                  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
+                  "associated %d with stream %lld, event %lld).\n",
+                  device,
+                  search_key.d_ptr,
+                  (long long)search_key.bytes,
+                  (long long)search_key.associated_stream,
+                  (long long)search_key.ready_event,
+                  block_itr->has_stream,
+                  (long long)block_itr->associated_stream,
+                  (long long)block_itr->ready_event);
+            } else {
+              _CubLog(
+                  "\tDevice %d reused cached block at %p (%lld bytes) without stream association, event %lld "
+                  "(previously "
+                  "associated %d with stream %lld, event %lld).\n",
+                  device,
+                  search_key.d_ptr,
+                  (long long)search_key.bytes,
+                  (long long)search_key.ready_event,
+                  block_itr->has_stream,
+                  (long long)block_itr->associated_stream,
+                  (long long)block_itr->ready_event);
+            }
+          }
 
           cached_blocks.erase(block_itr);
 
@@ -414,12 +445,23 @@ namespace notcub {
       // Attempt to allocate
       if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
         // The allocation attempt failed: free all cached blocks on device and retry
-        if (debug)
-          _CubLog(
-              "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-              device,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream);
+        if (debug) {
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached "
+                "allocations",
+                device,
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream);
+          } else {
+            _CubLog(
+                "\tDevice %d failed to allocate %lld bytes without stream association, retrying after freeing cached "
+                "allocations",
+                device,
+                (long long)search_key.bytes);
+          }
+        }
 
         error = cudaSuccess;  // Reset the error we will return
         cudaGetLastError();   // Reset CUDART's error
@@ -483,15 +525,26 @@ namespace notcub {
       cached_bytes[device].live += search_key.bytes;
       mutex.Unlock();
 
-      if (debug)
+      if (debug) {
         // CMS: improved debug message
-        _CubLog(
-            "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
-            device,
-            search_key.d_ptr,
-            (long long)search_key.bytes,
-            (long long)search_key.associated_stream,
-            (long long)search_key.ready_event);
+        // CMS: support for no stream association
+        if (search_key.has_stream) {
+          _CubLog(
+              "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
+              device,
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.associated_stream,
+              (long long)search_key.ready_event);
+        } else {
+          _CubLog(
+              "\tDevice %d allocated new device block at %p (%lld bytes without stream association, event %lld).\n",
+              device,
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.ready_event);
+        }
+      }
 
       // Attempt to revert back to previous device if necessary
       if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
@@ -565,20 +618,35 @@ namespace notcub {
         cached_blocks.insert(search_key);
         cached_bytes[device].free += search_key.bytes;
 
-        if (debug)
+        if (debug) {
           // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-              device,
-              (long long)search_key.bytes,
-              d_ptr,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[device].live);
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          } else {
+            _CubLog(
+                "\tDevice %d returned %lld bytes at %p without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          }
+        }
       }
     }
 
@@ -590,7 +658,8 @@ namespace notcub {
         return error;
       }
 
-      if (recached) {
+      // CMS: support for no stream association
+      if (recached && search_key.has_stream) {
         // Insert the ready event in the associated stream (must have current device set properly)
         if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
           return error;
@@ -606,20 +675,35 @@ namespace notcub {
         if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
          return error;
 
-        if (debug)
+        if (debug) {
          // CMS: improved debug message
-          _CubLog(
-              "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              device,
-              (long long)search_key.bytes,
-              d_ptr,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes[device].free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes[device].live);
+          // CMS: support for no stream association
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          } else {
+            _CubLog(
+                "\tDevice %d freed %lld bytes at %p without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device,
+                (long long)search_key.bytes,
+                d_ptr,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes[device].free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes[device].live);
+          }
+        }
       }
 
       // Reset device
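The reuse rule that DeviceAllocate applies after this change can be summarized as follows (a paraphrase of the condition in the hunk above, not code from the patch):

    bool reusable = !block_itr->has_stream  // block was never stream-associated
                    || (has_stream && active_stream == block_itr->associated_stream)  // same stream
                    || cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady;   // stream gone idle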
diff --git a/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
index 53901e1f1cb27..9f5695e6b555f 100644
--- a/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/CachingHostAllocator.h
@@ -127,6 +127,7 @@ namespace notcub {
       int device;                      // device ordinal
       cudaStream_t associated_stream;  // Associated associated_stream
       cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed
+      bool has_stream;  // Whether or not the block has an associated stream
 
       // Constructor (suitable for searching maps for a specific block, given its pointer)
       BlockDescriptor(void *d_ptr)
@@ -135,7 +136,8 @@ namespace notcub {
             bin(INVALID_BIN),
             device(INVALID_DEVICE_ORDINAL),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Constructor (suitable for searching maps for a range of suitable blocks)
       BlockDescriptor()
@@ -144,7 +146,8 @@ namespace notcub {
             bin(INVALID_BIN),
             device(INVALID_DEVICE_ORDINAL),
             associated_stream(nullptr),
-            ready_event(nullptr) {}
+            ready_event(nullptr),
+            has_stream(false) {}
 
       // Comparison functor for comparing host pointers
       static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
@@ -312,6 +315,7 @@ namespace notcub {
     cudaError_t HostAllocate(
         void **d_ptr,     ///< [out] Reference to pointer to the allocation
         size_t bytes,     ///< [in] Minimum number of bytes for the allocation
+        bool has_stream,  ///< [in] Whether or not to associate the stream with this allocation
        cudaStream_t active_stream = nullptr)  ///< [in] The stream to be associated with this allocation
     {
       *d_ptr = nullptr;
@@ -325,6 +329,7 @@ namespace notcub {
       bool found = false;
       BlockDescriptor search_key;
       search_key.device = device;
+      search_key.has_stream = has_stream;
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
 
@@ -347,12 +352,15 @@ namespace notcub {
       // Iterate through the range of cached blocks in the same bin
       CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
       while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
-        // To prevent races with reusing blocks returned by the host but still
-        // in use for transfers, only consider cached blocks that are from an idle stream
-        if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
+        // To prevent races with reusing blocks returned by the host
+        // but still in use for transfers, only consider cached
+        // blocks that are from an idle stream or have no stream
+        // association
+        if ((!block_itr->has_stream) or cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
           // Reuse existing cache block. Insert into live blocks.
           found = true;
           search_key = *block_itr;
+          search_key.has_stream = has_stream;
           search_key.associated_stream = active_stream;
           if (search_key.device != device) {
             // If "associated" device changes, need to re-create the event on the right device
@@ -373,17 +381,29 @@ namespace notcub {
         cached_bytes.free -= search_key.bytes;
         cached_bytes.live += search_key.bytes;
 
-        if (debug)
-          _CubLog(
-              "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
-              "(previously associated with stream %lld, event %lld).\n",
-              search_key.d_ptr,
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)block_itr->associated_stream,
-              (long long)block_itr->ready_event);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
+                "(previously associated with stream %lld, event %lld).\n",
+                search_key.d_ptr,
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)block_itr->associated_stream,
+                (long long)block_itr->ready_event);
+          } else {
+            _CubLog(
+                "\tHost reused cached block at %p (%lld bytes) without stream association "
+                "(previously associated %d with stream %lld, event %lld).\n",
+                search_key.d_ptr,
+                (long long)search_key.bytes,
+                block_itr->has_stream,
+                (long long)block_itr->associated_stream,
+                (long long)block_itr->ready_event);
+          }
+        }
 
         cached_blocks.erase(block_itr);
 
@@ -403,13 +423,21 @@ namespace notcub {
       if (CubDebug(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
           cudaErrorMemoryAllocation) {
         // The allocation attempt failed: free all cached blocks on device and retry
-        if (debug)
-          _CubLog(
-              "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
-              "allocations",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.device);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
+                "allocations",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.device);
+          } else {
+            _CubLog(
+                "\tHost failed to allocate %lld bytes without stream association, retrying after freeing cached "
+                "allocations",
+                (long long)search_key.bytes);
+          }
+        }
 
         error = cudaSuccess;  // Reset the error we will return
         cudaGetLastError();   // Reset CUDART's error
@@ -471,15 +499,22 @@ namespace notcub {
       cached_bytes.live += search_key.bytes;
       mutex.Unlock();
 
-      if (debug)
-        _CubLog(
-            "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
-            "%lld).\n",
-            search_key.d_ptr,
-            (long long)search_key.bytes,
-            (long long)search_key.associated_stream,
-            (long long)search_key.ready_event,
-            (long long)search_key.device);
+      if (debug) {
+        if (search_key.has_stream) {
+          _CubLog(
+              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
+              "%lld).\n",
+              search_key.d_ptr,
+              (long long)search_key.bytes,
+              (long long)search_key.associated_stream,
+              (long long)search_key.ready_event,
+              (long long)search_key.device);
+        } else {
+          _CubLog("\tHost allocated new host block at %p (%lld bytes without stream association).\n",
+                  search_key.d_ptr,
+                  (long long)search_key.bytes);
+        }
+      }
     }
 
     // Copy host pointer to output parameter
@@ -524,18 +559,30 @@ namespace notcub {
         cached_blocks.insert(search_key);
         cached_bytes.free += search_key.bytes;
 
-        if (debug)
-          _CubLog(
-              "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
-              "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
+                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          } else {
+            _CubLog(
+                "\tHost returned %lld bytes without stream association.\n\t\t %lld "
+                "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                (long long)search_key.bytes,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          }
+        }
       }
     }
 
@@ -546,7 +593,7 @@ namespace notcub {
         return error;
       }
 
-      if (recached) {
+      if (recached && search_key.has_stream) {
         // Insert the ready event in the associated stream (must have current device set properly)
         if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
           return error;
@@ -562,18 +609,30 @@ namespace notcub {
        if (CubDebug(error = cudaEventDestroy(search_key.ready_event)))
          return error;
 
-        if (debug)
-          _CubLog(
-              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
-              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-              (long long)search_key.bytes,
-              (long long)search_key.associated_stream,
-              (long long)search_key.ready_event,
-              (long long)search_key.device,
-              (long long)cached_blocks.size(),
-              (long long)cached_bytes.free,
-              (long long)live_blocks.size(),
-              (long long)cached_bytes.live);
+        if (debug) {
+          if (search_key.has_stream) {
+            _CubLog(
+                "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long)search_key.bytes,
+                (long long)search_key.associated_stream,
+                (long long)search_key.ready_event,
+                (long long)search_key.device,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          } else {
+            _CubLog(
+                "\tHost freed %lld bytes without stream association.\n\t\t %lld available "
+                "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long)search_key.bytes,
+                (long long)cached_blocks.size(),
+                (long long)cached_bytes.free,
+                (long long)live_blocks.size(),
+                (long long)cached_bytes.live);
+          }
+        }
       }
 
       // Reset device
diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
index c3a33fcee3553..68870f2bdb91f 100644
--- a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc
@@ -13,6 +13,21 @@ namespace {
 }
 
 namespace cms::cuda {
+  void *allocate_device(int dev, size_t nbytes) {
+    void *ptr = nullptr;
+    if constexpr (allocator::useCaching) {
+      if (UNLIKELY(nbytes > maxAllocationSize)) {
+        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
+                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
+      }
+      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, false));
+    } else {
+      ScopedSetDevice setDeviceForThisScope(dev);
+      cudaCheck(cudaMalloc(&ptr, nbytes));
+    }
+    return ptr;
+  }
+
   void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) {
     void *ptr = nullptr;
     if constexpr (allocator::useCaching) {
@@ -20,7 +35,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
+      cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, true, stream));
     } else {
       ScopedSetDevice setDeviceForThisScope(dev);
       cudaCheck(cudaMalloc(&ptr, nbytes));
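With caching disabled both variants reduce to a plain cudaMalloc, so the has_stream flag only matters for the caching allocator. At the call level the mapping is:

    void *p1 = cms::cuda::allocate_device(dev, nbytes);          // DeviceAllocate(dev, &p1, nbytes, false)
    void *p2 = cms::cuda::allocate_device(dev, nbytes, stream);  // DeviceAllocate(dev, &p2, nbytes, true, stream)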
diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
index 1e8c8f9cd33c5..746e998296865 100644
--- a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc
@@ -12,6 +12,20 @@ namespace {
 }
 
 namespace cms::cuda {
+  void *allocate_host(size_t nbytes) {
+    void *ptr = nullptr;
+    if constexpr (allocator::useCaching) {
+      if (UNLIKELY(nbytes > maxAllocationSize)) {
+        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
+                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
+      }
+      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, false));
+    } else {
+      cudaCheck(cudaMallocHost(&ptr, nbytes));
+    }
+    return ptr;
+  }
+
   void *allocate_host(size_t nbytes, cudaStream_t stream) {
     void *ptr = nullptr;
     if constexpr (allocator::useCaching) {
@@ -19,7 +33,7 @@ namespace cms::cuda {
         throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                  " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
       }
-      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
+      cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, true, stream));
     } else {
       cudaCheck(cudaMallocHost(&ptr, nbytes));
     }
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 53af26ac7527d..dd43c04753fbb 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -549,7 +549,7 @@ namespace pixelgpudetails {
     }
 
     clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream);
-    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
+    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2);
 
     if (wordCounter)  // protect in case of empty event....
     {
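The rule of thumb implied by the updated call sites: keep the stream argument for buffers that may be freed while stream-ordered work on them is still pending, and drop it for buffers owned by an object that outlives that work. nModules_Clusters_h falls in the second category, assuming (not shown in this patch) that the surrounding code synchronizes the stream before reading it. A hedged sketch:

    // owned by the event data product, freed only after the stream's work is done
    auto dev_buf = cms::cuda::make_device_unique<uint32_t[]>(n);
    // short-lived staging buffer, may be freed as soon as the copy is queued
    auto host_buf = cms::cuda::make_host_unique<uint32_t[]>(n, stream);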