Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add public interface for constructing and freeing caching allocators #34958

Merged
merged 1 commit into from
Aug 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
// read the current device
Expand Down Expand Up @@ -300,8 +299,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) {

// Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
if constexpr (cms::cuda::allocator::useCaching) {
cms::cuda::allocator::getCachingDeviceAllocator();
cms::cuda::allocator::getCachingHostAllocator();
cms::cuda::allocator::cachingAllocatorsConstruct();
}
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();
Expand All @@ -319,8 +317,7 @@ CUDAService::~CUDAService() {
if (enabled_) {
// Explicitly destruct the allocator before the device resets below
if constexpr (cms::cuda::allocator::useCaching) {
cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
cms::cuda::allocator::cachingAllocatorsFreeCached();
}
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();
Expand Down
13 changes: 13 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
#define HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h

namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;

// these intended to be called only from CUDAService
void cachingAllocatorsConstruct();
void cachingAllocatorsFreeCached();
} // namespace cms::cuda::allocator

#endif
42 changes: 42 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
#define HeterogeneousCore_CUDACore_src_cachingAllocatorCommon

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"

#include <algorithm>
#include <limits>

namespace cms::cuda::allocator {
// Growth factor (bin_growth in cub::CachingDeviceAllocator
constexpr unsigned int binGrowth = 2;
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
constexpr unsigned int minBin = 8;
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 30;
// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
constexpr double maxCachedFraction = 0.8;
constexpr bool debug = false;

inline size_t minCachedBytes() {
size_t ret = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
const int numberOfDevices = deviceCount();
for (int i = 0; i < numberOfDevices; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
}
cudaCheck(cudaSetDevice(currentDevice));
if (maxCachedBytes > 0) {
ret = std::min(ret, maxCachedBytes);
}
return ret;
}
} // namespace cms::cuda::allocator

#endif
16 changes: 16 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "getCachingDeviceAllocator.h"
#include "getCachingHostAllocator.h"

namespace cms::cuda::allocator {
void cachingAllocatorsConstruct() {
cms::cuda::allocator::getCachingDeviceAllocator();
cms::cuda::allocator::getCachingHostAllocator();
}

void cachingAllocatorsFreeCached() {
cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
}
} // namespace cms::cuda::allocator
36 changes: 3 additions & 33 deletions HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,14 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "CachingDeviceAllocator.h"
#include "cachingAllocatorCommon.h"

#include <iomanip>

namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;
// Growth factor (bin_growth in cub::CachingDeviceAllocator
constexpr unsigned int binGrowth = 2;
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
constexpr unsigned int minBin = 8;
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 30;
// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
constexpr double maxCachedFraction = 0.8;
constexpr bool debug = false;

inline size_t minCachedBytes() {
size_t ret = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
const int numberOfDevices = deviceCount();
for (int i = 0; i < numberOfDevices; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
}
cudaCheck(cudaSetDevice(currentDevice));
if (maxCachedBytes > 0) {
ret = std::min(ret, maxCachedBytes);
}
return ret;
}

inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
LogDebug("CachingDeviceAllocator").log([](auto& log) {
log << "cub::CachingDeviceAllocator settings\n"
Expand Down
6 changes: 4 additions & 2 deletions HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "CachingHostAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "getCachingDeviceAllocator.h"
#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
#include "cachingAllocatorCommon.h"

#include <iomanip>

Expand Down