Skip to content

Commit

Permalink
Review issues; add is_supported to cuda_malloc_async_memory_resource.
Browse files Browse the repository at this point in the history
Signed-off-by: Michal Zientkiewicz <[email protected]>
  • Loading branch information
mzient committed Jun 12, 2023
1 parent a0c5e48 commit a27223d
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 50 deletions.
15 changes: 15 additions & 0 deletions dali/core/mm/async_pool_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -522,26 +522,41 @@ TEST_F(MMAsyncPoolTest, VM_CrossStreamWithHogs) {
#if CUDA_VERSION >= 11020

// Exercises the cudaMallocAsync-backed resource with many threads hammering a
// single stream; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiThreadedSingleStreamRandom) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiThreadedSingleStreamRandom<cuda_malloc_async_memory_resource>();
}

// Exercises the cudaMallocAsync-backed resource with many threads and many
// streams; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiThreadedMultiStreamRandom) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiThreadedMultiStreamRandom<cuda_malloc_async_memory_resource>();
}

// Multi-stream random alloc/free pattern with GPU-hogging kernels in flight,
// run against the cudaMallocAsync-backed resource; skipped when unsupported.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiStreamRandomWithGPUHogs) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiStreamRandomWithGPUHogs<cuda_malloc_async_memory_resource>();
}

// Cross-stream allocation reuse scenario for the cudaMallocAsync-backed
// resource; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_CrossStream) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->CrossStream<cuda_malloc_async_memory_resource>();
}

// Cross-stream scenario with GPU-hogging work added, run against the
// cudaMallocAsync-backed resource; skipped when unsupported.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_CrossStreamWithHogs) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->CrossStreamWithHogs<cuda_malloc_async_memory_resource>();
}
Expand Down
3 changes: 3 additions & 0 deletions dali/core/mm/default_resources.cc
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ inline std::shared_ptr<device_async_resource> CreateDefaultDeviceResource() {
CUDA_CALL(cudaGetDevice(&device_id));
if (MMEnv::get().use_cuda_malloc_async) {
#if CUDA_VERSION >= 11020
if (!cuda_malloc_async_memory_resource::is_supported(device_id))
throw std::invalid_argument(make_string(
"cudaMallocAsync is not supported on device ", device_id));
return std::make_shared<mm::cuda_malloc_async_memory_resource>(device_id);
#else
throw std::invalid_argument(
Expand Down
61 changes: 14 additions & 47 deletions dali/core/mm/perf_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,50 +29,19 @@
#include "dali/core/cuda_event.h"
#include "dali/core/cuda_error.h"
#include "dali/core/device_guard.h"
#include "dali/test/timing.h"

namespace dali {
namespace mm {
namespace test {

using perf_timer = std::chrono::high_resolution_clock;

inline void print_time(std::ostream &os, double seconds) {
if (seconds < 1e-6) {
os << seconds * 1e+9 << " ns";
} else if (seconds < 1e-3) {
os << seconds * 1e+6 << " µs";
} else if (seconds < 1.0) {
os << seconds * 1e+3 << " ms";
} else {
os << seconds << " s";
}
}

template <typename Rep, typename Period>
double seconds(std::chrono::duration<Rep, Period> time) {
return std::chrono::duration_cast<std::chrono::duration<double>>(time).count();
}

template <typename Rep, typename Period>
void print_time(std::ostream &os, std::chrono::duration<Rep, Period> time) {
return format_time(seconds(time));
}

inline std::string format_time(double seconds) {
std::stringstream ss;
print_time(ss, seconds);
return ss.str();
}

template <typename Rep, typename Period>
std::string format_time(std::chrono::duration<Rep, Period> time) {
return format_time(seconds(time));
}
using dali::test::format_time;
using dali::test::perf_timer;
using dali::test::seconds;

void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
int num_threads,
int num_streams,
double test_time) {
int num_streams) {
std::vector<CUDAStreamLease> streams;
streams.reserve(num_streams);
for (int i = 0; i < num_streams; i++)
Expand All @@ -97,14 +66,12 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,

std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; tid++) {
threads.emplace_back([&, tid]() {
(void)tid; // Make clang shut up; I prefer to keep this explicitly captured by value, even
// if not used, than end up with it being captured by reference when I need it.
threads.emplace_back([&, tid /* to avoid future bugs */]() {
(void)tid; // Silence a terribly ill-advised warning from clang.
std::mt19937_64 rng;
std::uniform_int_distribution<int> stream_dist(-1, num_streams - 1);
std::uniform_real_distribution<float> size_log_dist(4, 28);
std::uniform_real_distribution<float> size_log_dist(4, 24);
std::bernoulli_distribution action_dist(0.5);
auto test_start = perf_timer::now();

perf_timer::duration alloc_time = {};
perf_timer::duration dealloc_time = {};
Expand All @@ -113,10 +80,9 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
int64_t num_allocs = 0, num_deallocs = 0;
int64_t num_async_allocs = 0, num_async_deallocs = 0;

// while (seconds(perf_timer::now() - test_start) < test_time) {
for (int iter = 0; iter < 10000; iter++) {
bool is_free = action_dist(rng);
for (int iter = 0; iter < 100000; iter++) {
cudaDeviceSynchronize();
bool is_free = action_dist(rng);
if (is_free) {
Alloc alloc;

Expand Down Expand Up @@ -217,14 +183,15 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
// Benchmarks the default device memory resource on device 0 with a single
// worker thread and a single stream.
// Fix: the scraped diff left both the removed 4-argument call and the new
// 3-argument call in place; only the new `RunBenchmark(res, 1, 1)` (without
// the dropped test_time parameter) is kept.
TEST(MMPerfTest, DefaultGPUAlloc) {
  auto *res = mm::GetDefaultDeviceResource(0);

  RunBenchmark(res, 1, 1);
}

#if CUDA_VERSION >= 11020
// Benchmarks the cudaMallocAsync-backed resource with a single worker thread
// and a single stream; skipped on devices/drivers without cudaMallocAsync
// support (guarded by the surrounding CUDA_VERSION >= 11020 block).
// Fix: the scraped diff left both the removed 4-argument call and the new
// 3-argument call in place; only the new `RunBenchmark(&res, 1, 1)` is kept.
TEST(MMPerfTest, CudaMallocAsync) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  cuda_malloc_async_memory_resource res;

  RunBenchmark(&res, 1, 1);
}
#endif

Expand Down
42 changes: 39 additions & 3 deletions include/dali/core/mm/malloc_resource.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

#include <stdlib.h>
#include <malloc.h>
#include <vector>
#include "dali/core/mm/memory_resource.h"
#include "dali/core/cuda_error.h"
#include "dali/core/cuda_stream.h"
#include "dali/core/cuda_stream_pool.h"
#include "dali/core/mm/detail/align.h"
#include "dali/core/device_guard.h"

Expand Down Expand Up @@ -207,14 +209,48 @@ class managed_malloc_memory_resource : public managed_async_resource {
class cuda_malloc_async_memory_resource
: public mm::async_memory_resource<mm::memory_kind::device> {
public:
explicit cuda_malloc_async_memory_resource(int device_id = -1) {
cuda_malloc_async_memory_resource() : cuda_malloc_async_memory_resource(-1) {}

explicit cuda_malloc_async_memory_resource(int device_id) {
if (device_id < 0) {
CUDA_CALL(cudaGetDevice(&device_id));
}

device_id_ = device_id;
DeviceGuard dg(device_id_);
dummy_host_stream_ = CUDAStream::Create(true);
dummy_host_stream_ = CUDAStreamPool::instance().Get(device_id_);
}

static bool is_supported(int device_id = -1) {
static const int num_devices = []() {
int ndev;
CUDA_CALL(cudaGetDeviceCount(&ndev));
return ndev;
}();
enum Support {
unintialized = 0,
unsuppoerted = -1,
supported = 1
};
static vector<Support> support(num_devices);
if (device_id < 0)
CUDA_CALL(cudaGetDevice(&device_id));

if (!support[device_id]) {
auto stream = CUDAStreamPool::instance().Get(device_id);
try {
void *ptr;
CUDA_CALL(cudaMallocAsync(&ptr, 16, stream));
CUDA_CALL(cudaFreeAsync(ptr, stream));
support[device_id] = supported;
} catch (const CUDAError &e) {
if (e.rt_error() == cudaErrorNotSupported)
support[device_id] = unsuppoerted;
else
throw;
}
}
return support[device_id] == supported;
}

private:
Expand Down Expand Up @@ -244,7 +280,7 @@ class cuda_malloc_async_memory_resource
}

int device_id_;
CUDAStream dummy_host_stream_;
CUDAStreamLease dummy_host_stream_;
};

#endif
Expand Down

0 comments on commit a27223d

Please sign in to comment.