Skip to content

Commit

Permalink
Review issues; add is_supported to cuda_malloc_async_memory_resource.
Browse files Browse the repository at this point in the history
Signed-off-by: Michal Zientkiewicz <[email protected]>
  • Loading branch information
mzient committed Jun 12, 2023
1 parent a0c5e48 commit a27223d
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 50 deletions.
15 changes: 15 additions & 0 deletions dali/core/mm/async_pool_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -522,26 +522,41 @@ TEST_F(MMAsyncPoolTest, VM_CrossStreamWithHogs) {
#if CUDA_VERSION >= 11020

// Exercises the cudaMallocAsync-backed resource with many threads hammering a
// single stream; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiThreadedSingleStreamRandom) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiThreadedSingleStreamRandom<cuda_malloc_async_memory_resource>();
}

// Exercises the cudaMallocAsync-backed resource with many threads and many
// streams; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiThreadedMultiStreamRandom) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiThreadedMultiStreamRandom<cuda_malloc_async_memory_resource>();
}

// Multi-stream random alloc/free pattern with GPU-hogging kernels in flight,
// run against the cudaMallocAsync-backed resource; skipped when unsupported.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_MultiStreamRandomWithGPUHogs) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->MultiStreamRandomWithGPUHogs<cuda_malloc_async_memory_resource>();
}

// Cross-stream allocation reuse scenario for the cudaMallocAsync-backed
// resource; skipped on devices/drivers without cudaMallocAsync support.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_CrossStream) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->CrossStream<cuda_malloc_async_memory_resource>();
}

// Cross-stream scenario with GPU-hogging work added, run against the
// cudaMallocAsync-backed resource; skipped when unsupported.
TEST_F(MMAsyncPoolTest, cudaMallocAsync_CrossStreamWithHogs) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  this->CrossStreamWithHogs<cuda_malloc_async_memory_resource>();
}
Expand Down
3 changes: 3 additions & 0 deletions dali/core/mm/default_resources.cc
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ inline std::shared_ptr<device_async_resource> CreateDefaultDeviceResource() {
CUDA_CALL(cudaGetDevice(&device_id));
if (MMEnv::get().use_cuda_malloc_async) {
#if CUDA_VERSION >= 11020
if (!cuda_malloc_async_memory_resource::is_supported(device_id))
throw std::invalid_argument(make_string(
"cudaMallocAsync is not supported on device ", device_id));
return std::make_shared<mm::cuda_malloc_async_memory_resource>(device_id);
#else
throw std::invalid_argument(
Expand Down
61 changes: 14 additions & 47 deletions dali/core/mm/perf_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,50 +29,19 @@
#include "dali/core/cuda_event.h"
#include "dali/core/cuda_error.h"
#include "dali/core/device_guard.h"
#include "dali/test/timing.h"

namespace dali {
namespace mm {
namespace test {

using perf_timer = std::chrono::high_resolution_clock;

inline void print_time(std::ostream &os, double seconds) {
if (seconds < 1e-6) {
os << seconds * 1e+9 << " ns";
} else if (seconds < 1e-3) {
os << seconds * 1e+6 << " µs";
} else if (seconds < 1.0) {
os << seconds * 1e+3 << " ms";
} else {
os << seconds << " s";
}
}

template <typename Rep, typename Period>
double seconds(std::chrono::duration<Rep, Period> time) {
return std::chrono::duration_cast<std::chrono::duration<double>>(time).count();
}

template <typename Rep, typename Period>
void print_time(std::ostream &os, std::chrono::duration<Rep, Period> time) {
return format_time(seconds(time));
}

inline std::string format_time(double seconds) {
std::stringstream ss;
print_time(ss, seconds);
return ss.str();
}

template <typename Rep, typename Period>
std::string format_time(std::chrono::duration<Rep, Period> time) {
return format_time(seconds(time));
}
using dali::test::format_time;
using dali::test::perf_timer;
using dali::test::seconds;

void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
int num_threads,
int num_streams,
double test_time) {
int num_streams) {
std::vector<CUDAStreamLease> streams;
streams.reserve(num_streams);
for (int i = 0; i < num_streams; i++)
Expand All @@ -97,14 +66,12 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,

std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; tid++) {
threads.emplace_back([&, tid]() {
(void)tid; // Make clang shut up; I prefer to keep this explicitly captured by value, even
// if not used, than end up with it being captured by reference when I need it.
threads.emplace_back([&, tid /* to avoid future bugs */]() {
(void)tid; // Silence a terribly ill-advised warning from clang.
std::mt19937_64 rng;
std::uniform_int_distribution<int> stream_dist(-1, num_streams - 1);
std::uniform_real_distribution<float> size_log_dist(4, 28);
std::uniform_real_distribution<float> size_log_dist(4, 24);
std::bernoulli_distribution action_dist(0.5);
auto test_start = perf_timer::now();

perf_timer::duration alloc_time = {};
perf_timer::duration dealloc_time = {};
Expand All @@ -113,10 +80,9 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
int64_t num_allocs = 0, num_deallocs = 0;
int64_t num_async_allocs = 0, num_async_deallocs = 0;

// while (seconds(perf_timer::now() - test_start) < test_time) {
for (int iter = 0; iter < 10000; iter++) {
bool is_free = action_dist(rng);
for (int iter = 0; iter < 100000; iter++) {
cudaDeviceSynchronize();
bool is_free = action_dist(rng);
if (is_free) {
Alloc alloc;

Expand Down Expand Up @@ -217,14 +183,15 @@ void RunBenchmark(mm::async_memory_resource<mm::memory_kind::device> *res,
// Benchmarks the default device memory resource on device 0 with a single
// worker thread and a single stream.
// Fix: the scraped diff left both the removed 4-argument call and the new
// 3-argument call in place; only the new `RunBenchmark(res, 1, 1)` (without
// the dropped test_time parameter) is kept.
TEST(MMPerfTest, DefaultGPUAlloc) {
  auto *res = mm::GetDefaultDeviceResource(0);

  RunBenchmark(res, 1, 1);
}

#if CUDA_VERSION >= 11020
// Benchmarks the cudaMallocAsync-backed resource with a single worker thread
// and a single stream; skipped on devices/drivers without cudaMallocAsync
// support (guarded by the surrounding CUDA_VERSION >= 11020 block).
// Fix: the scraped diff left both the removed 4-argument call and the new
// 3-argument call in place; only the new `RunBenchmark(&res, 1, 1)` is kept.
TEST(MMPerfTest, CudaMallocAsync) {
  if (!cuda_malloc_async_memory_resource::is_supported())
    GTEST_SKIP() << "cudaMallocAsync not supported";
  cuda_malloc_async_memory_resource res;

  RunBenchmark(&res, 1, 1);
}
#endif

Expand Down
42 changes: 39 additions & 3 deletions include/dali/core/mm/malloc_resource.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

#include <stdlib.h>
#include <malloc.h>
#include <vector>
#include "dali/core/mm/memory_resource.h"
#include "dali/core/cuda_error.h"
#include "dali/core/cuda_stream.h"
#include "dali/core/cuda_stream_pool.h"
#include "dali/core/mm/detail/align.h"
#include "dali/core/device_guard.h"

Expand Down Expand Up @@ -207,14 +209,48 @@ class managed_malloc_memory_resource : public managed_async_resource {
class cuda_malloc_async_memory_resource
: public mm::async_memory_resource<mm::memory_kind::device> {
public:
explicit cuda_malloc_async_memory_resource(int device_id = -1) {
cuda_malloc_async_memory_resource() : cuda_malloc_async_memory_resource(-1) {}

explicit cuda_malloc_async_memory_resource(int device_id) {
if (device_id < 0) {
CUDA_CALL(cudaGetDevice(&device_id));
}

device_id_ = device_id;
DeviceGuard dg(device_id_);
dummy_host_stream_ = CUDAStream::Create(true);
dummy_host_stream_ = CUDAStreamPool::instance().Get(device_id_);
}

static bool is_supported(int device_id = -1) {
static const int num_devices = []() {
int ndev;
CUDA_CALL(cudaGetDeviceCount(&ndev));
return ndev;
}();
enum Support {
unintialized = 0,
unsuppoerted = -1,
supported = 1
};
static vector<Support> support(num_devices);
if (device_id < 0)
CUDA_CALL(cudaGetDevice(&device_id));

if (!support[device_id]) {
auto stream = CUDAStreamPool::instance().Get(device_id);
try {
void *ptr;
CUDA_CALL(cudaMallocAsync(&ptr, 16, stream));
CUDA_CALL(cudaFreeAsync(ptr, stream));
support[device_id] = supported;
} catch (const CUDAError &e) {
if (e.rt_error() == cudaErrorNotSupported)
support[device_id] = unsuppoerted;
else
throw;
}
}
return support[device_id] == supported;
}

private:
Expand Down Expand Up @@ -244,7 +280,7 @@ class cuda_malloc_async_memory_resource
}

int device_id_;
CUDAStream dummy_host_stream_;
CUDAStreamLease dummy_host_stream_;
};

#endif
Expand Down

0 comments on commit a27223d

Please sign in to comment.