From ee604aef33b4e40c609d464f889928108b04f54c Mon Sep 17 00:00:00 2001
From: Anis Ladram <aladram@nvidia.com>
Date: Fri, 11 Mar 2022 19:28:11 -0500
Subject: [PATCH 1/2] Fix free-before-alloc in multithreaded test

Uses a CUDA event to make sure streamA and streamB are correctly synchronized.
---
 tests/mr/device/mr_multithreaded_tests.cpp | 31 +++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)
diff --git a/tests/mr/device/mr_multithreaded_tests.cpp b/tests/mr/device/mr_multithreaded_tests.cpp
index 38c34d93f..b56468f15 100644
--- a/tests/mr/device/mr_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_multithreaded_tests.cpp
@@ -179,6 +179,7 @@ void allocate_loop(rmm::mr::device_memory_resource* mr,
                    std::size_t num_allocations,
                    std::list<allocation>& allocations,
                    std::mutex& mtx,
+                   cudaEvent_t& event,
                    rmm::cuda_stream_view stream)
 {
   constexpr std::size_t max_size{1_MiB};
@@ -191,6 +192,7 @@ void allocate_loop(rmm::mr::device_memory_resource* mr,
     void* ptr        = mr->allocate(size, stream);
     {
       std::lock_guard<std::mutex> lock(mtx);
+      EXPECT_EQ(cudaSuccess, cudaEventRecord(event, stream.value()));
       allocations.emplace_back(ptr, size);
     }
   }
@@ -200,12 +202,14 @@ void deallocate_loop(rmm::mr::device_memory_resource* mr,
                      std::size_t num_allocations,
                      std::list<allocation>& allocations,
                      std::mutex& mtx,
+                     cudaEvent_t& event,
                      rmm::cuda_stream_view stream)
 {
   for (std::size_t i = 0; i < num_allocations;) {
     std::lock_guard<std::mutex> lock(mtx);
     if (allocations.empty()) { continue; }
     i++;
+    EXPECT_EQ(cudaSuccess, cudaStreamWaitEvent(stream.value(), event));
     allocation alloc = allocations.front();
     allocations.pop_front();
     mr->deallocate(alloc.ptr, alloc.size, stream);
@@ -220,15 +224,30 @@ void test_allocate_free_different_threads(rmm::mr::device_memory_resource* mr,
 
   std::mutex mtx;
   std::list<allocation> allocations;
-
-  std::thread producer(
-    allocate_loop, mr, num_allocations, std::ref(allocations), std::ref(mtx), streamA);
-
-  std::thread consumer(
-    deallocate_loop, mr, num_allocations, std::ref(allocations), std::ref(mtx), streamB);
+  cudaEvent_t event;
+
+  EXPECT_EQ(cudaSuccess, cudaEventCreate(&event));
+
+  std::thread producer(allocate_loop,
+                       mr,
+                       num_allocations,
+                       std::ref(allocations),
+                       std::ref(mtx),
+                       std::ref(event),
+                       streamA);
+
+  std::thread consumer(deallocate_loop,
+                       mr,
+                       num_allocations,
+                       std::ref(allocations),
+                       std::ref(mtx),
+                       std::ref(event),
+                       streamB);
 
   producer.join();
   consumer.join();
+
+  EXPECT_EQ(cudaSuccess, cudaEventDestroy(event));
 }
 
 TEST_P(mr_test_mt, AllocFreeDifferentThreadsDefaultStream)

From 23f8e5b5f2fab8b80552211376998ce64acace5b Mon Sep 17 00:00:00 2001
From: Anis Ladram <aladram@nvidia.com>
Date: Tue, 15 Mar 2022 10:08:08 -0400
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Mark Harris <mharris@nvidia.com>
---
 tests/mr/device/mr_multithreaded_tests.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/mr/device/mr_multithreaded_tests.cpp b/tests/mr/device/mr_multithreaded_tests.cpp
index b56468f15..6d6d8edc2 100644
--- a/tests/mr/device/mr_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_multithreaded_tests.cpp
@@ -192,7 +192,7 @@ void allocate_loop(rmm::mr::device_memory_resource* mr,
     void* ptr        = mr->allocate(size, stream);
     {
       std::lock_guard<std::mutex> lock(mtx);
-      EXPECT_EQ(cudaSuccess, cudaEventRecord(event, stream.value()));
+      RMM_CUDA_TRY(cudaEventRecord(event, stream.value()));
       allocations.emplace_back(ptr, size);
     }
   }
@@ -209,7 +209,7 @@ void deallocate_loop(rmm::mr::device_memory_resource* mr,
     std::lock_guard<std::mutex> lock(mtx);
     if (allocations.empty()) { continue; }
     i++;
-    EXPECT_EQ(cudaSuccess, cudaStreamWaitEvent(stream.value(), event));
+    RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event));
     allocation alloc = allocations.front();
     allocations.pop_front();
     mr->deallocate(alloc.ptr, alloc.size, stream);
@@ -226,7 +226,7 @@ void test_allocate_free_different_threads(rmm::mr::device_memory_resource* mr,
   std::list<allocation> allocations;
   cudaEvent_t event;
 
-  EXPECT_EQ(cudaSuccess, cudaEventCreate(&event));
+  RMM_CUDA_TRY(cudaEventCreate(&event));
 
   std::thread producer(allocate_loop,
                        mr,
@@ -247,7 +247,7 @@ void test_allocate_free_different_threads(rmm::mr::device_memory_resource* mr,
   producer.join();
   consumer.join();
 
-  EXPECT_EQ(cudaSuccess, cudaEventDestroy(event));
+  RMM_CUDA_TRY(cudaEventDestroy(event));
 }
 
 TEST_P(mr_test_mt, AllocFreeDifferentThreadsDefaultStream)