Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu memory watermark apis to JNI #11950

Merged
merged 15 commits into from
Oct 24, 2022
Merged
12 changes: 5 additions & 7 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,6 @@ public static boolean isInitialized() throws RmmException {
* Resets a local maximum counter of RMM memory used to keep track of usage between
* sections of code while debugging.
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
*
* @param initialValue an initial value (in Bytes) to use for this local counter
*/
public static void resetLocalMaximumOutstanding(long initialValue) {
jlowe marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -162,9 +159,6 @@ public static void resetLocalMaximumOutstanding(long initialValue) {
* Resets a local maximum counter of RMM memory used to keep track of usage between
* sections of code while debugging.
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
*
* This resets the counter to 0 Bytes.
*/
public static void resetLocalMaximumOutstanding() {
Expand All @@ -180,7 +174,11 @@ public static void resetLocalMaximumOutstanding() {
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
* @return
*
* If the memory used is net negative (for example, if only frees happened since the
* last reset, and we reset to 0), then the result will be 0 until the next reset
jlowe marked this conversation as resolved.
Show resolved Hide resolved
*
* @return the local maximum in Bytes
*/
public static native long getLocalMaximumOutstanding();

Expand Down
26 changes: 15 additions & 11 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

std::size_t get_total_allocated() override { return total_allocated.load(); }

std::size_t get_max_outstanding() override { return max_outstanding.load(); }
std::size_t get_max_outstanding() override { return max_outstanding; }

void reset_local_max_outstanding(std::size_t initial_value) override {
local_max_outstanding = initial_value;
local_allocated = initial_value;
// keep track of where we currently are when the reset call is issued
local_allocated = total_allocated;
}

std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); }
std::size_t get_local_max_outstanding() override { return local_max_outstanding; }

private:
Upstream *const resource;
std::size_t const size_align;
std::atomic_size_t total_allocated{0};
std::atomic_size_t max_outstanding{0};
std::atomic_size_t local_allocated{0};
std::atomic_size_t local_max_outstanding{0};
std::size_t max_outstanding{0};
std::size_t local_allocated{0};
std::size_t local_max_outstanding{0};

void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
// adjust size of allocation based on specified size alignment
Expand All @@ -109,11 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
auto result = resource->allocate(num_bytes, stream);
if (result) {
total_allocated += num_bytes;
local_allocated += num_bytes;

// Note: this is not thread safe.
max_outstanding.store(std::max(total_allocated, max_outstanding));
local_max_outstanding.store(std::max(local_allocated, local_max_outstanding));
// Note: none of the below is thread safe. It is only meaningful when
// a single thread is used.
max_outstanding = std::max(total_allocated.load(), max_outstanding);

// `total_allocated - local_allocated` can be negative in the case where we free
// after we call `reset_local_max_outstanding`
std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
Copy link
Contributor

@jbrennan333 jbrennan333 Oct 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use static_cast<intptr_t> instead of static_cast<long>
I don't think long is guaranteed to be big enough to hold a size_t.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think long == long long (I think this is 32bit vs 64bit compiled programs). To cover all of std::size_t, I'd have to go to unsigned long. That's a lot of GPU memory ;) I am not sure we need to worry too much about that, especially since we are going to send this to Spark shortly, which runs java, and java's long is 64-bit and signed.

size_t max value: 18446744073709551615
long max value: 9223372036854775807
unsigned long max value: 18446744073709551615
long long max value: 9223372036854775807
unsigned long long max value: 18446744073709551615

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in this case, long is sufficient because we are an LP64 architecture (we don't run on Windows, do we?).
std::intptr_t is guaranteed to be the same width as std::size_t, but signed (I don't think ssize_t is standard?). You could use int64_t here, since as you say we know we are going to pass it to java, which is using 64 bits. This was more of a technical nit, than an actual concern that it will break (too much history cross-porting to different architectures...)

Copy link
Contributor

@ttnghia ttnghia Oct 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These types are very confusing which can be alias of other types depending on the system. Therefore, for clarity, please always use the fix-width types: (u)int32_t and (u)int64_t. They guarantee you to have known limits.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is a debug feature, we don't use (u)int* right now (e.g. I'd make things more inconsistent unless I change the whole thing), and I am not sure whether cuDF is moving away from the alias types. I think we can update in one PR that is "go away from these old types to the better ones" in the future.

local_max_outstanding = std::max(local_diff, local_max_outstanding);
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}
return result;
}
Expand All @@ -125,7 +130,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

if (p) {
total_allocated -= size;
local_allocated -= size;
}
}

Expand Down
83 changes: 83 additions & 0 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testMaxOutstanding(int rmmAllocMode) {
  // Verify the global high-watermark counter: it rises with an allocation and
  // stays put after the buffer is freed.
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer buffer = Rmm.alloc(1024)) {
    assertEquals(1024, Rmm.getMaximumOutstanding());
  }
  // The buffer is closed, so nothing is outstanding, but the watermark sticks.
  assertEquals(0, Rmm.getTotalBytesAllocated());
  assertEquals(1024, Rmm.getMaximumOutstanding());
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testLocalMaxOutstanding(int rmmAllocMode) {
  // Verify the resettable local high-watermark counter: it tracks peak usage
  // since the last reset, independently of the global maximum.
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
       DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
  }
  assertEquals(0, Rmm.getTotalBytesAllocated());
  assertEquals(2048, Rmm.getLocalMaximumOutstanding());

  // Resetting the local counter does not disturb the global maximum.
  Rmm.resetLocalMaximumOutstanding(0);
  assertEquals(0, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());

  // try-with-resources guarantees the buffer is freed even if an assertion
  // throws, unlike a manual alloc()/close() pair.
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
  }
  assertEquals(1024, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());
  assertEquals(0, Rmm.getTotalBytesAllocated());

  // a non-zero value is the new minimum
  try (DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
  }
  Rmm.resetLocalMaximumOutstanding(10000);
  assertEquals(10000, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());

  try (DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
    // Reset while 1KB is still outstanding: the next allocation below grows
    // the local watermark from that baseline.
    Rmm.resetLocalMaximumOutstanding(1024);
    try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
      assertEquals(20480, Rmm.getLocalMaximumOutstanding());
      assertEquals(21504, Rmm.getMaximumOutstanding());
    }
  }
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
  // Verify the local watermark clamps at 0 when net memory usage since the
  // reset point is negative (more frees than allocations).
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
       DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
    Rmm.resetLocalMaximumOutstanding();
    assertEquals(0, Rmm.getLocalMaximumOutstanding());
  }
  // because we allocated a net -2048 Bytes since reset
  assertEquals(0, Rmm.getLocalMaximumOutstanding());
  // try-with-resources guarantees the buffer is freed even if an assertion
  // throws, unlike a manual alloc()/close() pair.
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
  }
  assertEquals(0, Rmm.getLocalMaximumOutstanding());

  // if we allocate 2KB and then 256B we start seeing a positive local maximum
  try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
       DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
    assertEquals(256, Rmm.getLocalMaximumOutstanding());
  }
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
Expand Down