rapidsai · rapids-bot · Oct 24, 2022 · Oct 15, 2022 · Oct 20, 2022 · Oct 20, 2022
@@ -136,6 +136,49 @@ public static boolean isInitialized() throws RmmException {
    */
   public static native long getTotalBytesAllocated();
 
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding during the
+   * lifetime of the process.
+   */
+  public static native long getMaximumTotalBytesAllocated();
+
+  /**
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
+   * code sections while debugging.
+   *
+   * @param initialValue an initial value (in Bytes) to use for this scoped counter
+   */
+  public static void resetScopedMaximumBytesAllocated(long initialValue) {
+    resetScopedMaximumBytesAllocatedInternal(initialValue);
+  }
+
+  /**
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
+   * code sections while debugging.
+   *
+   * This resets the counter to 0 Bytes.
+   */
+  public static void resetScopedMaximumBytesAllocated() {
+    resetScopedMaximumBytesAllocatedInternal(0L);
+  }
+
+  private static native void resetScopedMaximumBytesAllocatedInternal(long initialValue);
+
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
+   * `resetScopedMaximumOutstanding` call was issued (it is "scoped" because it's the
+   * maximum amount seen since the last reset).
+   *
+   * If the memory used is net negative (for example if only frees happened since
+   * reset, and we reset to 0), then result will be 0.
+   *
+   * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
+   * program and is equivalent to `getMaximumTotalBytesAllocated`.
+   *
+   * @return the scoped maximum bytes allocated
+   */
+  public static native long getScopedMaximumBytesAllocated();
+
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
    * @param handler event handler to invoke on RMM events or null to clear an existing handler

@@ -19,6 +19,7 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <mutex>
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
@@ -50,6 +51,12 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
 class base_tracking_resource_adaptor : public device_memory_resource {
 public:
   virtual std::size_t get_total_allocated() = 0;
+
+  virtual std::size_t get_max_total_allocated() = 0;
+
+  virtual void reset_scoped_max_total_allocated(std::size_t initial_value) = 0;
+
+  virtual std::size_t get_scoped_max_total_allocated() = 0;
 };
 
 /**
@@ -79,18 +86,47 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
+  std::size_t get_max_total_allocated() override { return max_total_allocated; }
+
+  void reset_scoped_max_total_allocated(std::size_t initial_value) override {
+    std::scoped_lock lock(max_total_allocated_mutex);
+    scoped_allocated = 0;
+    scoped_max_total_allocated = initial_value;
+  }
+
+  std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; }
+
 private:
   Upstream *const resource;
   std::size_t const size_align;
+  // sum of what is currently allocated
   std::atomic_size_t total_allocated{0};
 
+  // the maximum total allocated for the lifetime of this class
+  std::size_t max_total_allocated{0};
+
+  // the sum of what is currently outstanding from the last
+  // `reset_scoped_max_total_allocated` call. This can be negative.
+  std::atomic_long scoped_allocated{0};
+
+  // the maximum total allocated relative to the last
+  // `reset_scoped_max_total_allocated` call.
+  long scoped_max_total_allocated{0};
+
+  std::mutex max_total_allocated_mutex;
+
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
     num_bytes = (num_bytes + size_align - 1) / size_align * size_align;
 
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
+      scoped_allocated += num_bytes;
+
+      std::scoped_lock lock(max_total_allocated_mutex);
+      max_total_allocated = std::max(total_allocated.load(), max_total_allocated);
+      scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated);
     }
     return result;
   }
@@ -102,6 +138,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
+      scoped_allocated -= size;
     }
   }
 
@@ -132,6 +169,26 @@ std::size_t get_total_bytes_allocated() {
   return 0;
 }
 
+std::size_t get_max_total_allocated() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->get_max_total_allocated();
+  }
+  return 0;
+}
+
+void reset_scoped_max_total_allocated(std::size_t initial_value) {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->reset_scoped_max_total_allocated(initial_value);
+  }
+}
+
+std::size_t get_scoped_max_total_allocated() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->get_scoped_max_total_allocated();
+  }
+  return 0;
+}
+
 /**
  * @brief An RMM device memory resource adaptor that delegates to the wrapped resource
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).
@@ -455,6 +512,20 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
   return get_total_bytes_allocated();
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JNIEnv *env, jclass) {
+  return get_max_total_allocated();
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal(
+    JNIEnv *env, jclass, long initialValue) {
+  reset_scoped_max_total_allocated(initialValue);
+}
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env,
+                                                                               jclass) {
+  return get_scoped_max_total_allocated();
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
                                                               jlong stream) {
   try {

@@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
     assertEquals(0, Rmm.getTotalBytesAllocated());
   }
 
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
+      assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testScopedMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+
+    Rmm.resetScopedMaximumBytesAllocated();
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(1024, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+
+    // a non-zero value is the new minimum
+    DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
+    ignored2.close();
+    Rmm.resetScopedMaximumBytesAllocated(10000);
+    assertEquals(10000, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+
+    try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
+      Rmm.resetScopedMaximumBytesAllocated(1024);
+      try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
+        assertEquals(20480, Rmm.getScopedMaximumBytesAllocated());
+        assertEquals(21504, Rmm.getMaximumTotalBytesAllocated());
+      }
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testScopedMaxOutstandingNegative(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+      Rmm.resetScopedMaximumBytesAllocated();
+      assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    }
+    // because we allocated a net -2048 Bytes since reset
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+
+    // if we allocate 2KB and then 256B we start seeing a positive local maximum
+    try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
+         DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
+      assertEquals(256, Rmm.getScopedMaximumBytesAllocated());
+    }
+  }
+
   @ParameterizedTest
   @ValueSource(ints = {
       RmmAllocationMode.CUDA_DEFAULT,