Add gpu memory watermark apis to JNI #11950

Merged · 15 commits · Oct 24, 2022
46 changes: 46 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -136,6 +136,52 @@ public static boolean isInitialized() throws RmmException {
*/
public static native long getTotalBytesAllocated();

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding during the
* lifetime of the process.
*
* Note that this result is only meaningful when a single thread is using the GPU, or
* when all threads have been joined and CUDA has been synchronized with all streams.
*/
public static native long getMaximumOutstanding();

/**
* Resets the local maximum counter of RMM memory used, to keep track of usage between
* sections of code while debugging.
*
* @param initialValue an initial value (in Bytes) to use for this local counter
*/
public static void resetLocalMaximumOutstanding(long initialValue) {
resetLocalMaximumOutstandingInternal(initialValue);
}

/**
* Resets the local maximum counter of RMM memory used, to keep track of usage between
* sections of code while debugging.
*
* This resets the counter to 0 Bytes.
*/
public static void resetLocalMaximumOutstanding() {
resetLocalMaximumOutstandingInternal(0L);
}

public static native void resetLocalMaximumOutstandingInternal(long initialValue);

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding since the last
* `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the
* maximum amount seen between reset and get calls).
*
* Note that this result is only meaningful when a single thread is using the GPU, or
* when all threads have been joined and CUDA has been synchronized with all streams.
*
* If the memory used is net negative (for example, if only frees have happened since
* the reset and the counter was reset to 0), the result will remain 0 until allocations
* once again exceed the amount that was outstanding at reset time.
*
* @return the local maximum in Bytes
*/
public static native long getLocalMaximumOutstanding();

/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure).
* @param handler event handler to invoke on RMM events or null to clear an existing handler
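
A minimal usage sketch of the new watermark APIs added above (illustrative only: the class name, pool size, and printed labels are not part of this PR; the calls mirror those exercised in RmmTest further down):

import ai.rapids.cudf.DeviceMemoryBuffer;
import ai.rapids.cudf.Rmm;
import ai.rapids.cudf.RmmAllocationMode;

public class WatermarkExample {
  public static void main(String[] args) {
    // Pooled allocator with a 512 MiB pool, as in the tests below.
    Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 512 * 1024 * 1024);

    try (DeviceMemoryBuffer a = Rmm.alloc(1024);
         DeviceMemoryBuffer b = Rmm.alloc(2048)) {
      // Process-lifetime high-water mark: 3072 bytes are outstanding at this point.
      System.out.println("max outstanding = " + Rmm.getMaximumOutstanding());
    }

    // Open a "local" measurement window around the section of code being debugged.
    Rmm.resetLocalMaximumOutstanding();
    try (DeviceMemoryBuffer c = Rmm.alloc(4096)) {
      // The local watermark only reflects usage since the reset: 4096 bytes.
      System.out.println("local max outstanding = " + Rmm.getLocalMaximumOutstanding());
    }

    // The process-lifetime watermark is unaffected by local resets
    // (it is now 4096, the largest outstanding total seen so far).
    System.out.println("max outstanding = " + Rmm.getMaximumOutstanding());

    Rmm.shutdown();
  }
}
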
61 changes: 61 additions & 0 deletions java/src/main/native/src/RmmJni.cpp
@@ -50,6 +50,12 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
class base_tracking_resource_adaptor : public device_memory_resource {
public:
virtual std::size_t get_total_allocated() = 0;

virtual std::size_t get_max_outstanding() = 0;

virtual void reset_local_max_outstanding(std::size_t initial_value) = 0;

virtual std::size_t get_local_max_outstanding() = 0;
};

@@ -79,10 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

std::size_t get_total_allocated() override { return total_allocated.load(); }

std::size_t get_max_outstanding() override { return max_outstanding; }

void reset_local_max_outstanding(std::size_t initial_value) override {
local_max_outstanding = initial_value;
// keep track of where we currently are when the reset call is issued
local_allocated = total_allocated;
}

std::size_t get_local_max_outstanding() override { return local_max_outstanding; }

private:
Upstream *const resource;
std::size_t const size_align;
std::atomic_size_t total_allocated{0};
std::size_t max_outstanding{0};
std::size_t local_allocated{0};
std::size_t local_max_outstanding{0};

void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
// adjust size of allocation based on specified size alignment
@@ -91,6 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
auto result = resource->allocate(num_bytes, stream);
if (result) {
total_allocated += num_bytes;

// Note: none of the below is thread safe. It is only meaningful when
// a single thread is used.
max_outstanding = std::max(total_allocated.load(), max_outstanding);

// `total_allocated - local_allocated` can be negative in the case where we free
// after we call `reset_local_max_outstanding`
std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
jbrennan333 (Contributor) commented on Oct 20, 2022:

Maybe use static_cast<intptr_t> instead of static_cast<long>. I don't think long is guaranteed to be big enough to hold a size_t.

Contributor Author replied:

So I think long == long long here (this comes down to 32-bit vs 64-bit compiled programs). To cover all of std::size_t, I'd have to go to unsigned long. That's a lot of GPU memory ;) I am not sure we need to worry too much about that, especially since we are going to send this to Spark shortly, which runs Java, and Java's long is 64-bit and signed.

size_t max value: 18446744073709551615
long max value: 9223372036854775807
unsigned long max value: 18446744073709551615
long long max value: 9223372036854775807
unsigned long long max value: 18446744073709551615

Contributor replied:

I think in this case long is sufficient because we are on an LP64 architecture (we don't run on Windows, do we?).
std::intptr_t is guaranteed to be the same width as std::size_t, but signed (I don't think ssize_t is standard?). You could use int64_t here since, as you say, we know we are going to pass it to Java, which uses 64 bits. This was more of a technical nit than an actual concern that it will break (too much history cross-porting to different architectures...).

ttnghia (Contributor) commented on Oct 20, 2022:

These types are confusing because they can be aliases of other types depending on the system. Therefore, for clarity, please always use the fixed-width types: (u)int32_t and (u)int64_t. They guarantee known limits.

Contributor Author replied:

So this is a debug feature, we don't use the (u)int* types here right now (i.e. I'd make things more inconsistent unless I changed the whole thing), and I am not sure whether cuDF is moving away from the alias types. I think we can update in a single future PR that moves away from these old types to the better ones.

local_max_outstanding = std::max(local_diff, local_max_outstanding);
}
return result;
}
@@ -132,6 +160,26 @@ std::size_t get_total_bytes_allocated() {
return 0;
}

std::size_t get_max_outstanding() {
if (Tracking_memory_resource) {
return Tracking_memory_resource->get_max_outstanding();
}
return 0;
}

void reset_local_max_outstanding(std::size_t initial_value) {
if (Tracking_memory_resource) {
Tracking_memory_resource->reset_local_max_outstanding(initial_value);
}
}

std::size_t get_local_max_outstanding() {
if (Tracking_memory_resource) {
return Tracking_memory_resource->get_local_max_outstanding();
}
return 0;
}

/**
* @brief An RMM device memory resource adaptor that delegates to the wrapped resource
* for most operations but will call Java to handle certain situations (e.g.: allocation failure).
@@ -455,6 +503,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
return get_total_bytes_allocated();
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) {
return get_max_outstanding();
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal(
JNIEnv *env, jclass, jlong initialValue) {
reset_local_max_outstanding(initialValue);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) {
return get_local_max_outstanding();
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
jlong stream) {
try {
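
As a footnote to the type-width discussion in the review comments above, here is a tiny Java sketch (illustrative only; the class name is hypothetical) of why a signed 64-bit value is the effective ceiling once these counters cross the JNI boundary: Java's long is always 64-bit and signed, while size_t on an LP64 system is unsigned 64-bit.

public class JniLongLimits {
  public static void main(String[] args) {
    // Java's long is 64-bit and signed on every platform.
    System.out.println(Long.MAX_VALUE);             // 9223372036854775807
    // The unsigned 64-bit maximum (size_t on LP64), printed for comparison.
    System.out.println(Long.toUnsignedString(-1L)); // 18446744073709551615
  }
}
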
83 changes: 83 additions & 0 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getMaximumOutstanding());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(1024, Rmm.getMaximumOutstanding());
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testLocalMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getLocalMaximumOutstanding());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(2048, Rmm.getLocalMaximumOutstanding());

Rmm.resetLocalMaximumOutstanding(0);
assertEquals(0, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());

DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(1024, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());
assertEquals(0, Rmm.getTotalBytesAllocated());

// a non-zero value is the new minimum
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
ignored2.close();
Rmm.resetLocalMaximumOutstanding(10000);
assertEquals(10000, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());

try (DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
Rmm.resetLocalMaximumOutstanding(1024);
try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
assertEquals(20480, Rmm.getLocalMaximumOutstanding());
assertEquals(21504, Rmm.getMaximumOutstanding());
}
}
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getLocalMaximumOutstanding());
Rmm.resetLocalMaximumOutstanding();
assertEquals(0, Rmm.getLocalMaximumOutstanding());
}
// the local maximum stays 0 because we have freed a net 2048 bytes since the reset
assertEquals(0, Rmm.getLocalMaximumOutstanding());
DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(0, Rmm.getLocalMaximumOutstanding());

// if we allocate 2KB and then 256B we start seeing a positive local maximum
try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
assertEquals(256, Rmm.getLocalMaximumOutstanding());
}
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,