From 4f9a18ec75c65bc3a990f9a37554288f32eee840 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Sat, 15 Oct 2022 16:25:55 -0500
Subject: [PATCH 01/14] Add gpu memory tracking api in JNI to track maximum
 memory usage

---
 .../java/ai/rapids/cudf/GpuMemoryTracker.java | 86 +++++++++++++++++++
 java/src/main/java/ai/rapids/cudf/Rmm.java    |  2 +
 java/src/main/native/src/RmmJni.cpp           | 73 ++++++++++++++++
 3 files changed, 161 insertions(+)
 create mode 100644 java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java
diff --git a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java
new file mode 100644
index 00000000000..57b37be2238
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.rapids.cudf;
+
+import java.util.Optional;
+
+/**
+ * This is a helper class to track the maximum amount of GPU memory outstanding
+ * for the current thread (stream in PTDS). If free ocurrs while tracking, and the 
+ * free is for memory that wasn't created in the scope, or it was created in a different
+ * thread, it will be ignored.
+ *
+ * The constructor enables a new memory tracking scope and .close stops tracking, and collects
+ * the result.
+ *
+ * If `ai.rapids.cudf.gpuMemoryTracking.enabled` is false (default), the result of 
+ * `getMaxOutstanding` is an empty java Optional<long>.
+ *
+ * Usage:
+ *
+ * <pre>
+ *   try (GpuMemoryTracker a = new GpuMemoryTracker()) {
+ *     ...
+ *     try (GpuMemoryTracker b = new GpuMemoryTracker()) {
+ *       ...
+ *       // bMaxMemory is the maximum memory used while b is not closed
+ *       Optional<long> bMaxMemory = b.getMaxOutsanding();
+ *     }
+ *     ...
+ *
+ *     // aMaxMemory is the maximum memory used while a is not closed
+ *     // which includes bMaxMemory.
+ *     Optional<long> aMaxMemory = a.getMaxOutsanding();
+ *   }
+ * </pre>
+ *
+ * Instances should be associated with a single thread and should be at a fine
+ * granularity. Tracking memory when there could be free of buffers created in different
+ * streams will have undeserired results.
+ */
+public class GpuMemoryTracker implements AutoCloseable {
+  private static final boolean isEnabled = 
+    Boolean.getBoolean("ai.rapids.cudf.gpuMemoryTracking.enabled");
+
+  private long maxOutstanding;
+
+  static {
+    if (isEnabled) {
+      NativeDepsLoader.loadNativeDeps();
+    }
+  }
+
+  public GpuMemoryTracker() {
+    if (isEnabled) {
+      Rmm.pushThreadMemoryTracker();
+    }
+  }
+
+  @Override
+  public void close() {
+    if (isEnabled) {
+      maxOutstanding = Rmm.popThreadMemoryTracker();
+    }
+  }
+
+  public Optional<Long> getMaxOutstanding() {
+    if (isEnabled) {
+      return Optional.of(maxOutstanding);
+    } else {
+      return Optional.empty();
+    }
+  }
+}
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 730f82f0047..6273de334a9 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -135,6 +135,8 @@ public static boolean isInitialized() throws RmmException {
    * the result will always be a lower bound on the amount allocated.
    */
   public static native long getTotalBytesAllocated();
+  public static native void pushThreadMemoryTracker();
+  public static native long popThreadMemoryTracker();
 
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 2b4c5ae59f5..d98ab7495eb 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -19,6 +19,7 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <stack>
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
@@ -50,8 +51,18 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
 class base_tracking_resource_adaptor : public device_memory_resource {
 public:
   virtual std::size_t get_total_allocated() = 0;
+  virtual void push_thread_memory_tracker() = 0;
+  virtual long pop_thread_memory_tracker() = 0;
 };
 
+struct memory_tracker {
+  long current_outstanding;
+  long max_outstanding;
+};
+
+thread_local std::stack<memory_tracker> memory_tracker_stack = std::stack<memory_tracker>();
+thread_local std::unordered_map<long, std::size_t> alloc_map;
+
 /**
  * @brief An RMM device memory resource that delegates to another resource
  * while tracking the amount of memory allocated.
@@ -79,11 +90,50 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
+  void push_thread_memory_tracker() override { 
+    memory_tracker_stack.emplace();
+  }
+
+  long pop_thread_memory_tracker() override { 
+    auto top_tracker = memory_tracker_stack.top();
+    auto ret = top_tracker.max_outstanding;
+    memory_tracker_stack.pop();
+    if (memory_tracker_stack.empty()) {
+      alloc_map.clear();
+    } else {
+      // carry the max to the next level
+      memory_tracker_stack.top().max_outstanding += ret;
+    }
+    return ret;
+  }
+
 private:
+
   Upstream *const resource;
   std::size_t const size_align;
   std::atomic_size_t total_allocated{0};
 
+  void thread_allocated(long addr, std::size_t num_bytes) {
+    if (!memory_tracker_stack.empty()) {
+      alloc_map[addr] = num_bytes;                  
+      memory_tracker& tracker = memory_tracker_stack.top();
+      tracker.current_outstanding += num_bytes;
+      tracker.max_outstanding = 
+        std::max(tracker.current_outstanding, tracker.max_outstanding);
+    }
+  }
+
+  void thread_freed(long addr, std::size_t num_bytes) {
+    if (!memory_tracker_stack.empty()) {
+      auto it = alloc_map.find(addr);
+      if (it != alloc_map.end()) {
+        auto tracker = memory_tracker_stack.top();
+        tracker.current_outstanding -= it->second;
+        alloc_map.erase(it);
+      }
+    }
+  }
+
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
     num_bytes = (num_bytes + size_align - 1) / size_align * size_align;
@@ -91,6 +141,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
+      thread_allocated(reinterpret_cast<long>(result), num_bytes);
     }
     return result;
   }
@@ -102,6 +153,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
+      thread_freed(reinterpret_cast<long>(p), size);
     }
   }
 
@@ -132,6 +184,19 @@ std::size_t get_total_bytes_allocated() {
   return 0;
 }
 
+void push_thread_memory_tracker() {
+  if (Tracking_memory_resource) {
+    Tracking_memory_resource->push_thread_memory_tracker();
+  }
+}
+
+long pop_thread_memory_tracker() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->pop_thread_memory_tracker();
+  }
+  return 0;
+}
+
 /**
  * @brief An RMM device memory resource adaptor that delegates to the wrapped resource
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).
@@ -455,6 +520,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
   return get_total_bytes_allocated();
 }
 
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_pushThreadMemoryTracker(JNIEnv *env, jclass) {
+  push_thread_memory_tracker();
+}
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_popThreadMemoryTracker(JNIEnv *env, jclass) {
+  return pop_thread_memory_tracker();
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
                                                               jlong stream) {
   try {

From 543080e4731e3b7d8dc5ec2dca7e008fda77939d Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 10:07:45 -0500
Subject: [PATCH 02/14] clang-format changes

---
 java/src/main/native/src/RmmJni.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index d98ab7495eb..b8231ce63b3 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -90,11 +90,9 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
-  void push_thread_memory_tracker() override { 
-    memory_tracker_stack.emplace();
-  }
+  void push_thread_memory_tracker() override { memory_tracker_stack.emplace(); }
 
-  long pop_thread_memory_tracker() override { 
+  long pop_thread_memory_tracker() override {
     auto top_tracker = memory_tracker_stack.top();
     auto ret = top_tracker.max_outstanding;
     memory_tracker_stack.pop();
@@ -108,18 +106,16 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
   }
 
 private:
-
   Upstream *const resource;
   std::size_t const size_align;
   std::atomic_size_t total_allocated{0};
 
   void thread_allocated(long addr, std::size_t num_bytes) {
     if (!memory_tracker_stack.empty()) {
-      alloc_map[addr] = num_bytes;                  
-      memory_tracker& tracker = memory_tracker_stack.top();
+      alloc_map[addr] = num_bytes;
+      memory_tracker &tracker = memory_tracker_stack.top();
       tracker.current_outstanding += num_bytes;
-      tracker.max_outstanding = 
-        std::max(tracker.current_outstanding, tracker.max_outstanding);
+      tracker.max_outstanding = std::max(tracker.current_outstanding, tracker.max_outstanding);
     }
   }
 

From 7dfac3321d29c75cc436b08dd65f4721c4dea29e Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 13:26:23 -0500
Subject: [PATCH 03/14] Adds global/local high memory usage watermark functions
 to JNI

---
 .../java/ai/rapids/cudf/GpuMemoryTracker.java | 86 ------------------
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 50 +++++++++-
 java/src/main/native/src/RmmJni.cpp           | 91 ++++++++-----------
 3 files changed, 88 insertions(+), 139 deletions(-)
 delete mode 100644 java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java

diff --git a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java
deleted file mode 100644
index 57b37be2238..00000000000
--- a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package ai.rapids.cudf;
-
-import java.util.Optional;
-
-/**
- * This is a helper class to track the maximum amount of GPU memory outstanding
- * for the current thread (stream in PTDS). If free ocurrs while tracking, and the 
- * free is for memory that wasn't created in the scope, or it was created in a different
- * thread, it will be ignored.
- *
- * The constructor enables a new memory tracking scope and .close stops tracking, and collects
- * the result.
- *
- * If `ai.rapids.cudf.gpuMemoryTracking.enabled` is false (default), the result of 
- * `getMaxOutstanding` is an empty java Optional<long>.
- *
- * Usage:
- *
- * <pre>
- *   try (GpuMemoryTracker a = new GpuMemoryTracker()) {
- *     ...
- *     try (GpuMemoryTracker b = new GpuMemoryTracker()) {
- *       ...
- *       // bMaxMemory is the maximum memory used while b is not closed
- *       Optional<long> bMaxMemory = b.getMaxOutsanding();
- *     }
- *     ...
- *
- *     // aMaxMemory is the maximum memory used while a is not closed
- *     // which includes bMaxMemory.
- *     Optional<long> aMaxMemory = a.getMaxOutsanding();
- *   }
- * </pre>
- *
- * Instances should be associated with a single thread and should be at a fine
- * granularity. Tracking memory when there could be free of buffers created in different
- * streams will have undeserired results.
- */
-public class GpuMemoryTracker implements AutoCloseable {
-  private static final boolean isEnabled = 
-    Boolean.getBoolean("ai.rapids.cudf.gpuMemoryTracking.enabled");
-
-  private long maxOutstanding;
-
-  static {
-    if (isEnabled) {
-      NativeDepsLoader.loadNativeDeps();
-    }
-  }
-
-  public GpuMemoryTracker() {
-    if (isEnabled) {
-      Rmm.pushThreadMemoryTracker();
-    }
-  }
-
-  @Override
-  public void close() {
-    if (isEnabled) {
-      maxOutstanding = Rmm.popThreadMemoryTracker();
-    }
-  }
-
-  public Optional<Long> getMaxOutstanding() {
-    if (isEnabled) {
-      return Optional.of(maxOutstanding);
-    } else {
-      return Optional.empty();
-    }
-  }
-}
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 6273de334a9..e64e1150da4 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -135,8 +135,54 @@ public static boolean isInitialized() throws RmmException {
    * the result will always be a lower bound on the amount allocated.
    */
   public static native long getTotalBytesAllocated();
-  public static native void pushThreadMemoryTracker();
-  public static native long popThreadMemoryTracker();
+
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding during the
+   * lifetime of the process.
+   *
+   * Note that this result is meaningful when a single thread is using the GPU, or
+   * when we have joined all threads and CUDA synchronized with all streams.
+   */
+  public static native long getMaximumOutstanding();
+
+  /**
+   * Resets a local maximum counter of RMM memory used to keep track of usage between
+   * sections code while debugging.
+   *
+   * Note that this result is meaningful when a single thread is using the GPU, or
+   * when we have joined all threads and CUDA synchronized with all streams.
+   *
+   * @param initialValue an initial value (in Bytes) to use for this local counter
+   */
+  public static void resetLocalMaximumOutstanding(long initialValue) {
+    resetLocalMaximumOutstanding(initialValue);
+  }
+
+  /**
+   * Resets a local maximum counter of RMM memory used to keep track of usage between
+   * sections code while debugging.
+   *
+   * Note that this result is meaningful when a single thread is using the GPU, or
+   * when we have joined all threads and CUDA synchronized with all streams.
+   *
+   * This resets the counter to 0 Bytes.
+   */
+  public static void resetLocalMaximumOutstanding() {
+    resetLocalMaximumOutstandingInternal(0L);
+  }
+
+  public static native void resetLocalMaximumOutstandingNative(long initialValue);
+
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
+   * `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the
+   * maximum amount seen between reset and get calls).
+   *
+   * Note that this result is meaningful when a single thread is using the GPU, or
+   * when we have joined all threads and CUDA synchronized with all streams.
+   * @return
+   */
+  public static native long getLocalMaximumOutstanding();
 
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index b8231ce63b3..d8196b64ddf 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -51,17 +51,13 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
 class base_tracking_resource_adaptor : public device_memory_resource {
 public:
   virtual std::size_t get_total_allocated() = 0;
-  virtual void push_thread_memory_tracker() = 0;
-  virtual long pop_thread_memory_tracker() = 0;
-};
 
-struct memory_tracker {
-  long current_outstanding;
-  long max_outstanding;
-};
+  virtual std::size_t get_max_outstanding() = 0;
+
+  virtual void reset_local_max_outstanding(std::size_t initial_value) = 0;
 
-thread_local std::stack<memory_tracker> memory_tracker_stack = std::stack<memory_tracker>();
-thread_local std::unordered_map<long, std::size_t> alloc_map;
+  virtual std::size_t get_local_max_outstanding() = 0;
+};
 
 /**
  * @brief An RMM device memory resource that delegates to another resource
@@ -90,45 +86,22 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
-  void push_thread_memory_tracker() override { memory_tracker_stack.emplace(); }
+  std::size_t get_max_outstanding() override { return max_outstanding.load(); }
 
-  long pop_thread_memory_tracker() override {
-    auto top_tracker = memory_tracker_stack.top();
-    auto ret = top_tracker.max_outstanding;
-    memory_tracker_stack.pop();
-    if (memory_tracker_stack.empty()) {
-      alloc_map.clear();
-    } else {
-      // carry the max to the next level
-      memory_tracker_stack.top().max_outstanding += ret;
-    }
-    return ret;
+  void reset_local_max_outstanding(std::size_t initial_value) override {
+    local_max_outstanding = initial_value;
+    local_allocated = initial_value;
   }
 
+  std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); }
+
 private:
   Upstream *const resource;
   std::size_t const size_align;
   std::atomic_size_t total_allocated{0};
-
-  void thread_allocated(long addr, std::size_t num_bytes) {
-    if (!memory_tracker_stack.empty()) {
-      alloc_map[addr] = num_bytes;
-      memory_tracker &tracker = memory_tracker_stack.top();
-      tracker.current_outstanding += num_bytes;
-      tracker.max_outstanding = std::max(tracker.current_outstanding, tracker.max_outstanding);
-    }
-  }
-
-  void thread_freed(long addr, std::size_t num_bytes) {
-    if (!memory_tracker_stack.empty()) {
-      auto it = alloc_map.find(addr);
-      if (it != alloc_map.end()) {
-        auto tracker = memory_tracker_stack.top();
-        tracker.current_outstanding -= it->second;
-        alloc_map.erase(it);
-      }
-    }
-  }
+  std::atomic_size_t max_outstanding{0};
+  std::atomic_size_t local_allocated{0};
+  std::atomic_size_t local_max_outstanding{0};
 
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
@@ -137,7 +110,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
-      thread_allocated(reinterpret_cast<long>(result), num_bytes);
+      local_allocated += num_bytes;
+
+      // Note: this is not thread safe.
+      max_outstanding.store(std::max(total_allocated, max_outstanding));
+      local_max_outstanding.store(std::max(local_allocated, local_max_outstanding));
     }
     return result;
   }
@@ -149,7 +126,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
-      thread_freed(reinterpret_cast<long>(p), size);
+      local_allocated -= size;
     }
   }
 
@@ -180,15 +157,22 @@ std::size_t get_total_bytes_allocated() {
   return 0;
 }
 
-void push_thread_memory_tracker() {
+std::size_t get_max_outstanding() {
   if (Tracking_memory_resource) {
-    Tracking_memory_resource->push_thread_memory_tracker();
+    return Tracking_memory_resource->get_max_outstanding();
   }
+  return 0;
 }
 
-long pop_thread_memory_tracker() {
+void reset_local_max_outstanding(std::size_t initial_value) {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->pop_thread_memory_tracker();
+    return Tracking_memory_resource->reset_local_max_outstanding(initial_value);
+  }
+}
+
+std::size_t get_local_max_outstanding() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->get_local_max_outstanding();
   }
   return 0;
 }
@@ -516,12 +500,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
   return get_total_bytes_allocated();
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_pushThreadMemoryTracker(JNIEnv *env, jclass) {
-  push_thread_memory_tracker();
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) {
+  return get_max_outstanding();
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal(
+    JNIEnv *env, jclass, long initialValue) {
+  reset_local_max_outstanding(initialValue);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_popThreadMemoryTracker(JNIEnv *env, jclass) {
-  return pop_thread_memory_tracker();
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) {
+  return get_local_max_outstanding();
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,

From cf851c318bc704802c8df1ec115b68fdd8a395f6 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 13:28:13 -0500
Subject: [PATCH 04/14] Fix typo

---
 java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index e64e1150da4..e521078478f 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -171,7 +171,7 @@ public static void resetLocalMaximumOutstanding() {
     resetLocalMaximumOutstandingInternal(0L);
   }
 
-  public static native void resetLocalMaximumOutstandingNative(long initialValue);
+  public static native void resetLocalMaximumOutstandingInternal(long initialValue);
 
   /**
    * Returns the maximum amount of RMM memory (Bytes) outstanding since the last

From 9171b1c958d64f68e3f4a01b9d6bc169f5595d0b Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 13:29:26 -0500
Subject: [PATCH 05/14] Remove unnecesary include

---
 java/src/main/native/src/RmmJni.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index d8196b64ddf..2324b579d40 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -19,7 +19,6 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
-#include <stack>
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>

From 5aaa9790ee560e071abcf175372d055fa0a673a3 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 13:38:25 -0500
Subject: [PATCH 06/14] Make sure we call the native function

---
 java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index e521078478f..7064e726882 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -155,7 +155,7 @@ public static boolean isInitialized() throws RmmException {
    * @param initialValue an initial value (in Bytes) to use for this local counter
    */
   public static void resetLocalMaximumOutstanding(long initialValue) {
-    resetLocalMaximumOutstanding(initialValue);
+    resetLocalMaximumOutstandingInternal(initialValue);
   }
 
   /**

From d9d4b530f6e63611d1487b26689be86d496a37ee Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 15:42:20 -0500
Subject: [PATCH 07/14] Fix overflow issue

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 12 ++-
 java/src/main/native/src/RmmJni.cpp           | 26 +++---
 .../src/test/java/ai/rapids/cudf/RmmTest.java | 83 +++++++++++++++++++
 3 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 7064e726882..bf4fad7ec57 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -149,9 +149,6 @@ public static boolean isInitialized() throws RmmException {
    * Resets a local maximum counter of RMM memory used to keep track of usage between
    * sections code while debugging.
    *
-   * Note that this result is meaningful when a single thread is using the GPU, or
-   * when we have joined all threads and CUDA synchronized with all streams.
-   *
    * @param initialValue an initial value (in Bytes) to use for this local counter
    */
   public static void resetLocalMaximumOutstanding(long initialValue) {
@@ -162,9 +159,6 @@ public static void resetLocalMaximumOutstanding(long initialValue) {
    * Resets a local maximum counter of RMM memory used to keep track of usage between
    * sections code while debugging.
    *
-   * Note that this result is meaningful when a single thread is using the GPU, or
-   * when we have joined all threads and CUDA synchronized with all streams.
-   *
    * This resets the counter to 0 Bytes.
    */
   public static void resetLocalMaximumOutstanding() {
@@ -180,7 +174,11 @@ public static void resetLocalMaximumOutstanding() {
    *
    * Note that this result is meaningful when a single thread is using the GPU, or
    * when we have joined all threads and CUDA synchronized with all streams.
-   * @return
+   *
+   * If the memory used is net negative (for example if only frees happened since
+   * reset, and we reset to 0), then result will be 0 until we reset
+   *
+   * @return the local maximum in Bytes
    */
   public static native long getLocalMaximumOutstanding();
 
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 2324b579d40..d115e7cd921 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -85,22 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
-  std::size_t get_max_outstanding() override { return max_outstanding.load(); }
+  std::size_t get_max_outstanding() override { return max_outstanding; }
 
   void reset_local_max_outstanding(std::size_t initial_value) override {
     local_max_outstanding = initial_value;
-    local_allocated = initial_value;
+    // keep track of where we currently are when the reset call is issued
+    local_allocated = total_allocated;
   }
 
-  std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); }
+  std::size_t get_local_max_outstanding() override { return local_max_outstanding; }
 
 private:
   Upstream *const resource;
   std::size_t const size_align;
   std::atomic_size_t total_allocated{0};
-  std::atomic_size_t max_outstanding{0};
-  std::atomic_size_t local_allocated{0};
-  std::atomic_size_t local_max_outstanding{0};
+  std::size_t max_outstanding{0};
+  std::size_t local_allocated{0};
+  std::size_t local_max_outstanding{0};
 
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
@@ -109,11 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
-      local_allocated += num_bytes;
 
-      // Note: this is not thread safe.
-      max_outstanding.store(std::max(total_allocated, max_outstanding));
-      local_max_outstanding.store(std::max(local_allocated, local_max_outstanding));
+      // Note: none of the below is thread safe. It is only meaningful when
+      // a single thread is used.
+      max_outstanding = std::max(total_allocated.load(), max_outstanding);
+
+      // `total_allocated - local_allocated` can be negative in the case where we free
+      // after we call `reset_local_max_outstanding`
+      std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
+      local_max_outstanding = std::max(local_diff, local_max_outstanding);
     }
     return result;
   }
@@ -125,7 +130,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
-      local_allocated -= size;
     }
   }
 
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index 09fbedd8a1c..4af50759ac3 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
     assertEquals(0, Rmm.getTotalBytesAllocated());
   }
 
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumOutstanding());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
+      assertEquals(1024, Rmm.getMaximumOutstanding());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(1024, Rmm.getMaximumOutstanding());
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testLocalMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumOutstanding());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getLocalMaximumOutstanding());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
+
+    Rmm.resetLocalMaximumOutstanding(0);
+    assertEquals(0, Rmm.getLocalMaximumOutstanding());
+    assertEquals(2048, Rmm.getMaximumOutstanding());
+
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(1024, Rmm.getLocalMaximumOutstanding());
+    assertEquals(2048, Rmm.getMaximumOutstanding());
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+
+    // a non-zero value is the new minimum
+    DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
+    ignored2.close();
+    Rmm.resetLocalMaximumOutstanding(10000);
+    assertEquals(10000, Rmm.getLocalMaximumOutstanding());
+    assertEquals(2048, Rmm.getMaximumOutstanding());
+
+    try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
+      Rmm.resetLocalMaximumOutstanding(1024);
+      try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
+        assertEquals(20480, Rmm.getLocalMaximumOutstanding());
+        assertEquals(21504, Rmm.getMaximumOutstanding());
+      }
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumOutstanding());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getLocalMaximumOutstanding());
+      Rmm.resetLocalMaximumOutstanding();
+      assertEquals(0, Rmm.getLocalMaximumOutstanding());
+    }
+    // because we allocated a net -2048 Bytes since reset
+    assertEquals(0, Rmm.getLocalMaximumOutstanding());
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(0, Rmm.getLocalMaximumOutstanding());
+
+    // if we allocate 2KB and then 256B we start seeing a positive local maximum
+    try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
+         DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
+      assertEquals(256, Rmm.getLocalMaximumOutstanding());
+    }
+  }
+
   @ParameterizedTest
   @ValueSource(ints = {
       RmmAllocationMode.CUDA_DEFAULT,

From 155f227dc28de186bef6cea0afe976395e1be6b0 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 17:38:41 -0500
Subject: [PATCH 08/14] Address review comments

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 14 ++--
 java/src/main/native/src/RmmJni.cpp           | 66 +++++++++++--------
 .../src/test/java/ai/rapids/cudf/RmmTest.java | 48 +++++++-------
 3 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index bf4fad7ec57..7d038685d71 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -143,7 +143,7 @@ public static boolean isInitialized() throws RmmException {
    * Note that this result is meaningful when a single thread is using the GPU, or
    * when we have joined all threads and CUDA synchronized with all streams.
    */
-  public static native long getMaximumOutstanding();
+  public static native long getMaximumTotalBytesAllocated();
 
   /**
    * Resets a local maximum counter of RMM memory used to keep track of usage between
@@ -151,8 +151,8 @@ public static boolean isInitialized() throws RmmException {
    *
    * @param initialValue an initial value (in Bytes) to use for this local counter
    */
-  public static void resetLocalMaximumOutstanding(long initialValue) {
-    resetLocalMaximumOutstandingInternal(initialValue);
+  public static void resetLocalMaximumBytesAllocated(long initialValue) {
+    resetLocalMaximumBytesAllocatedInternal(initialValue);
   }
 
   /**
@@ -161,11 +161,11 @@ public static void resetLocalMaximumOutstanding(long initialValue) {
    *
    * This resets the counter to 0 Bytes.
    */
-  public static void resetLocalMaximumOutstanding() {
-    resetLocalMaximumOutstandingInternal(0L);
+  public static void resetLocalMaximumBytesAllocated() {
+    resetLocalMaximumBytesAllocatedInternal(0L);
   }
 
-  public static native void resetLocalMaximumOutstandingInternal(long initialValue);
+  private static native void resetLocalMaximumBytesAllocatedInternal(long initialValue);
 
   /**
    * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
@@ -180,7 +180,7 @@ public static void resetLocalMaximumOutstanding() {
    *
    * @return the local maximum in Bytes
    */
-  public static native long getLocalMaximumOutstanding();
+  public static native long getLocalMaximumBytesAllocated();
 
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index d115e7cd921..9950f139454 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -19,6 +19,7 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <mutex>
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
@@ -51,11 +52,11 @@ class base_tracking_resource_adaptor : public device_memory_resource {
 public:
   virtual std::size_t get_total_allocated() = 0;
 
-  virtual std::size_t get_max_outstanding() = 0;
+  virtual std::size_t get_max_total_allocated() = 0;
 
-  virtual void reset_local_max_outstanding(std::size_t initial_value) = 0;
+  virtual void reset_local_max_total_allocated(std::size_t initial_value) = 0;
 
-  virtual std::size_t get_local_max_outstanding() = 0;
+  virtual std::size_t get_local_max_total_allocated() = 0;
 };
 
 /**
@@ -85,23 +86,33 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
-  std::size_t get_max_outstanding() override { return max_outstanding; }
+  std::size_t get_max_total_allocated() override { return max_outstanding; }
 
-  void reset_local_max_outstanding(std::size_t initial_value) override {
+  void reset_local_max_total_allocated(std::size_t initial_value) override {
+    local_allocated = 0;
     local_max_outstanding = initial_value;
-    // keep track of where we currently are when the reset call is issued
-    local_allocated = total_allocated;
   }
 
-  std::size_t get_local_max_outstanding() override { return local_max_outstanding; }
+  std::size_t get_local_max_total_allocated() override { return local_max_outstanding; }
 
 private:
   Upstream *const resource;
   std::size_t const size_align;
+  // sum of what is currently outstanding
   std::atomic_size_t total_allocated{0};
+
+  // the maximum outstanding for the lifetime of this class
   std::size_t max_outstanding{0};
-  std::size_t local_allocated{0};
-  std::size_t local_max_outstanding{0};
+
+  // the local sum of what is currently outstanding from the last
+  // `reset_local_max_outstanding` call. This can be negative.
+  std::atomic_long local_allocated{0};
+
+  // the maximum maximum outstanding relative to the last
+  // `reset_local_max_outstanding` call.
+  long local_max_outstanding{0};
+
+  std::mutex max_outstanding_mutex;
 
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
@@ -110,15 +121,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
+      local_allocated += num_bytes;
 
-      // Note: none of the below is thread safe. It is only meaningful when
-      // a single thread is used.
+      std::scoped_lock lock(max_outstanding_mutex);
       max_outstanding = std::max(total_allocated.load(), max_outstanding);
-
-      // `total_allocated - local_allocated` can be negative in the case where we free
-      // after we call `reset_local_max_outstanding`
-      std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
-      local_max_outstanding = std::max(local_diff, local_max_outstanding);
+      local_max_outstanding = std::max(local_allocated.load(), local_max_outstanding);
     }
     return result;
   }
@@ -130,6 +137,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
+      local_allocated -= size;
     }
   }
 
@@ -160,22 +168,22 @@ std::size_t get_total_bytes_allocated() {
   return 0;
 }
 
-std::size_t get_max_outstanding() {
+std::size_t get_max_total_allocated() {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->get_max_outstanding();
+    return Tracking_memory_resource->get_max_total_allocated();
   }
   return 0;
 }
 
-void reset_local_max_outstanding(std::size_t initial_value) {
+void reset_local_max_total_allocated(std::size_t initial_value) {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->reset_local_max_outstanding(initial_value);
+    return Tracking_memory_resource->reset_local_max_total_allocated(initial_value);
   }
 }
 
-std::size_t get_local_max_outstanding() {
+std::size_t get_local_max_total_allocated() {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->get_local_max_outstanding();
+    return Tracking_memory_resource->get_local_max_total_allocated();
   }
   return 0;
 }
@@ -503,17 +511,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
   return get_total_bytes_allocated();
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) {
-  return get_max_outstanding();
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JNIEnv *env, jclass) {
+  return get_max_total_allocated();
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal(
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumBytesAllocatedInternal(
     JNIEnv *env, jclass, long initialValue) {
-  reset_local_max_outstanding(initialValue);
+  reset_local_max_total_allocated(initialValue);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) {
-  return get_local_max_outstanding();
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumBytesAllocated(JNIEnv *env, jclass) {
+  return get_local_max_total_allocated();
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index 4af50759ac3..b12ac675a55 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -72,12 +72,12 @@ public void testTotalAllocated(int rmmAllocMode) {
       RmmAllocationMode.ARENA})
   public void testMaxOutstanding(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
-    assertEquals(0, Rmm.getMaximumOutstanding());
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
     try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
-      assertEquals(1024, Rmm.getMaximumOutstanding());
+      assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
     }
     assertEquals(0, Rmm.getTotalBytesAllocated());
-    assertEquals(1024, Rmm.getMaximumOutstanding());
+    assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
   }
 
   @ParameterizedTest
@@ -87,36 +87,36 @@ public void testMaxOutstanding(int rmmAllocMode) {
       RmmAllocationMode.ARENA})
   public void testLocalMaxOutstanding(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
-    assertEquals(0, Rmm.getMaximumOutstanding());
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
     try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
          DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
-      assertEquals(2048, Rmm.getLocalMaximumOutstanding());
+      assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
     }
     assertEquals(0, Rmm.getTotalBytesAllocated());
-    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
+    assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
 
-    Rmm.resetLocalMaximumOutstanding(0);
-    assertEquals(0, Rmm.getLocalMaximumOutstanding());
-    assertEquals(2048, Rmm.getMaximumOutstanding());
+    Rmm.resetLocalMaximumBytesAllocated();
+    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
 
     DeviceMemoryBuffer ignored = Rmm.alloc(1024);
     ignored.close();
-    assertEquals(1024, Rmm.getLocalMaximumOutstanding());
-    assertEquals(2048, Rmm.getMaximumOutstanding());
+    assertEquals(1024, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
     assertEquals(0, Rmm.getTotalBytesAllocated());
 
     // a non-zero value is the new minimum
     DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
     ignored2.close();
-    Rmm.resetLocalMaximumOutstanding(10000);
-    assertEquals(10000, Rmm.getLocalMaximumOutstanding());
-    assertEquals(2048, Rmm.getMaximumOutstanding());
+    Rmm.resetLocalMaximumBytesAllocated(10000);
+    assertEquals(10000, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
 
     try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
-      Rmm.resetLocalMaximumOutstanding(1024);
+      Rmm.resetLocalMaximumBytesAllocated(1024);
       try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
-        assertEquals(20480, Rmm.getLocalMaximumOutstanding());
-        assertEquals(21504, Rmm.getMaximumOutstanding());
+        assertEquals(20480, Rmm.getLocalMaximumBytesAllocated());
+        assertEquals(21504, Rmm.getMaximumTotalBytesAllocated());
       }
     }
   }
@@ -128,23 +128,23 @@ public void testLocalMaxOutstanding(int rmmAllocMode) {
       RmmAllocationMode.ARENA})
   public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
-    assertEquals(0, Rmm.getMaximumOutstanding());
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
     try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
          DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
-      assertEquals(2048, Rmm.getLocalMaximumOutstanding());
-      Rmm.resetLocalMaximumOutstanding();
-      assertEquals(0, Rmm.getLocalMaximumOutstanding());
+      assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
+      Rmm.resetLocalMaximumBytesAllocated();
+      assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
     }
     // because we allocated a net -2048 Bytes since reset
-    assertEquals(0, Rmm.getLocalMaximumOutstanding());
+    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
     DeviceMemoryBuffer ignored = Rmm.alloc(1024);
     ignored.close();
-    assertEquals(0, Rmm.getLocalMaximumOutstanding());
+    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
 
     // if we allocate 2KB and then 256B we start seeing a positive local maximum
     try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
          DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
-      assertEquals(256, Rmm.getLocalMaximumOutstanding());
+      assertEquals(256, Rmm.getLocalMaximumBytesAllocated());
     }
   }
 

From 4d2a602a91228695f979eebbc18347590d30819f Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 17:41:29 -0500
Subject: [PATCH 09/14] Fix javadoc

---
 java/src/main/java/ai/rapids/cudf/Rmm.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 7d038685d71..651f2e135e7 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -147,7 +147,7 @@ public static boolean isInitialized() throws RmmException {
 
   /**
    * Resets a local maximum counter of RMM memory used to keep track of usage between
-   * sections code while debugging.
+   * code sections while debugging.
    *
    * @param initialValue an initial value (in Bytes) to use for this local counter
    */
@@ -157,7 +157,7 @@ public static void resetLocalMaximumBytesAllocated(long initialValue) {
 
   /**
    * Resets a local maximum counter of RMM memory used to keep track of usage between
-   * sections code while debugging.
+   * code sections while debugging.
    *
    * This resets the counter to 0 Bytes.
    */

From 9c92699ff6856a263d357a46ce3386664fb886e5 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 20 Oct 2022 17:48:44 -0500
Subject: [PATCH 10/14] Made the RmmJni code more consistent with the Java
 calling code

---
 java/src/main/native/src/RmmJni.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 9950f139454..663c248717f 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -86,33 +86,33 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
-  std::size_t get_max_total_allocated() override { return max_outstanding; }
+  std::size_t get_max_total_allocated() override { return max_total_allocated; }
 
   void reset_local_max_total_allocated(std::size_t initial_value) override {
     local_allocated = 0;
-    local_max_outstanding = initial_value;
+    local_max_total_allocated = initial_value;
   }
 
-  std::size_t get_local_max_total_allocated() override { return local_max_outstanding; }
+  std::size_t get_local_max_total_allocated() override { return local_max_total_allocated; }
 
 private:
   Upstream *const resource;
   std::size_t const size_align;
-  // sum of what is currently outstanding
+  // sum of what is currently allocated
   std::atomic_size_t total_allocated{0};
 
-  // the maximum outstanding for the lifetime of this class
-  std::size_t max_outstanding{0};
+  // the maximum total allocated for the lifetime of this class
+  std::size_t max_total_allocated{0};
 
   // the local sum of what is currently outstanding from the last
-  // `reset_local_max_outstanding` call. This can be negative.
+  // `reset_local_max_total_allocated` call. This can be negative.
   std::atomic_long local_allocated{0};
 
-  // the maximum maximum outstanding relative to the last
-  // `reset_local_max_outstanding` call.
-  long local_max_outstanding{0};
+  // the maximum total allocated relative to the last
+  // `reset_local_max_total_allocated` call.
+  long local_max_total_allocated{0};
 
-  std::mutex max_outstanding_mutex;
+  std::mutex max_total_allocated_mutex;
 
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
@@ -123,9 +123,9 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
       total_allocated += num_bytes;
       local_allocated += num_bytes;
 
-      std::scoped_lock lock(max_outstanding_mutex);
-      max_outstanding = std::max(total_allocated.load(), max_outstanding);
-      local_max_outstanding = std::max(local_allocated.load(), local_max_outstanding);
+      std::scoped_lock lock(max_total_allocated_mutex);
+      max_total_allocated = std::max(total_allocated.load(), max_total_allocated);
+      local_max_total_allocated = std::max(local_allocated.load(), local_max_total_allocated);
     }
     return result;
   }

From d8eae3bf98b6b2170c68267d943e31e45bde1aaf Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Fri, 21 Oct 2022 10:02:14 -0500
Subject: [PATCH 11/14] local->scoped

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 25 ++++++-----
 java/src/main/native/src/RmmJni.cpp           | 44 +++++++++----------
 .../src/test/java/ai/rapids/cudf/RmmTest.java | 34 +++++++-------
 3 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 651f2e135e7..355debd8747 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -146,30 +146,30 @@ public static boolean isInitialized() throws RmmException {
   public static native long getMaximumTotalBytesAllocated();
 
   /**
-   * Resets a local maximum counter of RMM memory used to keep track of usage between
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
    * code sections while debugging.
    *
-   * @param initialValue an initial value (in Bytes) to use for this local counter
+   * @param initialValue an initial value (in Bytes) to use for this scoped counter
    */
-  public static void resetLocalMaximumBytesAllocated(long initialValue) {
-    resetLocalMaximumBytesAllocatedInternal(initialValue);
+  public static void resetScopedMaximumBytesAllocated(long initialValue) {
+    resetScopedMaximumBytesAllocatedInternal(initialValue);
   }
 
   /**
-   * Resets a local maximum counter of RMM memory used to keep track of usage between
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
    * code sections while debugging.
    *
    * This resets the counter to 0 Bytes.
    */
-  public static void resetLocalMaximumBytesAllocated() {
-    resetLocalMaximumBytesAllocatedInternal(0L);
+  public static void resetScopedMaximumBytesAllocated() {
+    resetScopedMaximumBytesAllocatedInternal(0L);
   }
 
-  private static native void resetLocalMaximumBytesAllocatedInternal(long initialValue);
+  private static native void resetScopedMaximumBytesAllocatedInternal(long initialValue);
 
   /**
    * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
-   * `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the
+   * `resetScopedMaximumOutstanding` call was issued (it is "scoped" because it's the
    * maximum amount seen between reset and get calls).
    *
    * Note that this result is meaningful when a single thread is using the GPU, or
@@ -178,9 +178,12 @@ public static void resetLocalMaximumBytesAllocated() {
    * If the memory used is net negative (for example if only frees happened since
    * reset, and we reset to 0), then result will be 0 until we reset
    *
-   * @return the local maximum in Bytes
+   * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
+   * program and it should be equivalent to `getMaximumTotalBytesAllocated`
+   *
+   * @return the scoped maximum bytes allocated
    */
-  public static native long getLocalMaximumBytesAllocated();
+  public static native long getScopedMaximumBytesAllocated();
 
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 663c248717f..5914f9c9d86 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -54,9 +54,9 @@ class base_tracking_resource_adaptor : public device_memory_resource {
 
   virtual std::size_t get_max_total_allocated() = 0;
 
-  virtual void reset_local_max_total_allocated(std::size_t initial_value) = 0;
+  virtual void reset_scoped_max_total_allocated(std::size_t initial_value) = 0;
 
-  virtual std::size_t get_local_max_total_allocated() = 0;
+  virtual std::size_t get_scoped_max_total_allocated() = 0;
 };
 
 /**
@@ -88,12 +88,12 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_max_total_allocated() override { return max_total_allocated; }
 
-  void reset_local_max_total_allocated(std::size_t initial_value) override {
-    local_allocated = 0;
-    local_max_total_allocated = initial_value;
+  void reset_scoped_max_total_allocated(std::size_t initial_value) override {
+    scoped_allocated = 0;
+    scoped_max_total_allocated = initial_value;
   }
 
-  std::size_t get_local_max_total_allocated() override { return local_max_total_allocated; }
+  std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; }
 
 private:
   Upstream *const resource;
@@ -104,13 +104,13 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
   // the maximum total allocated for the lifetime of this class
   std::size_t max_total_allocated{0};
 
-  // the local sum of what is currently outstanding from the last
-  // `reset_local_max_total_allocated` call. This can be negative.
-  std::atomic_long local_allocated{0};
+  // the sum of what is currently outstanding from the last
+  // `reset_scoped_max_total_allocated` call. This can be negative.
+  std::atomic_long scoped_allocated{0};
 
   // the maximum total allocated relative to the last
-  // `reset_local_max_total_allocated` call.
-  long local_max_total_allocated{0};
+  // `reset_scoped_max_total_allocated` call.
+  long scoped_max_total_allocated{0};
 
   std::mutex max_total_allocated_mutex;
 
@@ -121,11 +121,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
-      local_allocated += num_bytes;
+      scoped_allocated += num_bytes;
 
       std::scoped_lock lock(max_total_allocated_mutex);
       max_total_allocated = std::max(total_allocated.load(), max_total_allocated);
-      local_max_total_allocated = std::max(local_allocated.load(), local_max_total_allocated);
+      scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated);
     }
     return result;
   }
@@ -137,7 +137,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
-      local_allocated -= size;
+      scoped_allocated -= size;
     }
   }
 
@@ -175,15 +175,15 @@ std::size_t get_max_total_allocated() {
   return 0;
 }
 
-void reset_local_max_total_allocated(std::size_t initial_value) {
+void reset_scoped_max_total_allocated(std::size_t initial_value) {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->reset_local_max_total_allocated(initial_value);
+    return Tracking_memory_resource->reset_scoped_max_total_allocated(initial_value);
   }
 }
 
-std::size_t get_local_max_total_allocated() {
+std::size_t get_scoped_max_total_allocated() {
   if (Tracking_memory_resource) {
-    return Tracking_memory_resource->get_local_max_total_allocated();
+    return Tracking_memory_resource->get_scoped_max_total_allocated();
   }
   return 0;
 }
@@ -515,13 +515,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JN
   return get_max_total_allocated();
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumBytesAllocatedInternal(
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal(
     JNIEnv *env, jclass, long initialValue) {
-  reset_local_max_total_allocated(initialValue);
+  reset_scoped_max_total_allocated(initialValue);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumBytesAllocated(JNIEnv *env, jclass) {
-  return get_local_max_total_allocated();
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env, jclass) {
+  return get_scoped_max_total_allocated();
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index b12ac675a55..18ff5f4081e 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -85,37 +85,37 @@ public void testMaxOutstanding(int rmmAllocMode) {
       RmmAllocationMode.CUDA_DEFAULT,
       RmmAllocationMode.POOL,
       RmmAllocationMode.ARENA})
-  public void testLocalMaxOutstanding(int rmmAllocMode) {
+  public void testScopedMaxOutstanding(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
     assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
     try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
          DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
-      assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
     }
     assertEquals(0, Rmm.getTotalBytesAllocated());
-    assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
 
-    Rmm.resetLocalMaximumBytesAllocated();
-    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
+    Rmm.resetScopedMaximumBytesAllocated();
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
     assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
 
     DeviceMemoryBuffer ignored = Rmm.alloc(1024);
     ignored.close();
-    assertEquals(1024, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(1024, Rmm.getScopedMaximumBytesAllocated());
     assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
     assertEquals(0, Rmm.getTotalBytesAllocated());
 
     // a non-zero value is the new minimum
     DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
     ignored2.close();
-    Rmm.resetLocalMaximumBytesAllocated(10000);
-    assertEquals(10000, Rmm.getLocalMaximumBytesAllocated());
+    Rmm.resetScopedMaximumBytesAllocated(10000);
+    assertEquals(10000, Rmm.getScopedMaximumBytesAllocated());
     assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
 
     try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
-      Rmm.resetLocalMaximumBytesAllocated(1024);
+      Rmm.resetScopedMaximumBytesAllocated(1024);
       try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
-        assertEquals(20480, Rmm.getLocalMaximumBytesAllocated());
+        assertEquals(20480, Rmm.getScopedMaximumBytesAllocated());
         assertEquals(21504, Rmm.getMaximumTotalBytesAllocated());
       }
     }
@@ -126,25 +126,25 @@ public void testLocalMaxOutstanding(int rmmAllocMode) {
       RmmAllocationMode.CUDA_DEFAULT,
       RmmAllocationMode.POOL,
       RmmAllocationMode.ARENA})
-  public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
+  public void testScopedMaxOutstandingNegative(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
     assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
     try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
          DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
-      assertEquals(2048, Rmm.getLocalMaximumBytesAllocated());
-      Rmm.resetLocalMaximumBytesAllocated();
-      assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+      Rmm.resetScopedMaximumBytesAllocated();
+      assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
     }
     // because we allocated a net -2048 Bytes since reset
-    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
     DeviceMemoryBuffer ignored = Rmm.alloc(1024);
     ignored.close();
-    assertEquals(0, Rmm.getLocalMaximumBytesAllocated());
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
 
     // if we allocate 2KB and then 256B we start seeing a positive local maximum
     try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
          DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
-      assertEquals(256, Rmm.getLocalMaximumBytesAllocated());
+      assertEquals(256, Rmm.getScopedMaximumBytesAllocated());
     }
   }
 

From c2c6c8ec5ae579e0d2a614aed0ff880297aaf0f4 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Fri, 21 Oct 2022 10:37:07 -0500
Subject: [PATCH 12/14] clang-style fixes

---
 java/src/main/native/src/RmmJni.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 5914f9c9d86..331496b78de 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -520,7 +520,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedI
   reset_scoped_max_total_allocated(initialValue);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env, jclass) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env,
+                                                                               jclass) {
   return get_scoped_max_total_allocated();
 }
 

From 22fa6e6d6a6b5d13ddb9da32ebf78560357e5179 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Fri, 21 Oct 2022 14:25:29 -0500
Subject: [PATCH 13/14] Apply code review comments

---
 java/src/main/java/ai/rapids/cudf/Rmm.java | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 355debd8747..0b825937815 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -139,9 +139,6 @@ public static boolean isInitialized() throws RmmException {
   /**
    * Returns the maximum amount of RMM memory (Bytes) outstanding during the
    * lifetime of the process.
-   *
-   * Note that this result is meaningful when a single thread is using the GPU, or
-   * when we have joined all threads and CUDA synchronized with all streams.
    */
   public static native long getMaximumTotalBytesAllocated();
 
@@ -170,16 +167,13 @@ public static void resetScopedMaximumBytesAllocated() {
   /**
    * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
    * `resetScopedMaximumOutstanding` call was issued (it is "scoped" because it's the
-   * maximum amount seen between reset and get calls).
-   *
-   * Note that this result is meaningful when a single thread is using the GPU, or
-   * when we have joined all threads and CUDA synchronized with all streams.
+   * maximum amount seen since the last reset).
    *
    * If the memory used is net negative (for example if only frees happened since
-   * reset, and we reset to 0), then result will be 0 until we reset
+   * reset, and we reset to 0), then result will be 0.
    *
    * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
-   * program and it should be equivalent to `getMaximumTotalBytesAllocated`
+   * program and is equivalent to `getMaximumTotalBytesAllocated`.
    *
    * @return the scoped maximum bytes allocated
    */

From bec98181dd2b6592c0acf9aa688deba89ae4d7f5 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Mon, 24 Oct 2022 09:49:05 -0500
Subject: [PATCH 14/14] Lock while resetting max scoped usage

---
 java/src/main/native/src/RmmJni.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 331496b78de..529345b6bd8 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -89,6 +89,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
   std::size_t get_max_total_allocated() override { return max_total_allocated; }
 
   void reset_scoped_max_total_allocated(std::size_t initial_value) override {
+    std::scoped_lock lock(max_total_allocated_mutex);
     scoped_allocated = 0;
     scoped_max_total_allocated = initial_value;
   }