From 4f9a18ec75c65bc3a990f9a37554288f32eee840 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sat, 15 Oct 2022 16:25:55 -0500 Subject: [PATCH 01/14] Add gpu memory tracking api in JNI to track maximum memory usage --- .../java/ai/rapids/cudf/GpuMemoryTracker.java | 86 +++++++++++++++++++ java/src/main/java/ai/rapids/cudf/Rmm.java | 2 + java/src/main/native/src/RmmJni.cpp | 73 ++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java diff --git a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java new file mode 100644 index 00000000000..57b37be2238 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +import java.util.Optional; + +/** + * This is a helper class to track the maximum amount of GPU memory outstanding + * for the current thread (stream in PTDS). If free ocurrs while tracking, and the + * free is for memory that wasn't created in the scope, or it was created in a different + * thread, it will be ignored. + * + * The constructor enables a new memory tracking scope and .close stops tracking, and collects + * the result. + * + * If `ai.rapids.cudf.gpuMemoryTracking.enabled` is false (default), the result of + * `getMaxOutstanding` is an empty java Optional. + * + * Usage: + * + *
+ *   try (GpuMemoryTracker a = new GpuMemoryTracker()) {
+ *     ...
+ *     try (GpuMemoryTracker b = new GpuMemoryTracker()) {
+ *       ...
+ *       // bMaxMemory is the maximum memory used while b is not closed
+ *       Optional bMaxMemory = b.getMaxOutsanding();
+ *     }
+ *     ...
+ *
+ *     // aMaxMemory is the maximum memory used while a is not closed
+ *     // which includes bMaxMemory.
+ *     Optional aMaxMemory = a.getMaxOutsanding();
+ *   }
+ * 
+ * + * Instances should be associated with a single thread and should be at a fine + * granularity. Tracking memory when there could be free of buffers created in different + * streams will have undeserired results. + */ +public class GpuMemoryTracker implements AutoCloseable { + private static final boolean isEnabled = + Boolean.getBoolean("ai.rapids.cudf.gpuMemoryTracking.enabled"); + + private long maxOutstanding; + + static { + if (isEnabled) { + NativeDepsLoader.loadNativeDeps(); + } + } + + public GpuMemoryTracker() { + if (isEnabled) { + Rmm.pushThreadMemoryTracker(); + } + } + + @Override + public void close() { + if (isEnabled) { + maxOutstanding = Rmm.popThreadMemoryTracker(); + } + } + + public Optional getMaxOutstanding() { + if (isEnabled) { + return Optional.of(maxOutstanding); + } else { + return Optional.empty(); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 730f82f0047..6273de334a9 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -135,6 +135,8 @@ public static boolean isInitialized() throws RmmException { * the result will always be a lower bound on the amount allocated. */ public static native long getTotalBytesAllocated(); + public static native void pushThreadMemoryTracker(); + public static native long popThreadMemoryTracker(); /** * Sets the event handler to be called on RMM events (e.g.: allocation failure). diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 2b4c5ae59f5..d98ab7495eb 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,18 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException"; class base_tracking_resource_adaptor : public device_memory_resource { public: virtual std::size_t get_total_allocated() = 0; + virtual void push_thread_memory_tracker() = 0; + virtual long pop_thread_memory_tracker() = 0; }; +struct memory_tracker { + long current_outstanding; + long max_outstanding; +}; + +thread_local std::stack memory_tracker_stack = std::stack(); +thread_local std::unordered_map alloc_map; + /** * @brief An RMM device memory resource that delegates to another resource * while tracking the amount of memory allocated. @@ -79,11 +90,50 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } + void push_thread_memory_tracker() override { + memory_tracker_stack.emplace(); + } + + long pop_thread_memory_tracker() override { + auto top_tracker = memory_tracker_stack.top(); + auto ret = top_tracker.max_outstanding; + memory_tracker_stack.pop(); + if (memory_tracker_stack.empty()) { + alloc_map.clear(); + } else { + // carry the max to the next level + memory_tracker_stack.top().max_outstanding += ret; + } + return ret; + } + private: + Upstream *const resource; std::size_t const size_align; std::atomic_size_t total_allocated{0}; + void thread_allocated(long addr, std::size_t num_bytes) { + if (!memory_tracker_stack.empty()) { + alloc_map[addr] = num_bytes; + memory_tracker& tracker = memory_tracker_stack.top(); + tracker.current_outstanding += num_bytes; + tracker.max_outstanding = + std::max(tracker.current_outstanding, tracker.max_outstanding); + } + } + + void thread_freed(long addr, std::size_t num_bytes) { + if (!memory_tracker_stack.empty()) { + auto it = alloc_map.find(addr); + if (it != alloc_map.end()) { + auto tracker = memory_tracker_stack.top(); + tracker.current_outstanding -= it->second; + alloc_map.erase(it); + } + } + } + void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { // adjust size of allocation based on specified size alignment num_bytes = (num_bytes + size_align - 1) / size_align * size_align; @@ -91,6 +141,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { auto result = resource->allocate(num_bytes, stream); if (result) { total_allocated += num_bytes; + thread_allocated(reinterpret_cast(result), num_bytes); } return result; } @@ -102,6 +153,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (p) { total_allocated -= size; + thread_freed(reinterpret_cast(p), size); } } @@ -132,6 +184,19 @@ std::size_t get_total_bytes_allocated() { return 0; } +void push_thread_memory_tracker() { + if (Tracking_memory_resource) { + Tracking_memory_resource->push_thread_memory_tracker(); + } +} + +long pop_thread_memory_tracker() { + if (Tracking_memory_resource) { + return Tracking_memory_resource->pop_thread_memory_tracker(); + } + return 0; +} + /** * @brief An RMM device memory resource adaptor that delegates to the wrapped resource * for most operations but will call Java to handle certain situations (e.g.: allocation failure). @@ -455,6 +520,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e return get_total_bytes_allocated(); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_pushThreadMemoryTracker(JNIEnv *env, jclass) { + push_thread_memory_tracker(); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_popThreadMemoryTracker(JNIEnv *env, jclass) { + return pop_thread_memory_tracker(); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size, jlong stream) { try { From 543080e4731e3b7d8dc5ec2dca7e008fda77939d Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 10:07:45 -0500 Subject: [PATCH 02/14] clang-format changes --- java/src/main/native/src/RmmJni.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d98ab7495eb..b8231ce63b3 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -90,11 +90,9 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } - void push_thread_memory_tracker() override { - memory_tracker_stack.emplace(); - } + void push_thread_memory_tracker() override { memory_tracker_stack.emplace(); } - long pop_thread_memory_tracker() override { + long pop_thread_memory_tracker() override { auto top_tracker = memory_tracker_stack.top(); auto ret = top_tracker.max_outstanding; memory_tracker_stack.pop(); @@ -108,18 +106,16 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { } private: - Upstream *const resource; std::size_t const size_align; std::atomic_size_t total_allocated{0}; void thread_allocated(long addr, std::size_t num_bytes) { if (!memory_tracker_stack.empty()) { - alloc_map[addr] = num_bytes; - memory_tracker& tracker = memory_tracker_stack.top(); + alloc_map[addr] = num_bytes; + memory_tracker &tracker = memory_tracker_stack.top(); tracker.current_outstanding += num_bytes; - tracker.max_outstanding = - std::max(tracker.current_outstanding, tracker.max_outstanding); + tracker.max_outstanding = std::max(tracker.current_outstanding, tracker.max_outstanding); } } From 7dfac3321d29c75cc436b08dd65f4721c4dea29e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 13:26:23 -0500 Subject: [PATCH 03/14] Adds global/local high memory usage watermark functions to JNI --- .../java/ai/rapids/cudf/GpuMemoryTracker.java | 86 ------------------ java/src/main/java/ai/rapids/cudf/Rmm.java | 50 +++++++++- java/src/main/native/src/RmmJni.cpp | 91 ++++++++----------- 3 files changed, 88 insertions(+), 139 deletions(-) delete mode 100644 java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java diff --git a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java b/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java deleted file mode 100644 index 57b37be2238..00000000000 --- a/java/src/main/java/ai/rapids/cudf/GpuMemoryTracker.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ai.rapids.cudf; - -import java.util.Optional; - -/** - * This is a helper class to track the maximum amount of GPU memory outstanding - * for the current thread (stream in PTDS). If free ocurrs while tracking, and the - * free is for memory that wasn't created in the scope, or it was created in a different - * thread, it will be ignored. - * - * The constructor enables a new memory tracking scope and .close stops tracking, and collects - * the result. - * - * If `ai.rapids.cudf.gpuMemoryTracking.enabled` is false (default), the result of - * `getMaxOutstanding` is an empty java Optional. - * - * Usage: - * - *
- *   try (GpuMemoryTracker a = new GpuMemoryTracker()) {
- *     ...
- *     try (GpuMemoryTracker b = new GpuMemoryTracker()) {
- *       ...
- *       // bMaxMemory is the maximum memory used while b is not closed
- *       Optional bMaxMemory = b.getMaxOutsanding();
- *     }
- *     ...
- *
- *     // aMaxMemory is the maximum memory used while a is not closed
- *     // which includes bMaxMemory.
- *     Optional aMaxMemory = a.getMaxOutsanding();
- *   }
- * 
- * - * Instances should be associated with a single thread and should be at a fine - * granularity. Tracking memory when there could be free of buffers created in different - * streams will have undeserired results. - */ -public class GpuMemoryTracker implements AutoCloseable { - private static final boolean isEnabled = - Boolean.getBoolean("ai.rapids.cudf.gpuMemoryTracking.enabled"); - - private long maxOutstanding; - - static { - if (isEnabled) { - NativeDepsLoader.loadNativeDeps(); - } - } - - public GpuMemoryTracker() { - if (isEnabled) { - Rmm.pushThreadMemoryTracker(); - } - } - - @Override - public void close() { - if (isEnabled) { - maxOutstanding = Rmm.popThreadMemoryTracker(); - } - } - - public Optional getMaxOutstanding() { - if (isEnabled) { - return Optional.of(maxOutstanding); - } else { - return Optional.empty(); - } - } -} diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 6273de334a9..e64e1150da4 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -135,8 +135,54 @@ public static boolean isInitialized() throws RmmException { * the result will always be a lower bound on the amount allocated. */ public static native long getTotalBytesAllocated(); - public static native void pushThreadMemoryTracker(); - public static native long popThreadMemoryTracker(); + + /** + * Returns the maximum amount of RMM memory (Bytes) outstanding during the + * lifetime of the process. + * + * Note that this result is meaningful when a single thread is using the GPU, or + * when we have joined all threads and CUDA synchronized with all streams. + */ + public static native long getMaximumOutstanding(); + + /** + * Resets a local maximum counter of RMM memory used to keep track of usage between + * sections code while debugging. + * + * Note that this result is meaningful when a single thread is using the GPU, or + * when we have joined all threads and CUDA synchronized with all streams. + * + * @param initialValue an initial value (in Bytes) to use for this local counter + */ + public static void resetLocalMaximumOutstanding(long initialValue) { + resetLocalMaximumOutstanding(initialValue); + } + + /** + * Resets a local maximum counter of RMM memory used to keep track of usage between + * sections code while debugging. + * + * Note that this result is meaningful when a single thread is using the GPU, or + * when we have joined all threads and CUDA synchronized with all streams. + * + * This resets the counter to 0 Bytes. + */ + public static void resetLocalMaximumOutstanding() { + resetLocalMaximumOutstandingInternal(0L); + } + + public static native void resetLocalMaximumOutstandingNative(long initialValue); + + /** + * Returns the maximum amount of RMM memory (Bytes) outstanding since the last + * `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the + * maximum amount seen between reset and get calls). + * + * Note that this result is meaningful when a single thread is using the GPU, or + * when we have joined all threads and CUDA synchronized with all streams. + * @return + */ + public static native long getLocalMaximumOutstanding(); /** * Sets the event handler to be called on RMM events (e.g.: allocation failure). diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index b8231ce63b3..d8196b64ddf 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -51,17 +51,13 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException"; class base_tracking_resource_adaptor : public device_memory_resource { public: virtual std::size_t get_total_allocated() = 0; - virtual void push_thread_memory_tracker() = 0; - virtual long pop_thread_memory_tracker() = 0; -}; -struct memory_tracker { - long current_outstanding; - long max_outstanding; -}; + virtual std::size_t get_max_outstanding() = 0; + + virtual void reset_local_max_outstanding(std::size_t initial_value) = 0; -thread_local std::stack memory_tracker_stack = std::stack(); -thread_local std::unordered_map alloc_map; + virtual std::size_t get_local_max_outstanding() = 0; +}; /** * @brief An RMM device memory resource that delegates to another resource @@ -90,45 +86,22 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } - void push_thread_memory_tracker() override { memory_tracker_stack.emplace(); } + std::size_t get_max_outstanding() override { return max_outstanding.load(); } - long pop_thread_memory_tracker() override { - auto top_tracker = memory_tracker_stack.top(); - auto ret = top_tracker.max_outstanding; - memory_tracker_stack.pop(); - if (memory_tracker_stack.empty()) { - alloc_map.clear(); - } else { - // carry the max to the next level - memory_tracker_stack.top().max_outstanding += ret; - } - return ret; + void reset_local_max_outstanding(std::size_t initial_value) override { + local_max_outstanding = initial_value; + local_allocated = initial_value; } + std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); } + private: Upstream *const resource; std::size_t const size_align; std::atomic_size_t total_allocated{0}; - - void thread_allocated(long addr, std::size_t num_bytes) { - if (!memory_tracker_stack.empty()) { - alloc_map[addr] = num_bytes; - memory_tracker &tracker = memory_tracker_stack.top(); - tracker.current_outstanding += num_bytes; - tracker.max_outstanding = std::max(tracker.current_outstanding, tracker.max_outstanding); - } - } - - void thread_freed(long addr, std::size_t num_bytes) { - if (!memory_tracker_stack.empty()) { - auto it = alloc_map.find(addr); - if (it != alloc_map.end()) { - auto tracker = memory_tracker_stack.top(); - tracker.current_outstanding -= it->second; - alloc_map.erase(it); - } - } - } + std::atomic_size_t max_outstanding{0}; + std::atomic_size_t local_allocated{0}; + std::atomic_size_t local_max_outstanding{0}; void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { // adjust size of allocation based on specified size alignment @@ -137,7 +110,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { auto result = resource->allocate(num_bytes, stream); if (result) { total_allocated += num_bytes; - thread_allocated(reinterpret_cast(result), num_bytes); + local_allocated += num_bytes; + + // Note: this is not thread safe. + max_outstanding.store(std::max(total_allocated, max_outstanding)); + local_max_outstanding.store(std::max(local_allocated, local_max_outstanding)); } return result; } @@ -149,7 +126,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (p) { total_allocated -= size; - thread_freed(reinterpret_cast(p), size); + local_allocated -= size; } } @@ -180,15 +157,22 @@ std::size_t get_total_bytes_allocated() { return 0; } -void push_thread_memory_tracker() { +std::size_t get_max_outstanding() { if (Tracking_memory_resource) { - Tracking_memory_resource->push_thread_memory_tracker(); + return Tracking_memory_resource->get_max_outstanding(); } + return 0; } -long pop_thread_memory_tracker() { +void reset_local_max_outstanding(std::size_t initial_value) { if (Tracking_memory_resource) { - return Tracking_memory_resource->pop_thread_memory_tracker(); + return Tracking_memory_resource->reset_local_max_outstanding(initial_value); + } +} + +std::size_t get_local_max_outstanding() { + if (Tracking_memory_resource) { + return Tracking_memory_resource->get_local_max_outstanding(); } return 0; } @@ -516,12 +500,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e return get_total_bytes_allocated(); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_pushThreadMemoryTracker(JNIEnv *env, jclass) { - push_thread_memory_tracker(); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) { + return get_max_outstanding(); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal( + JNIEnv *env, jclass, long initialValue) { + reset_local_max_outstanding(initialValue); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_popThreadMemoryTracker(JNIEnv *env, jclass) { - return pop_thread_memory_tracker(); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) { + return get_local_max_outstanding(); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size, From cf851c318bc704802c8df1ec115b68fdd8a395f6 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 13:28:13 -0500 Subject: [PATCH 04/14] Fix typo --- java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index e64e1150da4..e521078478f 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -171,7 +171,7 @@ public static void resetLocalMaximumOutstanding() { resetLocalMaximumOutstandingInternal(0L); } - public static native void resetLocalMaximumOutstandingNative(long initialValue); + public static native void resetLocalMaximumOutstandingInternal(long initialValue); /** * Returns the maximum amount of RMM memory (Bytes) outstanding since the last From 9171b1c958d64f68e3f4a01b9d6bc169f5595d0b Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 13:29:26 -0500 Subject: [PATCH 05/14] Remove unnecesary include --- java/src/main/native/src/RmmJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d8196b64ddf..2324b579d40 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include From 5aaa9790ee560e071abcf175372d055fa0a673a3 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 13:38:25 -0500 Subject: [PATCH 06/14] Make sure we call the native function --- java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index e521078478f..7064e726882 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -155,7 +155,7 @@ public static boolean isInitialized() throws RmmException { * @param initialValue an initial value (in Bytes) to use for this local counter */ public static void resetLocalMaximumOutstanding(long initialValue) { - resetLocalMaximumOutstanding(initialValue); + resetLocalMaximumOutstandingInternal(initialValue); } /** From d9d4b530f6e63611d1487b26689be86d496a37ee Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 15:42:20 -0500 Subject: [PATCH 07/14] Fix overflow issue --- java/src/main/java/ai/rapids/cudf/Rmm.java | 12 ++- java/src/main/native/src/RmmJni.cpp | 26 +++--- .../src/test/java/ai/rapids/cudf/RmmTest.java | 83 +++++++++++++++++++ 3 files changed, 103 insertions(+), 18 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 7064e726882..bf4fad7ec57 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -149,9 +149,6 @@ public static boolean isInitialized() throws RmmException { * Resets a local maximum counter of RMM memory used to keep track of usage between * sections code while debugging. * - * Note that this result is meaningful when a single thread is using the GPU, or - * when we have joined all threads and CUDA synchronized with all streams. - * * @param initialValue an initial value (in Bytes) to use for this local counter */ public static void resetLocalMaximumOutstanding(long initialValue) { @@ -162,9 +159,6 @@ public static void resetLocalMaximumOutstanding(long initialValue) { * Resets a local maximum counter of RMM memory used to keep track of usage between * sections code while debugging. * - * Note that this result is meaningful when a single thread is using the GPU, or - * when we have joined all threads and CUDA synchronized with all streams. - * * This resets the counter to 0 Bytes. */ public static void resetLocalMaximumOutstanding() { @@ -180,7 +174,11 @@ public static void resetLocalMaximumOutstanding() { * * Note that this result is meaningful when a single thread is using the GPU, or * when we have joined all threads and CUDA synchronized with all streams. - * @return + * + * If the memory used is net negative (for example if only frees happened since + * reset, and we reset to 0), then result will be 0 until we reset + * + * @return the local maximum in Bytes */ public static native long getLocalMaximumOutstanding(); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 2324b579d40..d115e7cd921 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -85,22 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } - std::size_t get_max_outstanding() override { return max_outstanding.load(); } + std::size_t get_max_outstanding() override { return max_outstanding; } void reset_local_max_outstanding(std::size_t initial_value) override { local_max_outstanding = initial_value; - local_allocated = initial_value; + // keep track of where we currently are when the reset call is issued + local_allocated = total_allocated; } - std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); } + std::size_t get_local_max_outstanding() override { return local_max_outstanding; } private: Upstream *const resource; std::size_t const size_align; std::atomic_size_t total_allocated{0}; - std::atomic_size_t max_outstanding{0}; - std::atomic_size_t local_allocated{0}; - std::atomic_size_t local_max_outstanding{0}; + std::size_t max_outstanding{0}; + std::size_t local_allocated{0}; + std::size_t local_max_outstanding{0}; void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { // adjust size of allocation based on specified size alignment @@ -109,11 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { auto result = resource->allocate(num_bytes, stream); if (result) { total_allocated += num_bytes; - local_allocated += num_bytes; - // Note: this is not thread safe. - max_outstanding.store(std::max(total_allocated, max_outstanding)); - local_max_outstanding.store(std::max(local_allocated, local_max_outstanding)); + // Note: none of the below is thread safe. It is only meaningful when + // a single thread is used. + max_outstanding = std::max(total_allocated.load(), max_outstanding); + + // `total_allocated - local_allocated` can be negative in the case where we free + // after we call `reset_local_max_outstanding` + std::size_t local_diff = std::max(static_cast(total_allocated - local_allocated), 0L); + local_max_outstanding = std::max(local_diff, local_max_outstanding); } return result; } @@ -125,7 +130,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (p) { total_allocated -= size; - local_allocated -= size; } } diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index 09fbedd8a1c..4af50759ac3 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) { assertEquals(0, Rmm.getTotalBytesAllocated()); } + @ParameterizedTest + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) + public void testMaxOutstanding(int rmmAllocMode) { + Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); + assertEquals(0, Rmm.getMaximumOutstanding()); + try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) { + assertEquals(1024, Rmm.getMaximumOutstanding()); + } + assertEquals(0, Rmm.getTotalBytesAllocated()); + assertEquals(1024, Rmm.getMaximumOutstanding()); + } + + @ParameterizedTest + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) + public void testLocalMaxOutstanding(int rmmAllocMode) { + Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); + assertEquals(0, Rmm.getMaximumOutstanding()); + try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + assertEquals(2048, Rmm.getLocalMaximumOutstanding()); + } + assertEquals(0, Rmm.getTotalBytesAllocated()); + assertEquals(2048, Rmm.getLocalMaximumOutstanding()); + + Rmm.resetLocalMaximumOutstanding(0); + assertEquals(0, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getMaximumOutstanding()); + + DeviceMemoryBuffer ignored = Rmm.alloc(1024); + ignored.close(); + assertEquals(1024, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getMaximumOutstanding()); + assertEquals(0, Rmm.getTotalBytesAllocated()); + + // a non-zero value is the new minimum + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024); + ignored2.close(); + Rmm.resetLocalMaximumOutstanding(10000); + assertEquals(10000, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getMaximumOutstanding()); + + try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) { + Rmm.resetLocalMaximumOutstanding(1024); + try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) { + assertEquals(20480, Rmm.getLocalMaximumOutstanding()); + assertEquals(21504, Rmm.getMaximumOutstanding()); + } + } + } + + @ParameterizedTest + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) + public void testLocalMaxOutstandingNegative(int rmmAllocMode) { + Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); + assertEquals(0, Rmm.getMaximumOutstanding()); + try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + assertEquals(2048, Rmm.getLocalMaximumOutstanding()); + Rmm.resetLocalMaximumOutstanding(); + assertEquals(0, Rmm.getLocalMaximumOutstanding()); + } + // because we allocated a net -2048 Bytes since reset + assertEquals(0, Rmm.getLocalMaximumOutstanding()); + DeviceMemoryBuffer ignored = Rmm.alloc(1024); + ignored.close(); + assertEquals(0, Rmm.getLocalMaximumOutstanding()); + + // if we allocate 2KB and then 256B we start seeing a positive local maximum + try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048); + DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) { + assertEquals(256, Rmm.getLocalMaximumOutstanding()); + } + } + @ParameterizedTest @ValueSource(ints = { RmmAllocationMode.CUDA_DEFAULT, From 155f227dc28de186bef6cea0afe976395e1be6b0 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 17:38:41 -0500 Subject: [PATCH 08/14] Address review comments --- java/src/main/java/ai/rapids/cudf/Rmm.java | 14 ++-- java/src/main/native/src/RmmJni.cpp | 66 +++++++++++-------- .../src/test/java/ai/rapids/cudf/RmmTest.java | 48 +++++++------- 3 files changed, 68 insertions(+), 60 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index bf4fad7ec57..7d038685d71 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -143,7 +143,7 @@ public static boolean isInitialized() throws RmmException { * Note that this result is meaningful when a single thread is using the GPU, or * when we have joined all threads and CUDA synchronized with all streams. */ - public static native long getMaximumOutstanding(); + public static native long getMaximumTotalBytesAllocated(); /** * Resets a local maximum counter of RMM memory used to keep track of usage between @@ -151,8 +151,8 @@ public static boolean isInitialized() throws RmmException { * * @param initialValue an initial value (in Bytes) to use for this local counter */ - public static void resetLocalMaximumOutstanding(long initialValue) { - resetLocalMaximumOutstandingInternal(initialValue); + public static void resetLocalMaximumBytesAllocated(long initialValue) { + resetLocalMaximumBytesAllocatedInternal(initialValue); } /** @@ -161,11 +161,11 @@ public static void resetLocalMaximumOutstanding(long initialValue) { * * This resets the counter to 0 Bytes. */ - public static void resetLocalMaximumOutstanding() { - resetLocalMaximumOutstandingInternal(0L); + public static void resetLocalMaximumBytesAllocated() { + resetLocalMaximumBytesAllocatedInternal(0L); } - public static native void resetLocalMaximumOutstandingInternal(long initialValue); + private static native void resetLocalMaximumBytesAllocatedInternal(long initialValue); /** * Returns the maximum amount of RMM memory (Bytes) outstanding since the last @@ -180,7 +180,7 @@ public static void resetLocalMaximumOutstanding() { * * @return the local maximum in Bytes */ - public static native long getLocalMaximumOutstanding(); + public static native long getLocalMaximumBytesAllocated(); /** * Sets the event handler to be called on RMM events (e.g.: allocation failure). diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d115e7cd921..9950f139454 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -51,11 +52,11 @@ class base_tracking_resource_adaptor : public device_memory_resource { public: virtual std::size_t get_total_allocated() = 0; - virtual std::size_t get_max_outstanding() = 0; + virtual std::size_t get_max_total_allocated() = 0; - virtual void reset_local_max_outstanding(std::size_t initial_value) = 0; + virtual void reset_local_max_total_allocated(std::size_t initial_value) = 0; - virtual std::size_t get_local_max_outstanding() = 0; + virtual std::size_t get_local_max_total_allocated() = 0; }; /** @@ -85,23 +86,33 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } - std::size_t get_max_outstanding() override { return max_outstanding; } + std::size_t get_max_total_allocated() override { return max_outstanding; } - void reset_local_max_outstanding(std::size_t initial_value) override { + void reset_local_max_total_allocated(std::size_t initial_value) override { + local_allocated = 0; local_max_outstanding = initial_value; - // keep track of where we currently are when the reset call is issued - local_allocated = total_allocated; } - std::size_t get_local_max_outstanding() override { return local_max_outstanding; } + std::size_t get_local_max_total_allocated() override { return local_max_outstanding; } private: Upstream *const resource; std::size_t const size_align; + // sum of what is currently outstanding std::atomic_size_t total_allocated{0}; + + // the maximum outstanding for the lifetime of this class std::size_t max_outstanding{0}; - std::size_t local_allocated{0}; - std::size_t local_max_outstanding{0}; + + // the local sum of what is currently outstanding from the last + // `reset_local_max_outstanding` call. This can be negative. + std::atomic_long local_allocated{0}; + + // the maximum maximum outstanding relative to the last + // `reset_local_max_outstanding` call. + long local_max_outstanding{0}; + + std::mutex max_outstanding_mutex; void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { // adjust size of allocation based on specified size alignment @@ -110,15 +121,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { auto result = resource->allocate(num_bytes, stream); if (result) { total_allocated += num_bytes; + local_allocated += num_bytes; - // Note: none of the below is thread safe. It is only meaningful when - // a single thread is used. + std::scoped_lock lock(max_outstanding_mutex); max_outstanding = std::max(total_allocated.load(), max_outstanding); - - // `total_allocated - local_allocated` can be negative in the case where we free - // after we call `reset_local_max_outstanding` - std::size_t local_diff = std::max(static_cast(total_allocated - local_allocated), 0L); - local_max_outstanding = std::max(local_diff, local_max_outstanding); + local_max_outstanding = std::max(local_allocated.load(), local_max_outstanding); } return result; } @@ -130,6 +137,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (p) { total_allocated -= size; + local_allocated -= size; } } @@ -160,22 +168,22 @@ std::size_t get_total_bytes_allocated() { return 0; } -std::size_t get_max_outstanding() { +std::size_t get_max_total_allocated() { if (Tracking_memory_resource) { - return Tracking_memory_resource->get_max_outstanding(); + return Tracking_memory_resource->get_max_total_allocated(); } return 0; } -void reset_local_max_outstanding(std::size_t initial_value) { +void reset_local_max_total_allocated(std::size_t initial_value) { if (Tracking_memory_resource) { - return Tracking_memory_resource->reset_local_max_outstanding(initial_value); + return Tracking_memory_resource->reset_local_max_total_allocated(initial_value); } } -std::size_t get_local_max_outstanding() { +std::size_t get_local_max_total_allocated() { if (Tracking_memory_resource) { - return Tracking_memory_resource->get_local_max_outstanding(); + return Tracking_memory_resource->get_local_max_total_allocated(); } return 0; } @@ -503,17 +511,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e return get_total_bytes_allocated(); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) { - return get_max_outstanding(); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JNIEnv *env, jclass) { + return get_max_total_allocated(); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal( +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumBytesAllocatedInternal( JNIEnv *env, jclass, long initialValue) { - reset_local_max_outstanding(initialValue); + reset_local_max_total_allocated(initialValue); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) { - return get_local_max_outstanding(); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumBytesAllocated(JNIEnv *env, jclass) { + return get_local_max_total_allocated(); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size, diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index 4af50759ac3..b12ac675a55 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -72,12 +72,12 @@ public void testTotalAllocated(int rmmAllocMode) { RmmAllocationMode.ARENA}) public void testMaxOutstanding(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); - assertEquals(0, Rmm.getMaximumOutstanding()); + assertEquals(0, Rmm.getMaximumTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) { - assertEquals(1024, Rmm.getMaximumOutstanding()); + assertEquals(1024, Rmm.getMaximumTotalBytesAllocated()); } assertEquals(0, Rmm.getTotalBytesAllocated()); - assertEquals(1024, Rmm.getMaximumOutstanding()); + assertEquals(1024, Rmm.getMaximumTotalBytesAllocated()); } @ParameterizedTest @@ -87,36 +87,36 @@ public void testMaxOutstanding(int rmmAllocMode) { RmmAllocationMode.ARENA}) public void testLocalMaxOutstanding(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); - assertEquals(0, Rmm.getMaximumOutstanding()); + assertEquals(0, Rmm.getMaximumTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { - assertEquals(2048, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); } assertEquals(0, Rmm.getTotalBytesAllocated()); - assertEquals(2048, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); - Rmm.resetLocalMaximumOutstanding(0); - assertEquals(0, Rmm.getLocalMaximumOutstanding()); - assertEquals(2048, Rmm.getMaximumOutstanding()); + Rmm.resetLocalMaximumBytesAllocated(); + assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); DeviceMemoryBuffer ignored = Rmm.alloc(1024); ignored.close(); - assertEquals(1024, Rmm.getLocalMaximumOutstanding()); - assertEquals(2048, Rmm.getMaximumOutstanding()); + assertEquals(1024, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); assertEquals(0, Rmm.getTotalBytesAllocated()); // a non-zero value is the new minimum DeviceMemoryBuffer ignored2 = Rmm.alloc(1024); ignored2.close(); - Rmm.resetLocalMaximumOutstanding(10000); - assertEquals(10000, Rmm.getLocalMaximumOutstanding()); - assertEquals(2048, Rmm.getMaximumOutstanding()); + Rmm.resetLocalMaximumBytesAllocated(10000); + assertEquals(10000, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) { - Rmm.resetLocalMaximumOutstanding(1024); + Rmm.resetLocalMaximumBytesAllocated(1024); try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) { - assertEquals(20480, Rmm.getLocalMaximumOutstanding()); - assertEquals(21504, Rmm.getMaximumOutstanding()); + assertEquals(20480, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(21504, Rmm.getMaximumTotalBytesAllocated()); } } } @@ -128,23 +128,23 @@ public void testLocalMaxOutstanding(int rmmAllocMode) { RmmAllocationMode.ARENA}) public void testLocalMaxOutstandingNegative(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); - assertEquals(0, Rmm.getMaximumOutstanding()); + assertEquals(0, Rmm.getMaximumTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { - assertEquals(2048, Rmm.getLocalMaximumOutstanding()); - Rmm.resetLocalMaximumOutstanding(); - assertEquals(0, Rmm.getLocalMaximumOutstanding()); + assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); + Rmm.resetLocalMaximumBytesAllocated(); + assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); } // because we allocated a net -2048 Bytes since reset - assertEquals(0, Rmm.getLocalMaximumOutstanding()); + assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); DeviceMemoryBuffer ignored = Rmm.alloc(1024); ignored.close(); - assertEquals(0, Rmm.getLocalMaximumOutstanding()); + assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); // if we allocate 2KB and then 256B we start seeing a positive local maximum try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048); DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) { - assertEquals(256, Rmm.getLocalMaximumOutstanding()); + assertEquals(256, Rmm.getLocalMaximumBytesAllocated()); } } From 4d2a602a91228695f979eebbc18347590d30819f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 17:41:29 -0500 Subject: [PATCH 09/14] Fix javadoc --- java/src/main/java/ai/rapids/cudf/Rmm.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 7d038685d71..651f2e135e7 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -147,7 +147,7 @@ public static boolean isInitialized() throws RmmException { /** * Resets a local maximum counter of RMM memory used to keep track of usage between - * sections code while debugging. + * code sections while debugging. * * @param initialValue an initial value (in Bytes) to use for this local counter */ @@ -157,7 +157,7 @@ public static void resetLocalMaximumBytesAllocated(long initialValue) { /** * Resets a local maximum counter of RMM memory used to keep track of usage between - * sections code while debugging. + * code sections while debugging. * * This resets the counter to 0 Bytes. */ From 9c92699ff6856a263d357a46ce3386664fb886e5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 20 Oct 2022 17:48:44 -0500 Subject: [PATCH 10/14] Made the RmmJni code more consistent with the Java calling code --- java/src/main/native/src/RmmJni.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 9950f139454..663c248717f 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -86,33 +86,33 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_total_allocated() override { return total_allocated.load(); } - std::size_t get_max_total_allocated() override { return max_outstanding; } + std::size_t get_max_total_allocated() override { return max_total_allocated; } void reset_local_max_total_allocated(std::size_t initial_value) override { local_allocated = 0; - local_max_outstanding = initial_value; + local_max_total_allocated = initial_value; } - std::size_t get_local_max_total_allocated() override { return local_max_outstanding; } + std::size_t get_local_max_total_allocated() override { return local_max_total_allocated; } private: Upstream *const resource; std::size_t const size_align; - // sum of what is currently outstanding + // sum of what is currently allocated std::atomic_size_t total_allocated{0}; - // the maximum outstanding for the lifetime of this class - std::size_t max_outstanding{0}; + // the maximum total allocated for the lifetime of this class + std::size_t max_total_allocated{0}; // the local sum of what is currently outstanding from the last - // `reset_local_max_outstanding` call. This can be negative. + // `reset_local_max_total_allocated` call. This can be negative. std::atomic_long local_allocated{0}; - // the maximum maximum outstanding relative to the last - // `reset_local_max_outstanding` call. - long local_max_outstanding{0}; + // the maximum total allocated relative to the last + // `reset_local_max_total_allocated` call. + long local_max_total_allocated{0}; - std::mutex max_outstanding_mutex; + std::mutex max_total_allocated_mutex; void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { // adjust size of allocation based on specified size alignment @@ -123,9 +123,9 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { total_allocated += num_bytes; local_allocated += num_bytes; - std::scoped_lock lock(max_outstanding_mutex); - max_outstanding = std::max(total_allocated.load(), max_outstanding); - local_max_outstanding = std::max(local_allocated.load(), local_max_outstanding); + std::scoped_lock lock(max_total_allocated_mutex); + max_total_allocated = std::max(total_allocated.load(), max_total_allocated); + local_max_total_allocated = std::max(local_allocated.load(), local_max_total_allocated); } return result; } From d8eae3bf98b6b2170c68267d943e31e45bde1aaf Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 21 Oct 2022 10:02:14 -0500 Subject: [PATCH 11/14] local->scoped --- java/src/main/java/ai/rapids/cudf/Rmm.java | 25 ++++++----- java/src/main/native/src/RmmJni.cpp | 44 +++++++++---------- .../src/test/java/ai/rapids/cudf/RmmTest.java | 34 +++++++------- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 651f2e135e7..355debd8747 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -146,30 +146,30 @@ public static boolean isInitialized() throws RmmException { public static native long getMaximumTotalBytesAllocated(); /** - * Resets a local maximum counter of RMM memory used to keep track of usage between + * Resets a scoped maximum counter of RMM memory used to keep track of usage between * code sections while debugging. * - * @param initialValue an initial value (in Bytes) to use for this local counter + * @param initialValue an initial value (in Bytes) to use for this scoped counter */ - public static void resetLocalMaximumBytesAllocated(long initialValue) { - resetLocalMaximumBytesAllocatedInternal(initialValue); + public static void resetScopedMaximumBytesAllocated(long initialValue) { + resetScopedMaximumBytesAllocatedInternal(initialValue); } /** - * Resets a local maximum counter of RMM memory used to keep track of usage between + * Resets a scoped maximum counter of RMM memory used to keep track of usage between * code sections while debugging. * * This resets the counter to 0 Bytes. */ - public static void resetLocalMaximumBytesAllocated() { - resetLocalMaximumBytesAllocatedInternal(0L); + public static void resetScopedMaximumBytesAllocated() { + resetScopedMaximumBytesAllocatedInternal(0L); } - private static native void resetLocalMaximumBytesAllocatedInternal(long initialValue); + private static native void resetScopedMaximumBytesAllocatedInternal(long initialValue); /** * Returns the maximum amount of RMM memory (Bytes) outstanding since the last - * `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the + * `resetScopedMaximumOutstanding` call was issued (it is "scoped" because it's the * maximum amount seen between reset and get calls). * * Note that this result is meaningful when a single thread is using the GPU, or @@ -178,9 +178,12 @@ public static void resetLocalMaximumBytesAllocated() { * If the memory used is net negative (for example if only frees happened since * reset, and we reset to 0), then result will be 0 until we reset * - * @return the local maximum in Bytes + * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole + * program and it should be equivalent to `getMaximumTotalBytesAllocated` + * + * @return the scoped maximum bytes allocated */ - public static native long getLocalMaximumBytesAllocated(); + public static native long getScopedMaximumBytesAllocated(); /** * Sets the event handler to be called on RMM events (e.g.: allocation failure). diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 663c248717f..5914f9c9d86 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -54,9 +54,9 @@ class base_tracking_resource_adaptor : public device_memory_resource { virtual std::size_t get_max_total_allocated() = 0; - virtual void reset_local_max_total_allocated(std::size_t initial_value) = 0; + virtual void reset_scoped_max_total_allocated(std::size_t initial_value) = 0; - virtual std::size_t get_local_max_total_allocated() = 0; + virtual std::size_t get_scoped_max_total_allocated() = 0; }; /** @@ -88,12 +88,12 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_max_total_allocated() override { return max_total_allocated; } - void reset_local_max_total_allocated(std::size_t initial_value) override { - local_allocated = 0; - local_max_total_allocated = initial_value; + void reset_scoped_max_total_allocated(std::size_t initial_value) override { + scoped_allocated = 0; + scoped_max_total_allocated = initial_value; } - std::size_t get_local_max_total_allocated() override { return local_max_total_allocated; } + std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; } private: Upstream *const resource; @@ -104,13 +104,13 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { // the maximum total allocated for the lifetime of this class std::size_t max_total_allocated{0}; - // the local sum of what is currently outstanding from the last - // `reset_local_max_total_allocated` call. This can be negative. - std::atomic_long local_allocated{0}; + // the sum of what is currently outstanding from the last + // `reset_scoped_max_total_allocated` call. This can be negative. + std::atomic_long scoped_allocated{0}; // the maximum total allocated relative to the last - // `reset_local_max_total_allocated` call. - long local_max_total_allocated{0}; + // `reset_scoped_max_total_allocated` call. + long scoped_max_total_allocated{0}; std::mutex max_total_allocated_mutex; @@ -121,11 +121,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { auto result = resource->allocate(num_bytes, stream); if (result) { total_allocated += num_bytes; - local_allocated += num_bytes; + scoped_allocated += num_bytes; std::scoped_lock lock(max_total_allocated_mutex); max_total_allocated = std::max(total_allocated.load(), max_total_allocated); - local_max_total_allocated = std::max(local_allocated.load(), local_max_total_allocated); + scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated); } return result; } @@ -137,7 +137,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (p) { total_allocated -= size; - local_allocated -= size; + scoped_allocated -= size; } } @@ -175,15 +175,15 @@ std::size_t get_max_total_allocated() { return 0; } -void reset_local_max_total_allocated(std::size_t initial_value) { +void reset_scoped_max_total_allocated(std::size_t initial_value) { if (Tracking_memory_resource) { - return Tracking_memory_resource->reset_local_max_total_allocated(initial_value); + return Tracking_memory_resource->reset_scoped_max_total_allocated(initial_value); } } -std::size_t get_local_max_total_allocated() { +std::size_t get_scoped_max_total_allocated() { if (Tracking_memory_resource) { - return Tracking_memory_resource->get_local_max_total_allocated(); + return Tracking_memory_resource->get_scoped_max_total_allocated(); } return 0; } @@ -515,13 +515,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JN return get_max_total_allocated(); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumBytesAllocatedInternal( +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal( JNIEnv *env, jclass, long initialValue) { - reset_local_max_total_allocated(initialValue); + reset_scoped_max_total_allocated(initialValue); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumBytesAllocated(JNIEnv *env, jclass) { - return get_local_max_total_allocated(); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env, jclass) { + return get_scoped_max_total_allocated(); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size, diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index b12ac675a55..18ff5f4081e 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -85,37 +85,37 @@ public void testMaxOutstanding(int rmmAllocMode) { RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL, RmmAllocationMode.ARENA}) - public void testLocalMaxOutstanding(int rmmAllocMode) { + public void testScopedMaxOutstanding(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); assertEquals(0, Rmm.getMaximumTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { - assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getScopedMaximumBytesAllocated()); } assertEquals(0, Rmm.getTotalBytesAllocated()); - assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getScopedMaximumBytesAllocated()); - Rmm.resetLocalMaximumBytesAllocated(); - assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); + Rmm.resetScopedMaximumBytesAllocated(); + assertEquals(0, Rmm.getScopedMaximumBytesAllocated()); assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); DeviceMemoryBuffer ignored = Rmm.alloc(1024); ignored.close(); - assertEquals(1024, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(1024, Rmm.getScopedMaximumBytesAllocated()); assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); assertEquals(0, Rmm.getTotalBytesAllocated()); // a non-zero value is the new minimum DeviceMemoryBuffer ignored2 = Rmm.alloc(1024); ignored2.close(); - Rmm.resetLocalMaximumBytesAllocated(10000); - assertEquals(10000, Rmm.getLocalMaximumBytesAllocated()); + Rmm.resetScopedMaximumBytesAllocated(10000); + assertEquals(10000, Rmm.getScopedMaximumBytesAllocated()); assertEquals(2048, Rmm.getMaximumTotalBytesAllocated()); try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) { - Rmm.resetLocalMaximumBytesAllocated(1024); + Rmm.resetScopedMaximumBytesAllocated(1024); try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) { - assertEquals(20480, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(20480, Rmm.getScopedMaximumBytesAllocated()); assertEquals(21504, Rmm.getMaximumTotalBytesAllocated()); } } @@ -126,25 +126,25 @@ public void testLocalMaxOutstanding(int rmmAllocMode) { RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL, RmmAllocationMode.ARENA}) - public void testLocalMaxOutstandingNegative(int rmmAllocMode) { + public void testScopedMaxOutstandingNegative(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); assertEquals(0, Rmm.getMaximumTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024); DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { - assertEquals(2048, Rmm.getLocalMaximumBytesAllocated()); - Rmm.resetLocalMaximumBytesAllocated(); - assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(2048, Rmm.getScopedMaximumBytesAllocated()); + Rmm.resetScopedMaximumBytesAllocated(); + assertEquals(0, Rmm.getScopedMaximumBytesAllocated()); } // because we allocated a net -2048 Bytes since reset - assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(0, Rmm.getScopedMaximumBytesAllocated()); DeviceMemoryBuffer ignored = Rmm.alloc(1024); ignored.close(); - assertEquals(0, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(0, Rmm.getScopedMaximumBytesAllocated()); // if we allocate 2KB and then 256B we start seeing a positive local maximum try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048); DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) { - assertEquals(256, Rmm.getLocalMaximumBytesAllocated()); + assertEquals(256, Rmm.getScopedMaximumBytesAllocated()); } } From c2c6c8ec5ae579e0d2a614aed0ff880297aaf0f4 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 21 Oct 2022 10:37:07 -0500 Subject: [PATCH 12/14] clang-style fixes --- java/src/main/native/src/RmmJni.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5914f9c9d86..331496b78de 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -520,7 +520,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedI reset_scoped_max_total_allocated(initialValue); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env, jclass) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env, + jclass) { return get_scoped_max_total_allocated(); } From 22fa6e6d6a6b5d13ddb9da32ebf78560357e5179 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 21 Oct 2022 14:25:29 -0500 Subject: [PATCH 13/14] Apply code review comments --- java/src/main/java/ai/rapids/cudf/Rmm.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 355debd8747..0b825937815 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -139,9 +139,6 @@ public static boolean isInitialized() throws RmmException { /** * Returns the maximum amount of RMM memory (Bytes) outstanding during the * lifetime of the process. - * - * Note that this result is meaningful when a single thread is using the GPU, or - * when we have joined all threads and CUDA synchronized with all streams. */ public static native long getMaximumTotalBytesAllocated(); @@ -170,16 +167,13 @@ public static void resetScopedMaximumBytesAllocated() { /** * Returns the maximum amount of RMM memory (Bytes) outstanding since the last * `resetScopedMaximumOutstanding` call was issued (it is "scoped" because it's the - * maximum amount seen between reset and get calls). - * - * Note that this result is meaningful when a single thread is using the GPU, or - * when we have joined all threads and CUDA synchronized with all streams. + * maximum amount seen since the last reset). * * If the memory used is net negative (for example if only frees happened since - * reset, and we reset to 0), then result will be 0 until we reset + * reset, and we reset to 0), then result will be 0. * * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole - * program and it should be equivalent to `getMaximumTotalBytesAllocated` + * program and is equivalent to `getMaximumTotalBytesAllocated`. * * @return the scoped maximum bytes allocated */ From bec98181dd2b6592c0acf9aa688deba89ae4d7f5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 24 Oct 2022 09:49:05 -0500 Subject: [PATCH 14/14] Lock while resetting max scoped usage --- java/src/main/native/src/RmmJni.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 331496b78de..529345b6bd8 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -89,6 +89,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { std::size_t get_max_total_allocated() override { return max_total_allocated; } void reset_scoped_max_total_allocated(std::size_t initial_value) override { + std::scoped_lock lock(max_total_allocated_mutex); scoped_allocated = 0; scoped_max_total_allocated = initial_value; }