diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 730f82f0047..0b825937815 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -136,6 +136,49 @@ public static boolean isInitialized() throws RmmException {
    */
   public static native long getTotalBytesAllocated();
 
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding during the
+   * lifetime of the process.
+   */
+  public static native long getMaximumTotalBytesAllocated();
+
+  /**
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
+   * code sections while debugging.
+   *
+   * @param initialValue an initial value (in Bytes) to use for this scoped counter
+   */
+  public static void resetScopedMaximumBytesAllocated(long initialValue) {
+    resetScopedMaximumBytesAllocatedInternal(initialValue);
+  }
+
+  /**
+   * Resets a scoped maximum counter of RMM memory used to keep track of usage between
+   * code sections while debugging.
+   *
+   * This resets the counter to 0 Bytes.
+   */
+  public static void resetScopedMaximumBytesAllocated() {
+    resetScopedMaximumBytesAllocatedInternal(0L);
+  }
+
+  private static native void resetScopedMaximumBytesAllocatedInternal(long initialValue);
+
+  /**
+   * Returns the maximum amount of RMM memory (Bytes) outstanding since the last
+   * `resetScopedMaximumBytesAllocated` call was issued (it is "scoped" because it's the
+   * maximum amount seen since the last reset).
+   *
+   * If the memory used is net negative (for example if only frees happened since
+   * reset, and we reset to 0), then result will be 0.
+   *
+   * If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
+   * program and is equivalent to `getMaximumTotalBytesAllocated`.
+   *
+   * @return the scoped maximum bytes allocated
+   */
+  public static native long getScopedMaximumBytesAllocated();
+
   /**
    * Sets the event handler to be called on RMM events (e.g.: allocation failure).
    * @param handler event handler to invoke on RMM events or null to clear an existing handler
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 2b4c5ae59f5..529345b6bd8 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -19,6 +19,7 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <mutex>
 #include <sstream>
 #include <unordered_map>
 
@@ -50,6 +51,12 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
 class base_tracking_resource_adaptor : public device_memory_resource {
 public:
   virtual std::size_t get_total_allocated() = 0;
+
+  virtual std::size_t get_max_total_allocated() = 0;
+
+  virtual void reset_scoped_max_total_allocated(std::size_t initial_value) = 0;
+
+  virtual std::size_t get_scoped_max_total_allocated() = 0;
 };
 
 /**
@@ -79,11 +86,44 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
+  std::size_t get_max_total_allocated() override {
+    // written under the mutex in do_allocate, so it must be read under it too
+    std::scoped_lock lock(max_total_allocated_mutex);
+    return max_total_allocated;
+  }
+
+  void reset_scoped_max_total_allocated(std::size_t initial_value) override {
+    std::scoped_lock lock(max_total_allocated_mutex);
+    scoped_allocated = 0;
+    scoped_max_total_allocated = initial_value;
+  }
+
+  std::size_t get_scoped_max_total_allocated() override {
+    // written under the mutex in do_allocate and on reset, so read under it too
+    std::scoped_lock lock(max_total_allocated_mutex);
+    return scoped_max_total_allocated;
+  }
+
 private:
   Upstream *const resource;
   std::size_t const size_align;
+  // sum of what is currently allocated
   std::atomic_size_t total_allocated{0};
 
+  // the maximum total allocated for the lifetime of this class
+  std::size_t max_total_allocated{0};
+
+  // the sum of what is currently outstanding from the last
+  // `reset_scoped_max_total_allocated` call. This can be negative.
+  std::atomic_long scoped_allocated{0};
+
+  // the maximum total allocated relative to the last
+  // `reset_scoped_max_total_allocated` call.
+  long scoped_max_total_allocated{0};
+
+  // guards max_total_allocated and scoped_max_total_allocated
+  std::mutex max_total_allocated_mutex;
+
   void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
     // adjust size of allocation based on specified size alignment
     num_bytes = (num_bytes + size_align - 1) / size_align * size_align;
@@ -91,6 +131,11 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
     auto result = resource->allocate(num_bytes, stream);
     if (result) {
       total_allocated += num_bytes;
+      scoped_allocated += num_bytes;
+
+      std::scoped_lock lock(max_total_allocated_mutex);
+      max_total_allocated = std::max(total_allocated.load(), max_total_allocated);
+      scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated);
     }
     return result;
   }
@@ -102,6 +147,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
     if (p) {
       total_allocated -= size;
+      scoped_allocated -= size;
     }
   }
 
@@ -132,6 +178,26 @@ std::size_t get_total_bytes_allocated() {
   return 0;
 }
 
+std::size_t get_max_total_allocated() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->get_max_total_allocated();
+  }
+  return 0;
+}
+
+void reset_scoped_max_total_allocated(std::size_t initial_value) {
+  if (Tracking_memory_resource) {
+    Tracking_memory_resource->reset_scoped_max_total_allocated(initial_value);
+  }
+}
+
+std::size_t get_scoped_max_total_allocated() {
+  if (Tracking_memory_resource) {
+    return Tracking_memory_resource->get_scoped_max_total_allocated();
+  }
+  return 0;
+}
+
 /**
  * @brief An RMM device memory resource adaptor that delegates to the wrapped resource
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).
@@ -455,6 +521,20 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
   return get_total_bytes_allocated();
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JNIEnv *env, jclass) {
+  return get_max_total_allocated();
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal(
+    JNIEnv *env, jclass, jlong initialValue) {
+  reset_scoped_max_total_allocated(initialValue);
+}
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env,
+                                                                               jclass) {
+  return get_scoped_max_total_allocated();
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
                                                               jlong stream) {
   try {
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index 09fbedd8a1c..18ff5f4081e 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
     assertEquals(0, Rmm.getTotalBytesAllocated());
   }
 
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
+      assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testScopedMaxOutstanding(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+    }
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+    assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+
+    Rmm.resetScopedMaximumBytesAllocated();
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(1024, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+    assertEquals(0, Rmm.getTotalBytesAllocated());
+
+    // a non-zero value is the new minimum
+    DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
+    ignored2.close();
+    Rmm.resetScopedMaximumBytesAllocated(10000);
+    assertEquals(10000, Rmm.getScopedMaximumBytesAllocated());
+    assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
+
+    try (DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
+      Rmm.resetScopedMaximumBytesAllocated(1024);
+      try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
+        assertEquals(20480, Rmm.getScopedMaximumBytesAllocated());
+        assertEquals(21504, Rmm.getMaximumTotalBytesAllocated());
+      }
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
+  public void testScopedMaxOutstandingNegative(int rmmAllocMode) {
+    Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
+    assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
+      Rmm.resetScopedMaximumBytesAllocated();
+      assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    }
+    // because we allocated a net -2048 Bytes since reset
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+    DeviceMemoryBuffer ignored = Rmm.alloc(1024);
+    ignored.close();
+    assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
+
+    // if we allocate 2KB and then 256B we start seeing a positive local maximum
+    try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
+         DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
+      assertEquals(256, Rmm.getScopedMaximumBytesAllocated());
+    }
+  }
+
   @ParameterizedTest
   @ValueSource(ints = {
       RmmAllocationMode.CUDA_DEFAULT,