From e5d444f24c22e88e9cebc25dc5bd7806dde531e8 Mon Sep 17 00:00:00 2001 From: Zach Puller Date: Mon, 28 Oct 2024 10:50:34 -0700 Subject: [PATCH] fix max bytes dealloc bug Signed-off-by: Zach Puller --- src/main/cpp/src/SparkResourceAdaptorJni.cpp | 4 +++- .../java/com/nvidia/spark/rapids/jni/RmmSparkTest.java | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 8eeb047ddc..b2d2785ebe 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -1780,6 +1780,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { auto const thread = threads.find(tid); if (thread != threads.end()) { log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); + if (!is_for_cpu) { + thread->second.gpu_memory_allocated_bytes -= num_bytes; + } } else { log_status("DEALLOC", tid, -2, thread_state::UNKNOWN); } @@ -1802,7 +1805,6 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (is_for_cpu == t_state.is_cpu_alloc) { transition(t_state, thread_state::THREAD_ALLOC_FREE); } - if (!is_for_cpu) { t_state.gpu_memory_allocated_bytes -= num_bytes; } break; default: break; } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java index 987dd58534..270a4266cd 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java @@ -360,7 +360,7 @@ public void testInsertOOMsGpu() { assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close()); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); - assertEquals(ALIGNMENT * 2, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid)); + assertEquals(ALIGNMENT, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid)); // Verify that injecting OOM does not cause the block to actually happen assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId)); @@ -818,6 +818,11 @@ public void testBasicMixedBlocking() throws ExecutionException, InterruptedExcep secondGpuAlloc.waitForAlloc(); secondGpuAlloc.freeAndWait(); } + // Do one more alloc after freeing on same task to show the max allocation metric is unimpacted + try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, FIVE_MB)) { + secondGpuAlloc.waitForAlloc(); + secondGpuAlloc.freeAndWait(); + } } } } finally {