Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu memory watermark apis to JNI #11950

Merged
merged 15 commits into from
Oct 24, 2022
Merged
43 changes: 43 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,49 @@ public static boolean isInitialized() throws RmmException {
*/
public static native long getTotalBytesAllocated();

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding during the
* lifetime of the process.
*/
public static native long getMaximumTotalBytesAllocated();

/**
 * Restarts the scoped high-water mark of RMM memory usage, seeding it with a
 * caller-supplied value. This is a debugging aid for comparing memory usage
 * between sections of code.
 *
 * @param initialValue the value (in Bytes) the scoped counter starts from
 */
public static void resetScopedMaximumBytesAllocated(final long initialValue) {
  resetScopedMaximumBytesAllocatedInternal(initialValue);
}

/**
 * Restarts the scoped high-water mark of RMM memory usage from 0 Bytes.
 * This is a debugging aid for comparing memory usage between sections of code.
 *
 * Equivalent to calling the one-argument overload with an initial value of 0.
 */
public static void resetScopedMaximumBytesAllocated() {
  resetScopedMaximumBytesAllocated(0L);
}

private static native void resetScopedMaximumBytesAllocatedInternal(long initialValue);

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding since the last
* `resetScopedMaximumBytesAllocated` call was issued (it is "scoped" because it's the
* maximum amount seen since the last reset).
*
* If the memory used is net negative (for example if only frees happened since
* reset, and we reset to 0), then result will be 0.
*
* If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
* program and is equivalent to `getMaximumTotalBytesAllocated`.
*
* @return the scoped maximum bytes allocated
*/
public static native long getScopedMaximumBytesAllocated();

/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure).
* @param handler event handler to invoke on RMM events or null to clear an existing handler
Expand Down
71 changes: 71 additions & 0 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <fstream>
#include <iostream>
#include <limits>
#include <mutex>

#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
Expand Down Expand Up @@ -50,6 +51,12 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
/**
 * @brief Abstract interface for device memory resources that report allocation statistics.
 *
 * Extends device_memory_resource with accessors for the current total, the maximum
 * total, and a resettable ("scoped") maximum total. All accessors are pure virtual;
 * the concrete bookkeeping lives in the implementing adaptor.
 */
class base_tracking_resource_adaptor : public device_memory_resource {
public:
// current total allocated, as tracked by the implementation
virtual std::size_t get_total_allocated() = 0;

// maximum total allocated, as tracked by the implementation
virtual std::size_t get_max_total_allocated() = 0;

// restart the scoped maximum, seeding it with `initial_value`
virtual void reset_scoped_max_total_allocated(std::size_t initial_value) = 0;

// scoped maximum, i.e. the maximum relative to the last reset
virtual std::size_t get_scoped_max_total_allocated() = 0;
};

/**
Expand Down Expand Up @@ -79,18 +86,47 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

// Snapshot of the running sum of outstanding allocations.
std::size_t get_total_allocated() override {
  std::size_t const current_total = total_allocated.load();
  return current_total;
}

std::size_t get_max_total_allocated() override { return max_total_allocated; }

// Starts a new scope for the scoped maximum.
//
// Zeroes the scoped running sum and seeds the scoped maximum with `initial_value`.
// Both writes happen under max_total_allocated_mutex, the same lock do_allocate
// holds while it updates the maximums.
//
// @param initial_value value the scoped maximum starts from
void reset_scoped_max_total_allocated(std::size_t initial_value) override {
  std::scoped_lock lock(max_total_allocated_mutex);
  scoped_allocated = 0;
  // scoped_max_total_allocated is a signed long; make the narrowing explicit
  scoped_max_total_allocated = static_cast<long>(initial_value);
}

std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; }

private:
// upstream resource that do_allocate forwards allocation requests to
Upstream *const resource;
// do_allocate rounds every request up to a multiple of this value
std::size_t const size_align;
// sum of what is currently allocated
std::atomic_size_t total_allocated{0};

// the maximum total allocated for the lifetime of this class
// (written while holding max_total_allocated_mutex)
std::size_t max_total_allocated{0};

// the sum of what is currently outstanding from the last
// `reset_scoped_max_total_allocated` call. This can be negative.
std::atomic_long scoped_allocated{0};

// the maximum total allocated relative to the last
// `reset_scoped_max_total_allocated` call.
// (written while holding max_total_allocated_mutex)
long scoped_max_total_allocated{0};

// held while max_total_allocated and scoped_max_total_allocated are updated
std::mutex max_total_allocated_mutex;

// Allocates from the upstream resource and updates the usage statistics.
//
// The request is rounded up to a multiple of size_align before it is forwarded.
// When the upstream returns a non-null pointer, the running sums are increased by
// the rounded size and both maximums are refreshed under max_total_allocated_mutex.
//
// @param num_bytes requested size; rounded up to a multiple of size_align
// @param stream    stream passed through to the upstream resource
// @return the pointer returned by the upstream resource
void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
  // adjust size of allocation based on specified size alignment
  num_bytes = (num_bytes + size_align - 1) / size_align * size_align;

  auto result = resource->allocate(num_bytes, stream);
  if (result) {
    // Use the value produced by the atomic add itself. Re-reading total_allocated
    // later lets a concurrent free slip in between, which would hide the peak this
    // allocation just reached from the lifetime maximum.
    std::size_t const total_after_alloc = (total_allocated += num_bytes);
    scoped_allocated += num_bytes;

    std::scoped_lock lock(max_total_allocated_mutex);
    max_total_allocated = std::max(total_after_alloc, max_total_allocated);
    // The scoped sum is deliberately re-read under the lock:
    // reset_scoped_max_total_allocated zeroes it while holding this same mutex, so a
    // value captured before the lock could belong to the previous scope.
    scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated);
  }
  return result;
}
Expand All @@ -102,6 +138,7 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

if (p) {
total_allocated -= size;
scoped_allocated -= size;
}
}

Expand Down Expand Up @@ -132,6 +169,26 @@ std::size_t get_total_bytes_allocated() {
return 0;
}

// Maximum total reported by the tracking resource, or 0 when
// Tracking_memory_resource is not set.
std::size_t get_max_total_allocated() {
  if (!Tracking_memory_resource) {
    return 0;
  }
  return Tracking_memory_resource->get_max_total_allocated();
}

// Forwards a scoped-maximum reset to the tracking resource; does nothing when
// Tracking_memory_resource is not set.
void reset_scoped_max_total_allocated(std::size_t initial_value) {
  if (!Tracking_memory_resource) {
    return;
  }
  Tracking_memory_resource->reset_scoped_max_total_allocated(initial_value);
}

// Scoped maximum reported by the tracking resource, or 0 when
// Tracking_memory_resource is not set.
std::size_t get_scoped_max_total_allocated() {
  if (!Tracking_memory_resource) {
    return 0;
  }
  return Tracking_memory_resource->get_scoped_max_total_allocated();
}

/**
* @brief An RMM device memory resource adaptor that delegates to the wrapped resource
* for most operations but will call Java to handle certain situations (e.g.: allocation failure).
Expand Down Expand Up @@ -455,6 +512,20 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
return get_total_bytes_allocated();
}

// JNI entry point for Rmm.getMaximumTotalBytesAllocated(): returns the value of
// get_max_total_allocated() converted to a jlong.
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JNIEnv *env, jclass) {
  auto const max_bytes = static_cast<jlong>(get_max_total_allocated());
  return max_bytes;
}

// JNI entry point for Rmm.resetScopedMaximumBytesAllocatedInternal(long).
//
// The parameter is declared as jlong, the JNI type for a Java `long` (64-bit).
// A plain C++ `long` is only 32 bits wide on LLP64 platforms and would not match
// the argument the JVM passes.
//
// @param initialValue value the scoped maximum is reset to
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal(
    JNIEnv *env, jclass, jlong initialValue) {
  reset_scoped_max_total_allocated(static_cast<std::size_t>(initialValue));
}

// JNI entry point for Rmm.getScopedMaximumBytesAllocated(): returns the value of
// get_scoped_max_total_allocated() converted to a jlong.
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getScopedMaximumBytesAllocated(JNIEnv *env,
                                                                               jclass) {
  auto const scoped_max_bytes = static_cast<jlong>(get_scoped_max_total_allocated());
  return scoped_max_bytes;
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
jlong stream) {
try {
Expand Down
83 changes: 83 additions & 0 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
assertEquals(0, Rmm.getTotalBytesAllocated());
}

/**
 * Checks that the lifetime maximum reported by Rmm.getMaximumTotalBytesAllocated()
 * rises with an allocation and stays at its peak after the buffer is closed.
 * Runs once per allocation mode listed in the ValueSource.
 */
@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
// nothing has been allocated yet
assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
}
// the buffer is closed: the current total drops, the maximum does not
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(1024, Rmm.getMaximumTotalBytesAllocated());
}

/**
 * Checks the scoped maximum: it tracks the peak since the last reset, a reset can
 * seed it with a non-zero value, and the lifetime maximum is unaffected by resets.
 * Runs once per allocation mode listed in the ValueSource.
 */
@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testScopedMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
// before any reset, two live 1024-Byte buffers give a scoped maximum of 2048
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());

// resetting clears the scoped maximum but not the lifetime maximum
Rmm.resetScopedMaximumBytesAllocated();
assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());

// a single allocation after the reset becomes the new scoped peak
DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(1024, Rmm.getScopedMaximumBytesAllocated());
assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());
assertEquals(0, Rmm.getTotalBytesAllocated());

// a non-zero value is the new minimum
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
ignored2.close();
Rmm.resetScopedMaximumBytesAllocated(10000);
assertEquals(10000, Rmm.getScopedMaximumBytesAllocated());
assertEquals(2048, Rmm.getMaximumTotalBytesAllocated());

// reset while 1024 Bytes are still outstanding: the scoped maximum only counts the
// 20480 Bytes allocated after the reset, the lifetime maximum counts both buffers
try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
Rmm.resetScopedMaximumBytesAllocated(1024);
try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
assertEquals(20480, Rmm.getScopedMaximumBytesAllocated());
assertEquals(21504, Rmm.getMaximumTotalBytesAllocated());
}
}
}

/**
 * Checks the scoped maximum when more memory is freed than allocated after a reset:
 * the reported value stays at 0 until allocations since the reset exceed the frees.
 * Runs once per allocation mode listed in the ValueSource.
 */
@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testScopedMaxOutstandingNegative(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumTotalBytesAllocated());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getScopedMaximumBytesAllocated());
// reset while both buffers are still live; they are freed after the reset
Rmm.resetScopedMaximumBytesAllocated();
assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
}
// because we allocated a net -2048 Bytes since reset
assertEquals(0, Rmm.getScopedMaximumBytesAllocated());
// 1024 Bytes on top of a net -2048 is still negative, so the maximum stays 0
DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(0, Rmm.getScopedMaximumBytesAllocated());

// if we allocate 2KB and then 256B we start seeing a positive local maximum
try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
assertEquals(256, Rmm.getScopedMaximumBytesAllocated());
}
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
Expand Down