Add gpu memory watermark apis to JNI #11950

Merged · 15 commits · Oct 24, 2022
46 changes: 46 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -136,6 +136,52 @@ public static boolean isInitialized() throws RmmException {
*/
public static native long getTotalBytesAllocated();

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding during the
* lifetime of the process.
*
* Note that this result is only meaningful when a single thread is using the GPU, or
* when all threads have been joined and CUDA has been synchronized with all streams.
*/
public static native long getMaximumOutstanding();

/**
* Resets the local maximum counter of RMM memory used, to keep track of usage between
* sections of code while debugging.
*
* @param initialValue an initial value (in Bytes) to use for this local counter
*/
public static void resetLocalMaximumOutstanding(long initialValue) {
resetLocalMaximumOutstandingInternal(initialValue);
}

/**
* Resets the local maximum counter of RMM memory used, to keep track of usage between
* sections of code while debugging.
*
* This resets the counter to 0 Bytes.
*/
public static void resetLocalMaximumOutstanding() {
resetLocalMaximumOutstandingInternal(0L);
}

public static native void resetLocalMaximumOutstandingInternal(long initialValue);

/**
* Returns the maximum amount of RMM memory (Bytes) outstanding since the last
* `resetLocalMaximumOutstanding` call was issued (it is "local" because it's the
* maximum amount seen between reset and get calls).
*
* Note that this result is only meaningful when a single thread is using the GPU, or
* when all threads have been joined and CUDA has been synchronized with all streams.
*
* If the memory used is net negative (for example, if only frees have happened since
* the reset and the counter was reset to 0), the result will remain 0 until allocations
* once again exceed the amount that was outstanding at reset time.
*
* @return the local maximum in Bytes
*/
public static native long getLocalMaximumOutstanding();

/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure).
* @param handler event handler to invoke on RMM events or null to clear an existing handler
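
A minimal usage sketch of the new watermark APIs added above (illustrative only: the class name, pool size, and printed labels are not part of this PR; the calls mirror those exercised in RmmTest further down):

import ai.rapids.cudf.DeviceMemoryBuffer;
import ai.rapids.cudf.Rmm;
import ai.rapids.cudf.RmmAllocationMode;

public class WatermarkExample {
  public static void main(String[] args) {
    // Pooled allocator with a 512 MiB pool, as in the tests below.
    Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 512 * 1024 * 1024);

    try (DeviceMemoryBuffer a = Rmm.alloc(1024);
         DeviceMemoryBuffer b = Rmm.alloc(2048)) {
      // Process-lifetime high-water mark: 3072 bytes are outstanding at this point.
      System.out.println("max outstanding = " + Rmm.getMaximumOutstanding());
    }

    // Open a "local" measurement window around the section of code being debugged.
    Rmm.resetLocalMaximumOutstanding();
    try (DeviceMemoryBuffer c = Rmm.alloc(4096)) {
      // The local watermark only reflects usage since the reset: 4096 bytes.
      System.out.println("local max outstanding = " + Rmm.getLocalMaximumOutstanding());
    }

    // The process-lifetime watermark is unaffected by local resets
    // (it is now 4096, the largest outstanding total seen so far).
    System.out.println("max outstanding = " + Rmm.getMaximumOutstanding());

    Rmm.shutdown();
  }
}
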
61 changes: 61 additions & 0 deletions java/src/main/native/src/RmmJni.cpp
@@ -50,6 +50,12 @@ constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
class base_tracking_resource_adaptor : public device_memory_resource {
public:
virtual std::size_t get_total_allocated() = 0;

virtual std::size_t get_max_outstanding() = 0;

virtual void reset_local_max_outstanding(std::size_t initial_value) = 0;

virtual std::size_t get_local_max_outstanding() = 0;
};

@@ -79,10 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

std::size_t get_total_allocated() override { return total_allocated.load(); }

std::size_t get_max_outstanding() override { return max_outstanding; }

void reset_local_max_outstanding(std::size_t initial_value) override {
local_max_outstanding = initial_value;
// keep track of where we currently are when the reset call is issued
local_allocated = total_allocated;
}

std::size_t get_local_max_outstanding() override { return local_max_outstanding; }

private:
Upstream *const resource;
std::size_t const size_align;
std::atomic_size_t total_allocated{0};
std::size_t max_outstanding{0};
std::size_t local_allocated{0};
std::size_t local_max_outstanding{0};

void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
// adjust size of allocation based on specified size alignment
@@ -91,6 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
auto result = resource->allocate(num_bytes, stream);
if (result) {
total_allocated += num_bytes;

// Note: none of the below is thread safe. It is only meaningful when
// a single thread is used.
max_outstanding = std::max(total_allocated.load(), max_outstanding);

// `total_allocated - local_allocated` can be negative in the case where we free
// after we call `reset_local_max_outstanding`
std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
jbrennan333 (Contributor) commented on Oct 20, 2022:

Maybe use static_cast<intptr_t> instead of static_cast<long>. I don't think long is guaranteed to be big enough to hold a size_t.

Contributor Author replied:

So I think long == long long here (this comes down to 32-bit vs 64-bit compiled programs). To cover all of std::size_t, I'd have to go to unsigned long. That's a lot of GPU memory ;) I am not sure we need to worry too much about that, especially since we are going to send this to Spark shortly, which runs Java, and Java's long is 64-bit and signed.

size_t max value: 18446744073709551615
long max value: 9223372036854775807
unsigned long max value: 18446744073709551615
long long max value: 9223372036854775807
unsigned long long max value: 18446744073709551615

Contributor replied:

I think in this case long is sufficient because we are on an LP64 architecture (we don't run on Windows, do we?).
std::intptr_t is guaranteed to be the same width as std::size_t, but signed (I don't think ssize_t is standard?). You could use int64_t here since, as you say, we know we are going to pass it to Java, which uses 64 bits. This was more of a technical nit than an actual concern that it will break (too much history cross-porting to different architectures...).

ttnghia (Contributor) commented on Oct 20, 2022:

These types are confusing because they can be aliases of other types depending on the system. Therefore, for clarity, please always use the fixed-width types: (u)int32_t and (u)int64_t. They guarantee known limits.

Contributor Author replied:

So this is a debug feature, we don't use the (u)int* types here right now (i.e. I'd make things more inconsistent unless I changed the whole thing), and I am not sure whether cuDF is moving away from the alias types. I think we can update in a single future PR that moves away from these old types to the better ones.

local_max_outstanding = std::max(local_diff, local_max_outstanding);
}
return result;
}
@@ -132,6 +160,26 @@ std::size_t get_total_bytes_allocated() {
return 0;
}

std::size_t get_max_outstanding() {
if (Tracking_memory_resource) {
return Tracking_memory_resource->get_max_outstanding();
}
return 0;
}

void reset_local_max_outstanding(std::size_t initial_value) {
if (Tracking_memory_resource) {
Tracking_memory_resource->reset_local_max_outstanding(initial_value);
}
}

std::size_t get_local_max_outstanding() {
if (Tracking_memory_resource) {
return Tracking_memory_resource->get_local_max_outstanding();
}
return 0;
}

/**
* @brief An RMM device memory resource adaptor that delegates to the wrapped resource
* for most operations but will call Java to handle certain situations (e.g.: allocation failure).
@@ -455,6 +503,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getTotalBytesAllocated(JNIEnv *e
return get_total_bytes_allocated();
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumOutstanding(JNIEnv *env, jclass) {
return get_max_outstanding();
}

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetLocalMaximumOutstandingInternal(
JNIEnv *env, jclass, jlong initialValue) {
reset_local_max_outstanding(initialValue);
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getLocalMaximumOutstanding(JNIEnv *env, jclass) {
return get_local_max_outstanding();
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
jlong stream) {
try {
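
As a footnote to the type-width discussion in the review comments above, here is a tiny Java sketch (illustrative only; the class name is hypothetical) of why a signed 64-bit value is the effective ceiling once these counters cross the JNI boundary: Java's long is always 64-bit and signed, while size_t on an LP64 system is unsigned 64-bit.

public class JniLongLimits {
  public static void main(String[] args) {
    // Java's long is 64-bit and signed on every platform.
    System.out.println(Long.MAX_VALUE);             // 9223372036854775807
    // The unsigned 64-bit maximum (size_t on LP64), printed for comparison.
    System.out.println(Long.toUnsignedString(-1L)); // 18446744073709551615
  }
}
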
83 changes: 83 additions & 0 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getMaximumOutstanding());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(1024, Rmm.getMaximumOutstanding());
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testLocalMaxOutstanding(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getLocalMaximumOutstanding());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
assertEquals(2048, Rmm.getLocalMaximumOutstanding());

Rmm.resetLocalMaximumOutstanding(0);
assertEquals(0, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());

DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(1024, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());
assertEquals(0, Rmm.getTotalBytesAllocated());

// a non-zero value is the new minimum
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
ignored2.close();
Rmm.resetLocalMaximumOutstanding(10000);
assertEquals(10000, Rmm.getLocalMaximumOutstanding());
assertEquals(2048, Rmm.getMaximumOutstanding());

try (DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
Rmm.resetLocalMaximumOutstanding(1024);
try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
assertEquals(20480, Rmm.getLocalMaximumOutstanding());
assertEquals(21504, Rmm.getMaximumOutstanding());
}
}
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getMaximumOutstanding());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
assertEquals(2048, Rmm.getLocalMaximumOutstanding());
Rmm.resetLocalMaximumOutstanding();
assertEquals(0, Rmm.getLocalMaximumOutstanding());
}
// the local maximum stays 0 because we have freed a net 2048 bytes since the reset
assertEquals(0, Rmm.getLocalMaximumOutstanding());
DeviceMemoryBuffer ignored = Rmm.alloc(1024);
ignored.close();
assertEquals(0, Rmm.getLocalMaximumOutstanding());

// if we allocate 2KB and then 256B we start seeing a positive local maximum
try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
assertEquals(256, Rmm.getLocalMaximumOutstanding());
}
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,