Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu memory watermark apis to JNI #11950

Merged
merged 15 commits into from
Oct 24, 2022
Merged
12 changes: 5 additions & 7 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,6 @@ public static boolean isInitialized() throws RmmException {
* Resets a local maximum counter of RMM memory used to keep track of usage between
* sections of code while debugging.
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
*
* @param initialValue an initial value (in Bytes) to use for this local counter
*/
public static void resetLocalMaximumOutstanding(long initialValue) {
jlowe marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -162,9 +159,6 @@ public static void resetLocalMaximumOutstanding(long initialValue) {
* Resets a local maximum counter of RMM memory used to keep track of usage between
* sections of code while debugging.
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
*
* This resets the counter to 0 Bytes.
*/
public static void resetLocalMaximumOutstanding() {
Expand All @@ -180,7 +174,11 @@ public static void resetLocalMaximumOutstanding() {
*
* Note that this result is meaningful when a single thread is using the GPU, or
* when we have joined all threads and CUDA synchronized with all streams.
* @return
*
* If the memory used is net negative (for example, if only frees happened since the
* last reset, and we reset to 0), then the result will be 0 until the next reset
jlowe marked this conversation as resolved.
Show resolved Hide resolved
*
* @return the local maximum in Bytes
*/
public static native long getLocalMaximumOutstanding();

Expand Down
26 changes: 15 additions & 11 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,23 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

std::size_t get_total_allocated() override { return total_allocated.load(); }

std::size_t get_max_outstanding() override { return max_outstanding.load(); }
std::size_t get_max_outstanding() override { return max_outstanding; }

void reset_local_max_outstanding(std::size_t initial_value) override {
local_max_outstanding = initial_value;
local_allocated = initial_value;
// keep track of where we currently are when the reset call is issued
local_allocated = total_allocated;
}

std::size_t get_local_max_outstanding() override { return local_max_outstanding.load(); }
std::size_t get_local_max_outstanding() override { return local_max_outstanding; }

private:
Upstream *const resource;
std::size_t const size_align;
std::atomic_size_t total_allocated{0};
std::atomic_size_t max_outstanding{0};
std::atomic_size_t local_allocated{0};
std::atomic_size_t local_max_outstanding{0};
std::size_t max_outstanding{0};
std::size_t local_allocated{0};
std::size_t local_max_outstanding{0};

void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
// adjust size of allocation based on specified size alignment
Expand All @@ -109,11 +110,15 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
auto result = resource->allocate(num_bytes, stream);
if (result) {
total_allocated += num_bytes;
local_allocated += num_bytes;

// Note: this is not thread safe.
max_outstanding.store(std::max(total_allocated, max_outstanding));
local_max_outstanding.store(std::max(local_allocated, local_max_outstanding));
// Note: none of the below is thread safe. It is only meaningful when
// a single thread is used.
max_outstanding = std::max(total_allocated.load(), max_outstanding);

// `total_allocated - local_allocated` can be negative in the case where we free
// after we call `reset_local_max_outstanding`
std::size_t local_diff = std::max(static_cast<long>(total_allocated - local_allocated), 0L);
Copy link
Contributor

@jbrennan333 jbrennan333 Oct 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use static_cast<intptr_t> instead of static_cast<long>
I don't think long is guaranteed to be big enough to hold a size_t.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think long == long long (I think this is 32bit vs 64bit compiled programs). To cover all of std::size_t, I'd have to go to unsigned long. That's a lot of GPU memory ;) I am not sure we need to worry too much about that, especially since we are going to send this to Spark shortly, which runs java, and java's long is 64-bit and signed.

size_t max value: 18446744073709551615
long max value: 9223372036854775807
unsigned long max value: 18446744073709551615
long long max value: 9223372036854775807
unsigned long long max value: 18446744073709551615

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in this case, long is sufficient because we are an LP64 architecture (we don't run on Windows, do we?).
std::intptr_t is guaranteed to be the same width as std::size_t, but signed (I don't think ssize_t is standard?). You could use int64_t here, since as you say we know we are going to pass it to java, which is using 64 bits. This was more of a technical nit, than an actual concern that it will break (too much history cross-porting to different architectures...)

Copy link
Contributor

@ttnghia ttnghia Oct 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These types are very confusing which can be alias of other types depending on the system. Therefore, for clarity, please always use the fix-width types: (u)int32_t and (u)int64_t. They guarantee you to have known limits.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is a debug feature, we don't use (u)int* right now (e.g. I'd make things more inconsistent unless I change the whole thing), and I am not sure whether cuDF is moving away from the alias types. I think we can update in one PR that is "go away from these old types to the better ones" in the future.

local_max_outstanding = std::max(local_diff, local_max_outstanding);
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}
return result;
}
Expand All @@ -125,7 +130,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {

if (p) {
total_allocated -= size;
local_allocated -= size;
}
}

Expand Down
83 changes: 83 additions & 0 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,89 @@ public void testTotalAllocated(int rmmAllocMode) {
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testMaxOutstanding(int rmmAllocMode) {
  // Verify the global high-watermark counter: it rises with an allocation and
  // stays put after the buffer is freed.
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer buffer = Rmm.alloc(1024)) {
    assertEquals(1024, Rmm.getMaximumOutstanding());
  }
  // The buffer is closed, so nothing is outstanding, but the watermark sticks.
  assertEquals(0, Rmm.getTotalBytesAllocated());
  assertEquals(1024, Rmm.getMaximumOutstanding());
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testLocalMaxOutstanding(int rmmAllocMode) {
  // Verify the resettable local high-watermark counter: it tracks peak usage
  // since the last reset, independently of the global maximum.
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
       DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
  }
  assertEquals(0, Rmm.getTotalBytesAllocated());
  assertEquals(2048, Rmm.getLocalMaximumOutstanding());

  // Resetting the local counter does not disturb the global maximum.
  Rmm.resetLocalMaximumOutstanding(0);
  assertEquals(0, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());

  // try-with-resources guarantees the buffer is freed even if an assertion
  // throws, unlike a manual alloc()/close() pair.
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
  }
  assertEquals(1024, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());
  assertEquals(0, Rmm.getTotalBytesAllocated());

  // a non-zero value is the new minimum
  try (DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
  }
  Rmm.resetLocalMaximumOutstanding(10000);
  assertEquals(10000, Rmm.getLocalMaximumOutstanding());
  assertEquals(2048, Rmm.getMaximumOutstanding());

  try (DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) {
    // Reset while 1KB is still outstanding: the next allocation below grows
    // the local watermark from that baseline.
    Rmm.resetLocalMaximumOutstanding(1024);
    try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) {
      assertEquals(20480, Rmm.getLocalMaximumOutstanding());
      assertEquals(21504, Rmm.getMaximumOutstanding());
    }
  }
}

@ParameterizedTest
@ValueSource(ints = {
    RmmAllocationMode.CUDA_DEFAULT,
    RmmAllocationMode.POOL,
    RmmAllocationMode.ARENA})
public void testLocalMaxOutstandingNegative(int rmmAllocMode) {
  // Verify the local watermark clamps at 0 when net memory usage since the
  // reset point is negative (more frees than allocations).
  Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
  assertEquals(0, Rmm.getMaximumOutstanding());
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024);
       DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
    assertEquals(2048, Rmm.getLocalMaximumOutstanding());
    Rmm.resetLocalMaximumOutstanding();
    assertEquals(0, Rmm.getLocalMaximumOutstanding());
  }
  // because we allocated a net -2048 Bytes since reset
  assertEquals(0, Rmm.getLocalMaximumOutstanding());
  // try-with-resources guarantees the buffer is freed even if an assertion
  // throws, unlike a manual alloc()/close() pair.
  try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
  }
  assertEquals(0, Rmm.getLocalMaximumOutstanding());

  // if we allocate 2KB and then 256B we start seeing a positive local maximum
  try (DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
       DeviceMemoryBuffer ignored3 = Rmm.alloc(256)) {
    assertEquals(256, Rmm.getLocalMaximumOutstanding());
  }
}

@ParameterizedTest
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
Expand Down