Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set RMM pool to a fixed size in JNI #9583

Merged
merged 3 commits into the base branch from the source branch on Nov 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 26 additions & 90 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -88,68 +88,37 @@ public static LogConf logToStderr() {
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
* @throws IllegalStateException if RMM has already been initialized
*/
public static void initialize(int allocationMode, boolean enableLogging, long poolSize)
jlowe marked this conversation as resolved.
Show resolved Hide resolved
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize)
throws RmmException {
initialize(allocationMode, enableLogging, poolSize, 0);
}
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}

/**
* Initialize memory manager state and storage. This will always initialize
* the CUDA context for the calling thread if it is not already set. The
* caller is responsible for setting the desired CUDA device prior to this
* call if a specific device is already set.
* <p>NOTE: All cudf methods will set the chosen CUDA device in the CUDA
* context of the calling thread after this returns.
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param poolSize The initial pool size in bytes
* @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value
* is <= 0 then the maximum pool size will not be artificially limited.
* @throws IllegalStateException if RMM has already been initialized
*/
public static void initialize(int allocationMode, boolean enableLogging, long poolSize,
long maxPoolSize) throws RmmException {
LogConf lc = null;
if (enableLogging) {
String f = System.getenv("RMM_LOG_FILE");
if (f != null) {
lc = logTo(new File(f));
} else {
lc = logToStderr();
boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;

if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
LogLoc loc = LogLoc.NONE;
String path = null;
if (logConf != null) {
if (logConf.file != null) {
path = logConf.file.getAbsolutePath();
}
loc = logConf.loc;
}
initialize(allocationMode, lc, poolSize, maxPoolSize);
}

/**
* Initialize memory manager state and storage. This will always initialize
* the CUDA context for the calling thread if it is not already set. The
* caller is responsible for setting the desired CUDA device prior to this
* call if a specific device is already set.
* <p>NOTE: All cudf methods will set the chosen CUDA device in the CUDA
* context of the calling thread after this returns.
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
* @throws IllegalStateException if RMM has already been initialized
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize)
throws RmmException {
initialize(allocationMode, logConf, poolSize, 0);
initializeInternal(allocationMode, loc.internalId, path, poolSize);
MemoryCleaner.setDefaultGpu(Cuda.getDevice());
initialized = true;
}

/**
Expand All @@ -175,44 +144,11 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* {@link RmmAllocationMode#ARENA} or
* {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
* size is below the initial size.
* @deprecated Use the version without the maxPoolSize parameter instead.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize) throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}

boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;

if (maxPoolSize > 0) {
if (!isPool && !isArena && !isAsync) {
throw new IllegalArgumentException(
"Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode");
}
if (maxPoolSize < poolSize) {
throw new IllegalArgumentException("Pool limit of " + maxPoolSize
+ " is less than initial pool size of " + poolSize);
}
}
if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
LogLoc loc = LogLoc.NONE;
String path = null;
if (logConf != null) {
if (logConf.file != null) {
path = logConf.file.getAbsolutePath();
}
loc = logConf.loc;
}

initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize);
MemoryCleaner.setDefaultGpu(Cuda.getDevice());
initialized = true;
initialize(allocationMode, logConf, poolSize);
}

/**
Expand Down Expand Up @@ -256,7 +192,7 @@ private static long[] sortThresholds(long[] thresholds) {
}

private static native void initializeInternal(int allocationMode, int logTo, String path,
long poolSize, long maxPoolSize) throws RmmException;
long poolSize) throws RmmException;

/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
Expand Down
24 changes: 6 additions & 18 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,7 @@ extern "C" {

JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz,
jint allocation_mode, jint log_to,
jstring jpath, jlong pool_size,
jlong max_pool_size) {
jstring jpath, jlong pool_size) {
try {
// make sure the CUDA device is setup in the context
cudaError_t cuda_status = cudaFree(0);
Expand All @@ -339,37 +338,26 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
bool use_arena_alloc = allocation_mode & 4;
bool use_cuda_async_alloc = allocation_mode & 8;
if (use_pool_alloc) {
auto pool_limit = (max_pool_size > 0) ?
thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
thrust::nullopt;
if (use_managed_mem) {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_limit);
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_size);
} else {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_size);
}
} else if (use_arena_alloc) {
std::size_t pool_limit = (max_pool_size > 0) ? static_cast<std::size_t>(max_pool_size) :
std::numeric_limits<std::size_t>::max();
if (use_managed_mem) {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_limit);
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_size);
} else {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_size);
}
} else if (use_cuda_async_alloc) {
auto const pool_limit = max_pool_size > 0 ? static_cast<std::size_t>(max_pool_size) :
std::numeric_limits<std::size_t>::max();
auto const release_threshold = max_pool_size > 0 ?
thrust::optional<std::size_t>{max_pool_size} :
thrust::optional<std::size_t>{};
// Use `limiting_resource_adaptor` to set a hard limit on the max pool size since
// `cuda_async_memory_resource` only has a release threshold.
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::limiting_resource_adaptor>(
std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, release_threshold),
pool_limit);
std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, pool_size), pool_size);
} else if (use_managed_mem) {
Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
} else {
Expand Down
2 changes: 1 addition & 1 deletion java/src/test/java/ai/rapids/cudf/CudfTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public CudfTestBase(int allocationMode, long poolSize) {
void beforeEach() {
assumeTrue(Cuda.isEnvCompatibleForTesting());
if (!Rmm.isInitialized()) {
Rmm.initialize(rmmAllocationMode, false, rmmPoolSize);
Rmm.initialize(rmmAllocationMode, Rmm.logToStderr(), rmmPoolSize);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public void init() {
Rmm.shutdown();
}
assertFalse(Rmm.isInitialized());
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, true, -1);
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), -1);
assertTrue(Rmm.isInitialized());
Rmm.shutdown();
assertFalse(Rmm.isInitialized());
Expand All @@ -74,7 +74,7 @@ public void shutdown() {
if (Rmm.isInitialized()) {
Rmm.shutdown();
}
Rmm.initialize(RmmAllocationMode.POOL, false, 2048);
Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 2048);
try (DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(1024)) {
assertThrows(RmmException.class, () -> Rmm.shutdown(500, 2000, TimeUnit.MILLISECONDS));
}
Expand All @@ -91,9 +91,9 @@ public void allocate() {
@Test
public void doubleInitFails() {
if (!Rmm.isInitialized()) {
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0);
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0);
}
assertThrows(IllegalStateException.class,
() -> Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024));
() -> Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024));
}
}
62 changes: 19 additions & 43 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public void teardown() {
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testTotalAllocated(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024);
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024);
assertEquals(0, Rmm.getTotalBytesAllocated());
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getTotalBytesAllocated());
Expand Down Expand Up @@ -110,7 +110,7 @@ public boolean onAllocFailure(long sizeRequested) {

@Test
public void testSetEventHandlerTwice() {
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0L);
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L);
// installing an event handler the first time should not be an error
Rmm.setEventHandler(new BaseRmmEventHandler() {
@Override
Expand All @@ -131,7 +131,7 @@ public boolean onAllocFailure(long sizeRequested) {

@Test
public void testClearEventHandler() {
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0L);
Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L);
// clearing the event handler when it isn't set is not an error
Rmm.clearEventHandler();

Expand Down Expand Up @@ -161,7 +161,7 @@ public void testAllocOnlyThresholds() {
final AtomicInteger deallocInvocations = new AtomicInteger(0);
final AtomicLong allocated = new AtomicLong(0);

Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L);
Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L);

RmmEventHandler handler = new RmmEventHandler() {
@Override
Expand Down Expand Up @@ -304,7 +304,7 @@ public void onDeallocThreshold(long totalAllocSize) {

@Test
public void testExceptionHandling() {
Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L);
Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L);

RmmEventHandler handler = new RmmEventHandler() {
@Override
Expand Down Expand Up @@ -344,7 +344,7 @@ public void onDeallocThreshold(long totalAllocSize) {
public void testThreadAutoDeviceSetup() throws Exception {
// A smoke-test for automatic CUDA device setup for threads calling
// into cudf. Hard to fully test without requiring multiple CUDA devices.
Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L);
Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L);
DeviceMemoryBuffer buff = Rmm.alloc(1024);
try {
ExecutorService executor = Executors.newSingleThreadExecutor();
Expand All @@ -368,62 +368,38 @@ public void testThreadAutoDeviceSetup() throws Exception {
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 1024 * 1024);
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 1024 * 1024);
assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1));
// Verify that auto set device does not
Cuda.autoSetDevice();
}

@Test
public void testPoolGrowth() {
Rmm.initialize(RmmAllocationMode.POOL, false, 1024);
try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024);
DeviceMemoryBuffer ignored2 = Rmm.alloc(2048);
DeviceMemoryBuffer ignored3 = Rmm.alloc(4096)) {
assertEquals(7168, Rmm.getTotalBytesAllocated());
}
}

@Test
public void testPoolLimit() {
Rmm.initialize(RmmAllocationMode.POOL, false, 1024, 2048);
try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
public void testPoolSize() {
Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024);
try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024)) {
assertThrows(OutOfMemoryError.class,
() -> {
DeviceMemoryBuffer ignored3 = Rmm.alloc(1024);
ignored3.close();
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
ignored2.close();
});
}
}

@Test
public void testPoolLimitLessThanInitialSize() {
assertThrows(IllegalArgumentException.class,
() -> Rmm.initialize(RmmAllocationMode.POOL, false, 10240, 1024));
}

@Test
public void testPoolLimitNonPoolMode() {
assertThrows(IllegalArgumentException.class,
() -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048));
}

@Test
public void testCudaAsyncMemoryResourceLimit() {
public void testCudaAsyncMemoryResourceSize() {
try {
Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048);
Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, Rmm.logToStderr(), 1024);
} catch (CudfException e) {
// CUDA 11.2 introduced cudaMallocAsync, older CUDA Toolkit will skip this test.
assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
throw e;
}
try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512);
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024)) {
assertThrows(OutOfMemoryError.class,
() -> {
DeviceMemoryBuffer ignored3 = Rmm.alloc(1024);
ignored3.close();
DeviceMemoryBuffer ignored2 = Rmm.alloc(1024);
ignored2.close();
});
}
}
Expand All @@ -433,12 +409,12 @@ public void testCudaAsyncIsIncompatibleWithManaged() {
assertThrows(IllegalArgumentException.class,
() -> Rmm.initialize(
RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY,
false, 1024, 2048));
Rmm.logToStderr(), 1024));
}

@Test
public void testCudaMemoryBuffer() {
Rmm.initialize(RmmAllocationMode.ARENA, false, 1024);
Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 1024);
try (CudaMemoryBuffer one = CudaMemoryBuffer.allocate(512);
CudaMemoryBuffer two = CudaMemoryBuffer.allocate(1024)) {
assertEquals(512, one.length);
Expand Down