rapidsai · rapids-bot · Sep 13, 2021 · Sep 9, 2021 · Sep 9, 2021 · Sep 9, 2021
@@ -85,7 +85,8 @@ public static LogConf logToStderr() {
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param enableLogging  Enable logging memory manager events
    * @param poolSize       The initial pool size in bytes
@@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param enableLogging  Enable logging memory manager events
    * @param poolSize       The initial pool size in bytes
@@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @throws IllegalStateException if RMM has already been initialized
    * @throws IllegalArgumentException if a max pool size is specified but the allocation mode
    *                                  is not {@link RmmAllocationMode#POOL} or
-   *                                  {@link RmmAllocationMode#ARENA}, or the maximum pool size is
-   *                                  below the initial size.
+   *                                  {@link RmmAllocationMode#ARENA} or
+   *                                  {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
+   *                                  size is below the initial size.
    */
   public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
       long maxPoolSize) throws RmmException {
@@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -198,18 +204,21 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @throws IllegalStateException if RMM has already been initialized
    * @throws IllegalArgumentException if a max pool size is specified but the allocation mode
    *                                  is not {@link RmmAllocationMode#POOL} or
-   *                                  {@link RmmAllocationMode#ARENA}, or the maximum pool size is
-   *                                  below the initial size.
+   *                                  {@link RmmAllocationMode#ARENA} or
+   *                                  {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
+   *                                  size is below the initial size.
    */
   public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
       long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException {
     if (initialized) {
       throw new IllegalStateException("RMM is already initialized");
     }
     if (maxPoolSize > 0) {
-      if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) {
+      if ((allocationMode & RmmAllocationMode.POOL) == 0 &&
+          (allocationMode & RmmAllocationMode.ARENA) == 0 &&
+          (allocationMode & RmmAllocationMode.CUDA_ASYNC) == 0) {
         throw new IllegalArgumentException(
-                "Pool limit only supported in POOL or ARENA allocation mode");
+            "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode");
       }
       if (maxPoolSize < poolSize) {
         throw new IllegalArgumentException("Pool limit of " + maxPoolSize

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,4 +32,8 @@ public class RmmAllocationMode {
    * Use arena suballocation strategy
    */
   public static final int ARENA = 0x00000004;
+  /**
+   * Use CUDA async suballocation strategy
+   */
+  public static final int CUDA_ASYNC = 0x00000008;
 }
@@ -22,6 +22,7 @@
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/logging_resource_adaptor.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
@@ -344,6 +345,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
     bool use_pool_alloc = allocation_mode & 1;
     bool use_managed_mem = allocation_mode & 2;
     bool use_arena_alloc = allocation_mode & 4;
+    bool use_cuda_async_alloc = allocation_mode & 8;
     if (use_pool_alloc) {
       auto pool_limit = (max_pool_size > 0) ?
                             thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
@@ -365,6 +367,11 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
         Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
             std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
       }
+    } else if (use_cuda_async_alloc) {
+      auto pool_limit = (max_pool_size > 0) ? thrust::optional<std::size_t>{max_pool_size} :
+                                              thrust::optional<std::size_t>{};
+      Initialized_resource =
+          std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, pool_limit);
     } else if (use_managed_mem) {
       Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
     } else {

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.jupiter.api.Assumptions.assumeFalse;
 
 public class RmmTest {
   private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024;
@@ -51,18 +52,24 @@ public void teardown() {
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testTotalAllocated(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024);
     assertEquals(0, Rmm.getTotalBytesAllocated());
-    try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) {
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
       assertEquals(1024, Rmm.getTotalBytesAllocated());
     }
     assertEquals(0, Rmm.getTotalBytesAllocated());
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testEventHandler(int rmmAllocMode) {
     AtomicInteger invokedCount = new AtomicInteger();
     AtomicLong amountRequested = new AtomicLong();
@@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) {
 
     Rmm.setEventHandler(handler);
     DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024);
-    assertThrows(DeallocThresholdException.class, () -> addr.close());
+    assertThrows(DeallocThresholdException.class, addr::close);
     assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024));
     assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY));
   }
@@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception {
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, false, 1024 * 1024);
     assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1));
@@ -399,9 +409,28 @@ public void testPoolLimitNonPoolMode() {
         () -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048));
   }
 
-  private static class AllocFailException extends RuntimeException {}
-  private static class AllocThresholdException extends RuntimeException {}
-  private static class DeallocThresholdException extends RuntimeException {}
+  @Test
+  public void testCudaAsyncMemoryResource() {
+    try {
+      Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024 * 1024L, 1024 * 1024L);
+    } catch (CudfException e) {
+      assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
+      throw e;
+    }
+    DeviceMemoryBuffer buff = Rmm.alloc(1024);
+    buff.close();
+    buff = Rmm.alloc(2048);
+    buff.close();
+  }
+
+  private static class AllocFailException extends RuntimeException {
+  }
+
+  private static class AllocThresholdException extends RuntimeException {
+  }
+
+  private static class DeallocThresholdException extends RuntimeException {
+  }
 
   private static abstract class BaseRmmEventHandler implements RmmEventHandler {
     @Override