support RMM aligned resource adapter in JNI [skip ci] #8266

Merged (5 commits) on May 20, 2021
Changes from 1 commit
36 changes: 34 additions & 2 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -173,6 +173,36 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize) throws RmmException {
initialize(allocationMode, logConf, poolSize, maxPoolSize, 256, 0);
}

/**
* Initialize memory manager state and storage. This will always initialize
* the CUDA context for the calling thread if it is not already set. The
* caller is responsible for setting the desired CUDA device prior to this
* call if a specific device is already set.
* <p>NOTE: All cudf methods will set the chosen CUDA device in the CUDA
* context of the calling thread after this returns.
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
* @param maxPoolSize The maximum size to which the pool is allowed to grow. If the specified value
* is <= 0 then the pool size will not be artificially limited.
* @param allocationAlignment The size to which allocations are aligned.
* @param alignmentThreshold Only allocations with size larger than or equal to this threshold
* are aligned with `allocationAlignment`.
* @throws IllegalStateException if RMM has already been initialized
* @throws IllegalArgumentException if a max pool size is specified but the allocation mode
* is not {@link RmmAllocationMode#POOL} or
* {@link RmmAllocationMode#ARENA}, or the maximum pool size is
* below the initial size.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}
@@ -195,7 +225,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
loc = logConf.loc;
}

initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize);
initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize,
allocationAlignment, alignmentThreshold);
MemoryCleaner.setDefaultGpu(Cuda.getDevice());
initialized = true;
}
@@ -241,7 +272,8 @@ private static long[] sortThresholds(long[] thresholds) {
}

private static native void initializeInternal(int allocationMode, int logTo, String path,
long poolSize, long maxPoolSize) throws RmmException;
long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold)
throws RmmException;

/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
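For reference, a minimal, hypothetical sketch of calling the new Java overload with the aligned adapter enabled. The pool sizes, 512-byte alignment, and 4096-byte threshold are illustrative values rather than defaults from this PR; it assumes DeviceMemoryBuffer.allocate and Rmm.shutdown from the existing cudf Java API, and uses a null LogConf since the Javadoc allows it:

import ai.rapids.cudf.DeviceMemoryBuffer;
import ai.rapids.cudf.Rmm;
import ai.rapids.cudf.RmmAllocationMode;

public class RmmAlignedInitExample {
  public static void main(String[] args) {
    long poolSize = 1L << 30;     // 1 GiB initial pool (illustrative)
    long maxPoolSize = 2L << 30;  // 2 GiB cap (illustrative)
    // ALIGNED is a bit flag layered on top of a base allocator such as POOL or ARENA.
    Rmm.initialize(RmmAllocationMode.POOL | RmmAllocationMode.ALIGNED,
        null /* no logging */, poolSize, maxPoolSize,
        512 /* allocationAlignment */, 4096 /* alignmentThreshold */);
    try (DeviceMemoryBuffer buf = DeviceMemoryBuffer.allocate(8192)) {
      // 8192 >= alignmentThreshold, so this allocation is handled by the aligned adapter.
    } finally {
      Rmm.shutdown();
    }
  }
}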
4 changes: 4 additions & 0 deletions java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
@@ -32,4 +32,8 @@ public class RmmAllocationMode {
* Use arena suballocation strategy
*/
public static final int ARENA = 0x00000004;
/**
* Use aligned resource adapter for allocation
*/
public static final int ALIGNED = 0x00000008;
}
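Since the allocation mode is a bit set, ALIGNED selects an adapter that is layered on top of whichever base allocator the other bits choose (the plain CUDA resource if none); the native side simply tests each bit, as RmmJni.cpp below does with allocation_mode & 8. A tiny illustrative check, with hypothetical variable names:

int mode = RmmAllocationMode.ARENA | RmmAllocationMode.ALIGNED;  // 0x4 | 0x8
boolean usesArena = (mode & RmmAllocationMode.ARENA) != 0;       // true
boolean usesAligned = (mode & RmmAllocationMode.ALIGNED) != 0;   // true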
27 changes: 14 additions & 13 deletions java/src/main/native/src/RmmJni.cpp
@@ -20,6 +20,7 @@
#include <iostream>
#include <limits>

#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/logging_resource_adaptor.hpp>
@@ -332,7 +333,9 @@ extern "C" {
JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz,
jint allocation_mode, jint log_to,
jstring jpath, jlong pool_size,
jlong max_pool_size) {
jlong max_pool_size,
jlong allocation_alignment,
jlong alignment_threshold) {
try {
// make sure the CUDA device is setup in the context
cudaError_t cuda_status = cudaFree(0);
@@ -344,44 +347,42 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
bool use_arena_alloc = allocation_mode & 4;
bool use_aligned_adapter = allocation_mode & 8;
if (use_pool_alloc) {
auto pool_limit = (max_pool_size > 0) ?
thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
thrust::nullopt;
if (use_managed_mem) {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_limit);
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
} else {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
}
} else if (use_arena_alloc) {
std::size_t pool_limit = (max_pool_size > 0) ? static_cast<std::size_t>(max_pool_size) :
std::numeric_limits<std::size_t>::max();
if (use_managed_mem) {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::managed_memory_resource>(), pool_size, pool_limit);
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
} else {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
}
} else if (use_managed_mem) {
Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
} else {
Initialized_resource = std::make_shared<rmm::mr::cuda_memory_resource>();
auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);
Tracking_memory_resource.reset(wrapped);
}

if (use_aligned_adapter) {
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::aligned_resource_adaptor>(
Initialized_resource, allocation_alignment, alignment_threshold);
}

auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT);

Member: RMM_ALLOC_SIZE_ALIGNMENT seems wrong if we know we're using the aligned adapter and a different alignment. I think there needs to be a max(RMM_ALLOC_SIZE_ALIGNMENT, allocation_alignment) or something similar here.

Contributor Author: Hmm, the actual allocation size for the aligned adapter is a bit complicated. Just curious, why are we tracking the total allocation size ourselves and not using the get_mem_info() method from the device memory resource?

Member: Because many device memory resource implementations do not implement get_mem_info(), the ARENA allocator apparently being one of them:

$ git grep supports_get_mem_info | grep false
benchmarks/utilities/simulated_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }
include/rmm/mr/device/arena_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }
include/rmm/mr/device/binning_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }
include/rmm/mr/device/cuda_async_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }
include/rmm/mr/device/fixed_size_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }
include/rmm/mr/device/pool_memory_resource.hpp:  bool supports_get_mem_info() const noexcept override { return false; }

Contributor Author: Yeah, I guess that API is really only for CUDA. Added the max of the two alignment sizes.

Tracking_memory_resource.reset(wrapped);

auto resource = Tracking_memory_resource.get();
rmm::mr::set_current_device_resource(resource);

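To relate the review thread above to the Java side: cudf wraps the initialized resource in its own tracking adaptor so it can report total allocated bytes without relying on get_mem_info(). A brief, hypothetical sketch of what that makes observable, continuing the earlier RmmAlignedInitExample class; it assumes the tracked total is exposed via Rmm.getTotalBytesAllocated() (if that accessor is named differently in your cudf version, the idea is unchanged):

// Continuing RmmAlignedInitExample: observe the tracked allocation total after one allocation.
static void printTrackedBytes() {
  long before = Rmm.getTotalBytesAllocated();
  try (DeviceMemoryBuffer buf = DeviceMemoryBuffer.allocate(5000)) {
    // The tracking adaptor rounds each request to its size alignment before accounting for it;
    // the review thread above is about making that rounding use
    // max(RMM_ALLOC_SIZE_ALIGNMENT, allocation_alignment) once the aligned adapter is active.
    System.out.println("tracked bytes for this allocation: " + (Rmm.getTotalBytesAllocated() - before));
  }
}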