NVIDIA · rongou · Jun 3, 2021 · May 20, 2021 · May 20, 2021 · May 20, 2021
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
@@ -162,6 +162,8 @@ object GpuDeviceManager extends Logging {
     }
   }
 
+  private def toKB(x: Long): Double = x.toDouble / 1024 // bytes to KB, keeps the fraction
+
   private def toMB(x: Long): Double = x / 1024 / 1024.0
 
   private def computeRmmInitSizes(conf: RapidsConf, info: CudaMemInfo): (Long, Long) = {
@@ -263,9 +265,18 @@ object GpuDeviceManager extends Logging {
         logInfo("Using legacy default stream")
       }
 
+      val (allocationAlignment, alignmentThreshold) = if (conf.isGdsSpillEnabled) {
+        logInfo(s"Using allocation alignment = ${toKB(RapidsGdsStore.AllocationAlignment)} KB, " +
+            s"alignment threshold = ${toKB(RapidsGdsStore.AlignmentThreshold)} KB")
+        (RapidsGdsStore.AllocationAlignment, RapidsGdsStore.AlignmentThreshold)
+      } else {
+        (0L, 0L)
+      }
+
       try {
         Cuda.setDevice(gpuId)
-        Rmm.initialize(init, logConf, initialAllocation, maxAllocation)
+        Rmm.initialize(
+          init, logConf, initialAllocation, maxAllocation, allocationAlignment, alignmentThreshold)
         RapidsBufferCatalog.init(conf)
         GpuShuffleEnv.init(conf)
       } catch {

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsGdsStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsGdsStore.scala
@@ -70,7 +70,8 @@ class RapidsGdsStore(
 
     override def materializeMemoryBuffer: MemoryBuffer = {
       closeOnExcept(DeviceMemoryBuffer.allocate(size)) { buffer =>
-        CuFile.readFileToDeviceBuffer(buffer, path, fileOffset)
+        CuFile.readFileToDeviceMemory(
+          buffer.getAddress, RapidsGdsStore.alignBufferSize(buffer), path, fileOffset)
         logDebug(s"Created device buffer for $path $fileOffset:$size via GDS")
         buffer
       }
@@ -83,7 +84,8 @@ class RapidsGdsStore(
           val dm = dmOriginal.slice(dstOffset, length)
           // TODO: switch to async API when it's released, using the passed in CUDA stream.
           stream.sync()
-          CuFile.readFileToDeviceBuffer(dm, path, fileOffset + srcOffset)
+          CuFile.readFileToDeviceMemory(
+            dm.getAddress, RapidsGdsStore.alignBufferSize(dm), path, fileOffset + srcOffset)
           logDebug(s"Created device buffer for $path $fileOffset:$size via GDS")
         case _ => throw new IllegalStateException(
           s"GDS can only copy to device buffer, not ${dst.getClass}")
@@ -102,17 +104,18 @@ class RapidsGdsStore(
   }
 
   private def singleShotSpill(other: RapidsBuffer, deviceBuffer: DeviceMemoryBuffer)
-      : RapidsBufferBase = {
+  : RapidsBufferBase = {
     val id = other.id
     val path = id.getDiskPath(diskBlockManager)
+    val alignedSize = RapidsGdsStore.alignBufferSize(deviceBuffer)
     // When sharing files, append to the file; otherwise, write from the beginning.
     val fileOffset = if (id.canShareDiskPaths) {
       // only one writer at a time for now when using shared files
       path.synchronized {
-        CuFile.appendDeviceBufferToFile(path, deviceBuffer)
+        CuFile.appendDeviceMemoryToFile(path, deviceBuffer.getAddress, alignedSize)
       }
     } else {
-      CuFile.writeDeviceBufferToFile(path, 0, deviceBuffer)
+      CuFile.writeDeviceMemoryToFile(path, 0, deviceBuffer.getAddress, alignedSize)
       0
     }
     logDebug(s"Spilled to $path $fileOffset:${other.size} via GDS")
@@ -121,7 +124,6 @@ class RapidsGdsStore(
   }
 
   class BatchSpiller() {
-    private val blockSize = 4096
     private[this] val spilledBuffers = new ConcurrentHashMap[File, Set[RapidsBufferId]]
     private[this] val pendingBuffers = ArrayBuffer.empty[RapidsGdsBatchedBuffer]
     private[this] val batchWriteBuffer = CuFileBuffer.allocate(batchWriteBufferSize, true)
@@ -149,15 +151,11 @@ class RapidsGdsStore(
         addBuffer(currentFile, id)
         val gdsBuffer = new RapidsGdsBatchedBuffer(id, currentFile, currentOffset,
           other.size, other.meta, other.getSpillPriority, other.spillCallback)
-        currentOffset += alignUp(deviceBuffer.getLength)
+        currentOffset += RapidsGdsStore.alignUp(deviceBuffer.getLength)
         pendingBuffers += gdsBuffer
         gdsBuffer
       }
 
-    private def alignUp(length: Long): Long = {
-      (length + blockSize - 1) & ~(blockSize - 1)
-    }
-
     private def copyToBuffer(
         buffer: MemoryBuffer, offset: Long, size: Long, stream: Cuda.Stream): Unit = {
       buffer.copyFromMemoryBuffer(0, batchWriteBuffer, offset, size, stream)
@@ -208,7 +206,8 @@ class RapidsGdsStore(
             Cuda.DEFAULT_STREAM.sync()
             logDebug(s"Created device buffer $size from batch write buffer")
           } else {
-            CuFile.readFileToDeviceBuffer(buffer, path, fileOffset)
+            CuFile.readFileToDeviceMemory(
+              buffer.getAddress, RapidsGdsStore.alignBufferSize(buffer), path, fileOffset)
             logDebug(s"Created device buffer for $path $fileOffset:$size via GDS")
           }
           buffer
@@ -227,7 +226,8 @@ class RapidsGdsStore(
             } else {
               // TODO: switch to async API when it's released, using the passed in CUDA stream.
               stream.sync()
-              CuFile.readFileToDeviceBuffer(dm, path, fileOffset + srcOffset)
+              CuFile.readFileToDeviceMemory(
+                dm.getAddress, RapidsGdsStore.alignBufferSize(dm), path, fileOffset + srcOffset)
               logDebug(s"Created device buffer for $path $fileOffset:$size via GDS")
             }
           case _ => throw new IllegalStateException(
@@ -252,4 +252,22 @@ class RapidsGdsStore(
       }
     }
   }
-}
+}
+
+/** Alignment constants (also passed to Rmm.initialize) and size helpers for GDS I/O. */
+object RapidsGdsStore {
+  val AllocationAlignment = 4096L // bytes; the mask math in alignUp needs a power of two
+  val AlignmentThreshold = 65536L // bytes; shorter buffers keep their exact length
+
+  /** Rounds `length` up to the next multiple of `AllocationAlignment`. */
+  def alignUp(length: Long): Long = {
+    val mask = AllocationAlignment - 1
+    (length + mask) & ~mask
+  }
+
+  /** Length of `buffer`, rounded up by `alignUp` once it reaches `AlignmentThreshold`. */
+  def alignBufferSize(buffer: DeviceMemoryBuffer): Long = {
+    val size = buffer.getLength
+    if (size >= AlignmentThreshold) alignUp(size) else size
+  }
+}