From b48b801201ea43254b5e3f9550656f6fa7c4adbe Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Fri, 22 Nov 2024 07:31:01 -0800
Subject: [PATCH] remove extra sync, and make sure copyNext is always
 synchronous with the cuda stream

---
 .../com/nvidia/spark/rapids/spill/SpillFramework.scala    | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/spill/SpillFramework.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/spill/SpillFramework.scala
index 58f39ec56ee..5e62e908d53 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/spill/SpillFramework.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/spill/SpillFramework.scala
@@ -205,9 +205,8 @@ object SpillableHostBufferHandle extends Logging {
         while (chunkedPacker.hasNext) {
           withResource(chunkedPacker.next(bb)) { n =>
             builder.copyNext(n, Cuda.DEFAULT_STREAM)
-            // we are calling chunked packer on `bb` again each time, we need
-            // to synchronize before we ask for the next chunk
-            Cuda.DEFAULT_STREAM.sync()
+            // copyNext is synchronous w.r.t. the CUDA stream passed, so `bb` can
+            // safely be reused for the next chunk without an explicit sync here.
           }
         }
       }
@@ -221,7 +220,6 @@ object SpillableHostBufferHandle extends Logging {
     withResource(
       SpillFramework.stores.hostStore.makeBuilder(handle)) { builder =>
       builder.copyNext(buff, Cuda.DEFAULT_STREAM)
-      Cuda.DEFAULT_STREAM.sync()
       builder.build
     }
   }
@@ -1163,7 +1161,7 @@ class SpillableHostStore(val maxSize: Option[Long] = None)
 
     override def copyNext(mb: DeviceMemoryBuffer, stream: Cuda.Stream): Unit = {
       GpuTaskMetrics.get.spillToHostTime {
-        singleShotBuffer.copyFromMemoryBufferAsync(
+        singleShotBuffer.copyFromMemoryBuffer(
           copied,
           mb,
           0,