Allow batching the output of a join #2310

Merged: 10 commits, May 3, 2021
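Taken together, the changes below pass a target batch size (spark.rapids.sql.batchSizeBytes) and a spill callback into doJoin, so a hash join can emit its result as several size-bounded batches instead of one arbitrarily large one. A minimal, self-contained sketch of that chunking idea only; JoinBatcher, gatherMap, and estimatedRowSize are hypothetical names, not spark-rapids APIs:

```scala
object JoinBatcher {
  // Sketch of the idea only; not the spark-rapids implementation.
  // `gatherMap` and `estimatedRowSize` are hypothetical stand-ins.
  def batchIndices(gatherMap: Array[Long],
                   estimatedRowSize: Long,
                   targetSizeBytes: Long): Iterator[Array[Long]] = {
    require(estimatedRowSize > 0 && targetSizeBytes > 0)
    // How many joined rows fit in one output batch without exceeding the target.
    val rowsPerBatch = math.max(1L, targetSizeBytes / estimatedRowSize).toInt
    gatherMap.grouped(rowsPerBatch)
  }
}

// Example: 25 joined rows, ~100 bytes each, 1000-byte target => 3 output batches.
// JoinBatcher.batchIndices((0L until 25L).toArray, 100L, 1000L).size == 3
```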
4 changes: 2 additions & 2 deletions integration_tests/src/main/python/asserts.py
@@ -28,15 +28,15 @@
def _assert_equal(cpu, gpu, float_check, path):
t = type(cpu)
if (t is Row):
- assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {}".format(path)
+ assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu))
if hasattr(cpu, "__fields__") and hasattr(gpu, "__fields__"):
for field in cpu.__fields__:
_assert_equal(cpu[field], gpu[field], float_check, path + [field])
else:
for index in range(len(cpu)):
_assert_equal(cpu[index], gpu[index], float_check, path + [index])
elif (t is list):
- assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {}".format(path)
+ assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu))
for index in range(len(cpu)):
_assert_equal(cpu[index], gpu[index], float_check, path + [index])
elif (t is pytypes.GeneratorType):
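Both length assertions above now include the actual CPU and GPU sizes, which is the first thing you want to see when a batched GPU join produces a different number of rows than the CPU run. The same reporting pattern as a small Scala sketch; assertSameLength is a hypothetical helper, not part of the test suite:

```scala
object GpuCpuAsserts {
  // Hypothetical helper showing the same reporting pattern as asserts.py:
  // include the actual sizes in the failure message, not just "lengths differ".
  def assertSameLength(cpu: Seq[Any], gpu: Seq[Any], path: Seq[Any]): Unit =
    assert(cpu.length == gpu.length,
      s"CPU and GPU row have different lengths at $path CPU: ${cpu.length} GPU: ${gpu.length}")
}
```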
9 changes: 6 additions & 3 deletions integration_tests/src/main/python/join_test.py
9 changes: 6 additions & 3 deletions integration_tests/src/main/python/join_test.py
@@ -110,7 +110,7 @@ def do_join(spark):
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', single_level_array_gens_no_decimal, ids=idfn)
@pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross', 'FullOuter'], ids=idfn)
- @pytest.mark.parametrize('batch_size', ['100', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches
+ @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches
def test_sortmerge_join_array(data_gen, join_type, batch_size):
def do_join(spark):
left, right = create_nested_df(spark, short_gen, data_gen, 500, 500)
@@ -132,11 +132,14 @@ def do_join(spark):
@ignore_order(local=True)
@pytest.mark.parametrize('data_gen', [all_basic_struct_gen], ids=idfn)
@pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross', 'FullOuter'], ids=idfn)
- def test_sortmerge_join_struct(data_gen, join_type):
+ @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test out of core joins too
+ def test_sortmerge_join_struct(data_gen, join_type, batch_size):
def do_join(spark):
left, right = create_nested_df(spark, short_gen, data_gen, 500, 500)
return left.join(right, left.key == right.r_key, join_type)
- assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf)
+ conf = {'spark.rapids.sql.batchSizeBytes': batch_size}
+ conf.update(_sortmerge_join_conf)
+ assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf)

# For spark to insert a shuffled hash join it has to be enabled with
# "spark.sql.join.preferSortMergeJoin" = "false" and both sides have to
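The tests above parametrize spark.rapids.sql.batchSizeBytes with a tiny value ('1000' bytes) and a huge one ('1g') so the same join is exercised both when its output must be split across many batches (the out-of-core path) and when it fits in one. A hedged sketch of forcing the same setting on a local session when reproducing this by hand; only the config key comes from the diff, the helper and session setup are illustrative:

```scala
import org.apache.spark.sql.SparkSession

object BatchSizeSketch {
  // Illustrative only: the config key comes from the test diff; the local-mode
  // session and this helper are assumptions for reproducing the behaviour by hand.
  def withBatchSize[T](batchSizeBytes: String)(body: SparkSession => T): T = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("join-batch-size-sketch")
      .config("spark.rapids.sql.batchSizeBytes", batchSizeBytes)
      .getOrCreate()
    try body(spark) finally spark.stop()
  }
}

// BatchSizeSketch.withBatchSize("1000") { spark => /* join and collect */ }  // many small batches
// BatchSizeSketch.withBatchSize("1g")   { spark => /* join and collect */ }  // effectively one batch
```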
(next file; path not shown)
@@ -89,7 +89,7 @@ case class GpuBroadcastHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan) extends BinaryExecNode with GpuHashJoin {
import GpuMetric._
@@ -100,7 +100,7 @@ case class GpuBroadcastHashJoinExec(
JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME),
- FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME))
+ FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics

override def requiredChildDistribution: Seq[Distribution] = {
val mode = HashedRelationBroadcastMode(buildKeys)
@@ -141,28 +141,20 @@ case class GpuBroadcastHashJoinExec(
val filterTime = gpuLongMetric(FILTER_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)

- val broadcastRelation = broadcastExchange
- .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
+ val spillCallback = GpuMetric.makeSpillCallback(allMetrics)

- val boundCondition = condition.map(GpuBindReferences.bindReference(_, output))
+ val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

- lazy val builtTable = {
- val ret = withResource(
- GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys =>
- val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch))
- withResource(combined) { combined =>
- filterBuiltNullsIfNecessary(GpuColumnVector.from(combined))
- }
- }
-
- // Don't warn for a leak, because we cannot control when we are done with this
- (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected())
- ret
- }
+ val broadcastRelation = broadcastExchange
+ .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()

val rdd = streamedPlan.executeColumnar()
- rdd.mapPartitions(it =>
- doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows,
- numOutputBatches, streamTime, joinTime, filterTime, totalTime))
+ rdd.mapPartitions { it =>
+ val builtBatch = broadcastRelation.value.batch
+ GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
+ doJoin(builtBatch, it, targetSize, spillCallback,
+ numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime,
+ filterTime, totalTime)
+ }
}
}
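This is the core of the change, and it is repeated below for what appear to be the per-Spark-version shim copies of GpuBroadcastHashJoinExec (the file paths are not shown in this capture). The eagerly built and null-filtered builtTable is gone: each partition now takes the broadcast batch as-is, marks its columns with noWarnLeakExpected() (a task cannot control when a broadcast batch is released), and hands doJoin a target output size plus a spill callback so the join can emit size-bounded batches and account for spilling. A rough sketch of the per-partition shape using simplified stand-in types (Row, Batch) rather than the real columnar classes:

```scala
object BroadcastJoinSketch {
  // Simplified stand-ins; the real code works on columnar batches, not case classes.
  final case class Row(key: Int, value: String)
  final case class Batch(rows: Vector[Row])

  // Per-partition shape mirrored from the diff: the broadcast build side is reused
  // by every partition, the streamed side is consumed incrementally, and the joined
  // output is re-chunked so no single output batch grows without bound.
  def joinPartition(builtBatch: Batch,
                    streamed: Iterator[Batch],
                    targetRowsPerBatch: Int): Iterator[Batch] = {
    require(targetRowsPerBatch > 0)
    val buildByKey: Map[Int, Vector[Row]] = builtBatch.rows.groupBy(_.key)
    streamed.flatMap { streamBatch =>
      val joined = for {
        s <- streamBatch.rows
        b <- buildByKey.getOrElse(s.key, Vector.empty)
      } yield Row(s.key, s.value + "|" + b.value)      // inner-join style output rows
      joined.grouped(targetRowsPerBatch).map(Batch(_)) // size-bounded output batches
    }
  }
}
```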
(next file; path not shown)
@@ -75,7 +75,7 @@ case class GpuShuffledHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan,
override val isSkewJoin: Boolean)
(next file; path not shown)
@@ -87,7 +87,7 @@ case class GpuBroadcastHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan) extends BinaryExecNode with GpuHashJoin {
import GpuMetric._
@@ -98,7 +98,7 @@ case class GpuBroadcastHashJoinExec(
JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME),
- FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME))
+ FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics

override def requiredChildDistribution: Seq[Distribution] = {
val mode = HashedRelationBroadcastMode(buildKeys)
@@ -139,28 +139,20 @@ case class GpuBroadcastHashJoinExec(
val filterTime = gpuLongMetric(FILTER_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)

- val broadcastRelation = broadcastExchange
- .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
+ val spillCallback = GpuMetric.makeSpillCallback(allMetrics)

- val boundCondition = condition.map(GpuBindReferences.bindReference(_, output))
+ val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

- lazy val builtTable = {
- val ret = withResource(
- GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys =>
- val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch))
- withResource(combined) { combined =>
- filterBuiltNullsIfNecessary(GpuColumnVector.from(combined))
- }
- }
-
- // Don't warn for a leak, because we cannot control when we are done with this
- (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected())
- ret
- }
+ val broadcastRelation = broadcastExchange
+ .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()

val rdd = streamedPlan.executeColumnar()
- rdd.mapPartitions(it =>
- doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows,
- numOutputBatches, streamTime, joinTime, filterTime, totalTime))
+ rdd.mapPartitions { it =>
+ val builtBatch = broadcastRelation.value.batch
+ GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
+ doJoin(builtBatch, it, targetSize, spillCallback,
+ numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime,
+ filterTime, totalTime)
+ }
}
}
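The other half of the wiring is the metrics: each join's metric map now ends with ++ spillMetrics, and GpuMetric.makeSpillCallback(allMetrics) builds the callback that doJoin receives, so any data the join has to move out of GPU memory is reported against the operator. A sketch of that callback pattern with invented stand-ins (SpillMetric, SpillCallback); the real GpuMetric API is not reproduced here:

```scala
import java.util.concurrent.atomic.AtomicLong

object SpillAccounting {
  // Invented stand-ins; only the idea (a spill metric plus a callback handed to
  // the join) is taken from the diff, not the real GpuMetric API.
  final class SpillMetric { val bytes = new AtomicLong(0L) }

  type SpillCallback = Long => Unit

  def makeSpillCallback(metric: SpillMetric): SpillCallback =
    spilledBytes => { metric.bytes.addAndGet(spilledBytes); () }
}

// Inside the join, whatever has to be moved off the GPU to make room is reported:
//   val onSpill = SpillAccounting.makeSpillCallback(metric)
//   onSpill(bufferSizeInBytes)  // the operator's "spilled bytes" metric goes up
```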
(next file; path not shown)
@@ -86,7 +86,7 @@ case class GpuBroadcastHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan) extends BinaryExecNode with GpuHashJoin {
import GpuMetric._
@@ -97,7 +97,7 @@ case class GpuBroadcastHashJoinExec(
JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME),
- FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME))
+ FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics

override def requiredChildDistribution: Seq[Distribution] = {
val mode = HashedRelationBroadcastMode(buildKeys)
@@ -138,28 +138,20 @@ case class GpuBroadcastHashJoinExec(
val filterTime = gpuLongMetric(FILTER_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)

- val broadcastRelation = broadcastExchange
- .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
+ val spillCallback = GpuMetric.makeSpillCallback(allMetrics)

- val boundCondition = condition.map(GpuBindReferences.bindReference(_, output))
+ val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

- lazy val builtTable = {
- val ret = withResource(
- GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys =>
- val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch))
- withResource(combined) { combined =>
- filterBuiltNullsIfNecessary(GpuColumnVector.from(combined))
- }
- }
-
- // Don't warn for a leak, because we cannot control when we are done with this
- (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected())
- ret
- }
+ val broadcastRelation = broadcastExchange
+ .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()

val rdd = streamedPlan.executeColumnar()
- rdd.mapPartitions(it =>
- doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows,
- numOutputBatches, streamTime, joinTime, filterTime, totalTime))
+ rdd.mapPartitions { it =>
+ val builtBatch = broadcastRelation.value.batch
+ GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
+ doJoin(builtBatch, it, targetSize, spillCallback,
+ numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime,
+ filterTime, totalTime)
+ }
}
}
(next file; path not shown)
@@ -58,15 +58,15 @@ class GpuShuffledHashJoinMeta(
}

override def convertToGpu(): GpuExec = {
- val Seq(leftChild, rightChild) = childPlans.map(_.convertIfNeeded())
+ val Seq(left, right) = childPlans.map(_.convertIfNeeded)
GpuShuffledHashJoinExec(
leftKeys.map(_.convertToGpu()),
rightKeys.map(_.convertToGpu()),
join.joinType,
GpuJoinUtils.getGpuBuildSide(join.buildSide),
condition.map(_.convertToGpu()),
- leftChild,
- rightChild)
+ left,
+ right)
}
}

@@ -75,12 +75,12 @@ case class GpuShuffledHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan)
- extends GpuShuffledHashJoinBase(
- leftKeys,
- rightKeys,
- buildSide,
- condition,
- isSkewJoin = false)
+ extends GpuShuffledHashJoinBase(
+ leftKeys,
+ rightKeys,
+ buildSide,
+ condition,
+ isSkewJoin = false)
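Every shim now declares the join condition as override val condition because the shared base (GpuShuffledHashJoinBase / GpuHashJoin) owns the condition itself, which is presumably also where the binding moved, since boundCondition disappears from the execs above and doJoin no longer takes it. A small self-contained sketch of the language mechanism, with hypothetical names:

```scala
// Hypothetical names; this only illustrates why `override val` is needed once
// the shared join base class owns the condition.
abstract class ShuffledJoinBase(val condition: Option[String]) {
  def describe: String = condition.fold("no condition")(c => s"condition: $c")
}

final case class ShuffledJoinNode(
    keys: Seq[String],
    override val condition: Option[String])  // must override the base class val
  extends ShuffledJoinBase(condition)

// ShuffledJoinNode(Seq("key"), Some("a.x > b.y")).describe  // "condition: a.x > b.y"
```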
(next file; path not shown)
@@ -91,7 +91,7 @@ case class GpuBroadcastHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan) extends BinaryExecNode with GpuHashJoin {
import GpuMetric._
@@ -102,7 +102,7 @@ case class GpuBroadcastHashJoinExec(
JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME),
- FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME))
+ FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics

override def requiredChildDistribution: Seq[Distribution] = {
val mode = HashedRelationBroadcastMode(buildKeys)
@@ -143,28 +143,20 @@ case class GpuBroadcastHashJoinExec(
val filterTime = gpuLongMetric(FILTER_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)

- val broadcastRelation = broadcastExchange
- .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()
+ val spillCallback = GpuMetric.makeSpillCallback(allMetrics)

- val boundCondition = condition.map(GpuBindReferences.bindReference(_, output))
+ val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)

- lazy val builtTable = {
- val ret = withResource(
- GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys =>
- val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch))
- withResource(combined) { combined =>
- filterBuiltNullsIfNecessary(GpuColumnVector.from(combined))
- }
- }
-
- // Don't warn for a leak, because we cannot control when we are done with this
- (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected())
- ret
- }
+ val broadcastRelation = broadcastExchange
+ .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]()

val rdd = streamedPlan.executeColumnar()
- rdd.mapPartitions(it =>
- doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows,
- numOutputBatches, streamTime, joinTime, filterTime, totalTime))
+ rdd.mapPartitions { it =>
+ val builtBatch = broadcastRelation.value.batch
+ GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected())
+ doJoin(builtBatch, it, targetSize, spillCallback,
+ numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime,
+ filterTime, totalTime)
+ }
}
}
(next file; path not shown)
@@ -58,7 +58,7 @@ class GpuShuffledHashJoinMeta(
}

override def convertToGpu(): GpuExec = {
- val Seq(left, right) = childPlans.map(_.convertIfNeeded())
+ val Seq(left, right) = childPlans.map(_.convertIfNeeded)
GpuShuffledHashJoinExec(
leftKeys.map(_.convertToGpu()),
rightKeys.map(_.convertToGpu()),
@@ -76,7 +76,7 @@ case class GpuShuffledHashJoinExec(
rightKeys: Seq[Expression],
joinType: JoinType,
buildSide: GpuBuildSide,
- condition: Option[Expression],
+ override val condition: Option[Expression],
left: SparkPlan,
right: SparkPlan,
override val isSkewJoin: Boolean)
(next file; path not shown)
@@ -81,7 +81,7 @@ public static synchronized void debug(String name, ColumnarBatch cb) {
* @param name the name of the column to print out.
* @param col the column to print out.
*/
- public static synchronized void debug(String name, ai.rapids.cudf.ColumnVector col) {
+ public static synchronized void debug(String name, ai.rapids.cudf.ColumnView col) {
try (HostColumnVector hostCol = col.copyToHost()) {
debug(name, hostCol);
}
@@ -671,7 +671,8 @@ static boolean typeConversionAllowed(Table table, DataType[] colTypes, int start
*/
static boolean typeConversionAllowed(Table table, DataType[] colTypes) {
final int numColumns = table.getNumberOfColumns();
- assert numColumns == colTypes.length: "The number of columns and the number of types don't match";
+ assert numColumns == colTypes.length: "The number of columns and the number of types don't " +
+ "match " + table + " " + Arrays.toString(colTypes);
boolean ret = true;
for (int colIndex = 0; colIndex < numColumns; colIndex++) {
ret = ret && typeConversionAllowed(table.getColumn(colIndex), colTypes[colIndex]);
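Two debugging quality-of-life changes in this file: debug() now accepts ai.rapids.cudf.ColumnView, the parent type of ColumnVector in the cudf Java API, so sliced or child views produced while splitting batches can be dumped too, and the typeConversionAllowed assertion now reports the table and expected types on failure. A sketch of the "widen the helper to the view supertype" idea with stand-in types (ColumnViewLike, ColumnVectorLike), not the cudf classes:

```scala
// Stand-in types; the real change widens the Java helper from
// debug(String, ColumnVector) to debug(String, ColumnView), ColumnView being the
// cudf parent type, so views (for example slices of a column) can be dumped too.
trait ColumnViewLike { def hostValues: Seq[String] }

final class ColumnVectorLike(data: Vector[String]) extends ColumnViewLike {
  def hostValues: Seq[String] = data
  def slice(from: Int, until: Int): ColumnViewLike = new ColumnViewLike {
    def hostValues: Seq[String] = data.slice(from, until)  // a view over part of the column
  }
}

object ColumnDebug {
  def debug(name: String, col: ColumnViewLike): Unit =
    println(s"DEBUG $name: " + col.hostValues.mkString(", "))
}

// val col = new ColumnVectorLike(Vector("a", "b", "c", "d"))
// ColumnDebug.debug("keys", col)                   // whole column, as before
// ColumnDebug.debug("keys[1,3)", col.slice(1, 3))  // now also works for a view
```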
(next file; path not shown)
@@ -96,6 +96,14 @@ case class GpuBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean
override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]"

override def columnarEval(batch: ColumnarBatch): Any = {
- batch.column(ordinal).asInstanceOf[GpuColumnVector].incRefCount()
+ batch.column(ordinal) match {
+ case fb: GpuColumnVectorFromBuffer =>
+ // When doing a project we might re-order columns or do other things that make it
+ // so this no longer looks like the original contiguous buffer it came from
+ // so to avoid it appearing to down stream processing as the same buffer we change
+ // the type here.
+ new GpuColumnVector(fb.dataType(), fb.getBase.incRefCount())
+ case cv: GpuColumnVector => cv.incRefCount()
+ }
}
}
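The new columnarEval re-wraps a GpuColumnVectorFromBuffer as a plain GpuColumnVector: once a projection re-orders or drops columns, the batch no longer corresponds to the original contiguous device buffer, and downstream code that special-cases "came from one contiguous buffer" must not be tricked into reusing it. A self-contained sketch of that "strip the more specific type" pattern with invented types (DeviceColumn, ContiguousColumn):

```scala
// Invented stand-in types; only the pattern comes from the diff. Once a projection
// re-orders or drops columns, the result must stop advertising itself as a slice of
// the original contiguous buffer, or downstream code may treat the projected batch
// as if it still were that buffer.
sealed trait DeviceColumn { def name: String }
final case class PlainColumn(name: String) extends DeviceColumn
final case class ContiguousColumn(name: String, bufferId: Long) extends DeviceColumn

object ProjectionBinding {
  def bindForProjection(col: DeviceColumn): DeviceColumn = col match {
    case ContiguousColumn(name, _) => PlainColumn(name)  // drop the "same buffer" claim
    case other                     => other
  }
}

// ProjectionBinding.bindForProjection(ContiguousColumn("key", bufferId = 42L)) == PlainColumn("key")
```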