From ee51ac9ff0a4977fbca8bd80cca2e584b5300f0f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 19 Apr 2021 16:35:18 -0500 Subject: [PATCH 1/9] Allow batching the output of a join Signed-off-by: Robert (Bobby) Evans --- integration_tests/src/main/python/asserts.py | 4 +- .../src/main/python/join_test.py | 14 +- .../spark300/GpuBroadcastHashJoinExec.scala | 29 +- .../spark300/GpuShuffledHashJoinExec.scala | 2 +- .../spark301/GpuBroadcastHashJoinExec.scala | 29 +- .../spark301db/GpuBroadcastHashJoinExec.scala | 29 +- .../spark301db/GpuShuffledHashJoinExec.scala | 20 +- .../spark311/GpuBroadcastHashJoinExec.scala | 29 +- .../spark311/GpuShuffledHashJoinExec.scala | 4 +- .../nvidia/spark/rapids/GpuColumnVector.java | 2 +- .../rapids/GpuShuffledHashJoinBase.scala | 43 +- .../GpuBroadcastNestedLoopJoinExec.scala | 2 +- .../sql/rapids/execution/GpuHashJoin.scala | 1017 +++++++++++++---- 13 files changed, 879 insertions(+), 345 deletions(-) diff --git a/integration_tests/src/main/python/asserts.py b/integration_tests/src/main/python/asserts.py index 4af5cd0d2f8..1f50028a817 100644 --- a/integration_tests/src/main/python/asserts.py +++ b/integration_tests/src/main/python/asserts.py @@ -28,7 +28,7 @@ def _assert_equal(cpu, gpu, float_check, path): t = type(cpu) if (t is Row): - assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {}".format(path) + assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu)) if hasattr(cpu, "__fields__") and hasattr(gpu, "__fields__"): for field in cpu.__fields__: _assert_equal(cpu[field], gpu[field], float_check, path + [field]) @@ -36,7 +36,7 @@ def _assert_equal(cpu, gpu, float_check, path): for index in range(len(cpu)): _assert_equal(cpu[index], gpu[index], float_check, path + [index]) elif (t is list): - assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {}".format(path) + assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu)) for index in range(len(cpu)): _assert_equal(cpu[index], gpu[index], float_check, path + [index]) elif (t is pytypes.GeneratorType): diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index 6a4258308af..51c4ccf4a84 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -93,11 +93,14 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', single_level_array_gens_no_decimal, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross', 'FullOuter'], ids=idfn) -def test_sortmerge_join_array(data_gen, join_type): +@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test out of core joins too +def test_sortmerge_join_array(data_gen, join_type, batch_size): def do_join(spark): left, right = create_nested_df(spark, short_gen, data_gen, 500, 500) return left.join(right, left.key == right.r_key, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = {'spark.rapids.sql.batchSizeBytes': batch_size} + conf.update(_sortmerge_join_conf) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'KnownFloatingPointNormalized', 'ArrayTransform', 'LambdaFunction', 'NamedLambdaVariable', 'NormalizeNaNAndZero') @ignore_order(local=True) @@ -112,11 +115,14 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [all_basic_struct_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross', 'FullOuter'], ids=idfn) -def test_sortmerge_join_struct(data_gen, join_type): +@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test out of core joins too +def test_sortmerge_join_struct(data_gen, join_type, batch_size): def do_join(spark): left, right = create_nested_df(spark, short_gen, data_gen, 500, 500) return left.join(right, left.key == right.r_key, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = {'spark.rapids.sql.batchSizeBytes': batch_size} + conf.update(_sortmerge_join_conf) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'KnownFloatingPointNormalized', 'NormalizeNaNAndZero', 'CreateNamedStruct', 'GetStructField', 'Literal', 'If', 'IsNull') @ignore_order(local=True) diff --git a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala index d58584aa5a7..92156e550e1 100644 --- a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala +++ b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala @@ -89,18 +89,20 @@ case class GpuBroadcastHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ + private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS), STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME), JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME), - FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) + FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics override def requiredChildDistribution: Seq[Distribution] = { val mode = HashedRelationBroadcastMode(buildKeys) @@ -141,28 +143,17 @@ case class GpuBroadcastHashJoinExec( val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) + val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - val boundCondition = condition.map(GpuBindReferences.bindReference(_, output)) - - lazy val builtTable = { - val ret = withResource( - GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys => - val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch)) - withResource(combined) { combined => - GpuColumnVector.from(combined) - } - } - - // Don't warn for a leak, because we cannot control when we are done with this - (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected()) - ret - } + lazy val builtBatch = broadcastRelation.value.batch val rdd = streamedPlan.executeColumnar() rdd.mapPartitions(it => - doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows, - numOutputBatches, streamTime, joinTime, filterTime, totalTime)) + doJoin(builtBatch, it, targetSize, spillCallback, + numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, + filterTime, totalTime)) } } diff --git a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuShuffledHashJoinExec.scala b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuShuffledHashJoinExec.scala index ff87fe64af9..58700c594aa 100644 --- a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuShuffledHashJoinExec.scala +++ b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuShuffledHashJoinExec.scala @@ -75,7 +75,7 @@ case class GpuShuffledHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan, override val isSkewJoin: Boolean) diff --git a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala index 4d0b461f79a..8e5b1f34978 100644 --- a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala @@ -87,18 +87,20 @@ case class GpuBroadcastHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ + private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS), STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME), JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME), - FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) + FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics override def requiredChildDistribution: Seq[Distribution] = { val mode = HashedRelationBroadcastMode(buildKeys) @@ -139,28 +141,17 @@ case class GpuBroadcastHashJoinExec( val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) + val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - val boundCondition = condition.map(GpuBindReferences.bindReference(_, output)) - - lazy val builtTable = { - val ret = withResource( - GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys => - val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch)) - withResource(combined) { combined => - GpuColumnVector.from(combined) - } - } - - // Don't warn for a leak, because we cannot control when we are done with this - (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected()) - ret - } + lazy val builtBatch = broadcastRelation.value.batch val rdd = streamedPlan.executeColumnar() rdd.mapPartitions(it => - doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows, - numOutputBatches, streamTime, joinTime, filterTime, totalTime)) + doJoin(builtBatch, it, targetSize, spillCallback, + numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, + filterTime, totalTime)) } } diff --git a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala index 5eb8e571596..7e7c2310559 100644 --- a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala @@ -86,18 +86,20 @@ case class GpuBroadcastHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ + private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS), STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME), JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME), - FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) + FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics override def requiredChildDistribution: Seq[Distribution] = { val mode = HashedRelationBroadcastMode(buildKeys) @@ -138,28 +140,17 @@ case class GpuBroadcastHashJoinExec( val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) + val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - val boundCondition = condition.map(GpuBindReferences.bindReference(_, output)) - - lazy val builtTable = { - val ret = withResource( - GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys => - val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch)) - withResource(combined) { combined => - GpuColumnVector.from(combined) - } - } - - // Don't warn for a leak, because we cannot control when we are done with this - (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected()) - ret - } + lazy val builtBatch = broadcastRelation.value.batch val rdd = streamedPlan.executeColumnar() rdd.mapPartitions(it => - doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows, - numOutputBatches, streamTime, joinTime, filterTime, totalTime)) + doJoin(builtBatch, it, targetSize, spillCallback, + numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, + filterTime, totalTime)) } } diff --git a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuShuffledHashJoinExec.scala b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuShuffledHashJoinExec.scala index a957a364812..f0ea0169672 100644 --- a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuShuffledHashJoinExec.scala +++ b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuShuffledHashJoinExec.scala @@ -58,15 +58,15 @@ class GpuShuffledHashJoinMeta( } override def convertToGpu(): GpuExec = { - val Seq(leftChild, rightChild) = childPlans.map(_.convertIfNeeded()) + val Seq(left, right) = childPlans.map(_.convertIfNeeded) GpuShuffledHashJoinExec( leftKeys.map(_.convertToGpu()), rightKeys.map(_.convertToGpu()), join.joinType, GpuJoinUtils.getGpuBuildSide(join.buildSide), condition.map(_.convertToGpu()), - leftChild, - rightChild) + left, + right) } } @@ -75,12 +75,12 @@ case class GpuShuffledHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan) - extends GpuShuffledHashJoinBase( - leftKeys, - rightKeys, - buildSide, - condition, - isSkewJoin = false) + extends GpuShuffledHashJoinBase( + leftKeys, + rightKeys, + buildSide, + condition, + isSkewJoin = false) diff --git a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala index caa4070c2ee..a2ca9926f28 100644 --- a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala +++ b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala @@ -91,18 +91,20 @@ case class GpuBroadcastHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ + private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS), STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME), JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME), - FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) + FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics override def requiredChildDistribution: Seq[Distribution] = { val mode = HashedRelationBroadcastMode(buildKeys) @@ -143,28 +145,17 @@ case class GpuBroadcastHashJoinExec( val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) + val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - val boundCondition = condition.map(GpuBindReferences.bindReference(_, output)) - - lazy val builtTable = { - val ret = withResource( - GpuProjectExec.project(broadcastRelation.value.batch, gpuBuildKeys)) { keys => - val combined = GpuHashJoin.incRefCount(combine(keys, broadcastRelation.value.batch)) - withResource(combined) { combined => - GpuColumnVector.from(combined) - } - } - - // Don't warn for a leak, because we cannot control when we are done with this - (0 until ret.getNumberOfColumns).foreach(ret.getColumn(_).noWarnLeakExpected()) - ret - } + lazy val builtBatch = broadcastRelation.value.batch val rdd = streamedPlan.executeColumnar() rdd.mapPartitions(it => - doJoin(builtTable, it, boundCondition, numOutputRows, joinOutputRows, - numOutputBatches, streamTime, joinTime, filterTime, totalTime)) + doJoin(builtBatch, it, targetSize, spillCallback, + numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, + filterTime, totalTime)) } } diff --git a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuShuffledHashJoinExec.scala b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuShuffledHashJoinExec.scala index e25927e0c28..ac092c2a7c6 100644 --- a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuShuffledHashJoinExec.scala +++ b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuShuffledHashJoinExec.scala @@ -58,7 +58,7 @@ class GpuShuffledHashJoinMeta( } override def convertToGpu(): GpuExec = { - val Seq(left, right) = childPlans.map(_.convertIfNeeded()) + val Seq(left, right) = childPlans.map(_.convertIfNeeded) GpuShuffledHashJoinExec( leftKeys.map(_.convertToGpu()), rightKeys.map(_.convertToGpu()), @@ -76,7 +76,7 @@ case class GpuShuffledHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], left: SparkPlan, right: SparkPlan, override val isSkewJoin: Boolean) diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java index a028e5e06db..003d4bc57bf 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java @@ -81,7 +81,7 @@ public static synchronized void debug(String name, ColumnarBatch cb) { * @param name the name of the column to print out. * @param col the column to print out. */ - public static synchronized void debug(String name, ai.rapids.cudf.ColumnVector col) { + public static synchronized void debug(String name, ai.rapids.cudf.ColumnView col) { try (HostColumnVector hostCol = col.copyToHost()) { debug(name, hostCol); } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala index 6a9ab2410de..075f73dcf59 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala @@ -30,10 +30,12 @@ abstract class GpuShuffledHashJoinBase( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: GpuBuildSide, - condition: Option[Expression], + override val condition: Option[Expression], val isSkewJoin: Boolean) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ + private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -42,7 +44,7 @@ abstract class GpuShuffledHashJoinBase( STREAM_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_STREAM_TIME), JOIN_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_TIME), JOIN_OUTPUT_ROWS -> createMetric(MODERATE_LEVEL, DESCRIPTION_JOIN_OUTPUT_ROWS), - FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) + FILTER_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_FILTER_TIME)) ++ spillMetrics override def requiredChildDistribution: Seq[Distribution] = HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil @@ -68,37 +70,24 @@ abstract class GpuShuffledHashJoinBase( val joinTime = gpuLongMetric(JOIN_TIME) val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) - - val boundCondition = condition.map(GpuBindReferences.bindReference(_, output)) + val spillCallback = GpuMetric.makeSpillCallback(allMetrics) streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => { - var combinedSize = 0 - val startTime = System.nanoTime() - val builtTable = withResource(ConcatAndConsumeAll.getSingleBatchWithVerification( - buildIter, localBuildOutput)) { buildBatch: ColumnarBatch => - withResource(GpuProjectExec.project(buildBatch, gpuBuildKeys)) { keys => - val combined = GpuHashJoin.incRefCount(combine(keys, buildBatch)) - withResource(combined) { combined => - combinedSize = - GpuColumnVector.extractColumns(combined) - .map(_.getBase.getDeviceMemorySize).sum.toInt - GpuColumnVector.from(combined) - } - } - } - val delta = System.nanoTime() - startTime - buildTime += delta - totalTime += delta - buildDataSize += combinedSize - val context = TaskContext.get() - context.addTaskCompletionListener[Unit](_ => builtTable.close()) + withResource(ConcatAndConsumeAll.getSingleBatchWithVerification(buildIter, + localBuildOutput)) { builtBatch => + // doJoin will increment the reference counts as needed for the builtBatch + val delta = System.nanoTime() - startTime + buildTime += delta + totalTime += delta + buildDataSize += GpuColumnVector.getTotalDeviceMemoryUsed(builtBatch) - doJoin(builtTable, streamIter, boundCondition, - numOutputRows, joinOutputRows, numOutputBatches, - streamTime, joinTime, filterTime, totalTime) + doJoin(builtBatch, streamIter, targetSize, spillCallback, + numOutputRows, joinOutputRows, numOutputBatches, + streamTime, joinTime, filterTime, totalTime) + } } } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala index f104efb0ca8..2b7ea1028a9 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastNestedLoopJoinExec.scala @@ -54,7 +54,7 @@ class GpuBroadcastNestedLoopJoinMeta( join.joinType match { case Inner => case Cross => - case _ => willNotWorkOnGpu(s"$join.joinType currently is not supported") + case _ => willNotWorkOnGpu(s"${join.joinType} currently is not supported") } val gpuBuildSide = ShimLoader.getSparkShims.getBuildSide(join) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index d7f17532099..bff755aaaa6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -15,15 +15,16 @@ */ package org.apache.spark.sql.rapids.execution -import ai.rapids.cudf.{NvtxColor, Table} +import ai.rapids.cudf.{ColumnView, DType, GatherMap, NvtxColor, NvtxRange, OrderByArg, Scalar, Table} import com.nvidia.spark.rapids._ +import com.nvidia.spark.rapids.RapidsBuffer.SpillCallback import org.apache.spark.TaskContext -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression} import org.apache.spark.sql.catalyst.plans.{Cross, ExistenceJoin, FullOuter, Inner, InnerLike, JoinType, LeftAnti, LeftExistence, LeftOuter, LeftSemi, RightOuter} import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.types.{ArrayType, MapType, StructType} -import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch object JoinTypeChecks { def tagForGpu(joinType: JoinType, meta: RapidsMeta[_, _, _]): Unit = { @@ -55,7 +56,8 @@ object JoinTypeChecks { } } -object GpuHashJoin { +object GpuHashJoin extends Arm { + def tagJoin( meta: RapidsMeta[_, _, _], joinType: JoinType, @@ -79,12 +81,669 @@ object GpuHashJoin { } } - def incRefCount(cb: ColumnarBatch): ColumnarBatch = { - GpuColumnVector.extractBases(cb).foreach(_.incRefCount()) - cb + def extractTopLevelAttributes( + exprs: Seq[Expression], + includeAlias: Boolean): Seq[Option[Attribute]] = + exprs.map { + case a: AttributeReference => Some(a.toAttribute) + case GpuAlias(a: AttributeReference, _) if includeAlias => Some(a.toAttribute) + case _ => None + } + + /** + * Filter rows from the batch where all of the keys are null. + */ + def filterNulls(cb: ColumnarBatch, boundKeys: Seq[Expression]): ColumnarBatch = { + var mask: ai.rapids.cudf.ColumnVector = null + try { + withResource(GpuProjectExec.project(cb, boundKeys)) { keys => + val keyColumns = GpuColumnVector.extractBases(keys) + // to remove a row all of the key columns must be null for that row + // If there is even one key column with no nulls in it, don't filter anything + // we do this by leaving mask as null + if (keyColumns.forall(_.hasNulls)) { + keyColumns.foreach { column => + withResource(column.isNull) { nn => + if (mask == null) { + mask = nn.incRefCount() + } else { + mask = withResource(mask) { _ => + mask.and(nn) + } + } + } + } + } + } + + if (mask == null) { + // There was nothing to filter. + GpuColumnVector.incRefCounts(cb) + } else { + val colTypes = GpuColumnVector.extractTypes(cb) + withResource(GpuColumnVector.from(cb)) { tbl => + withResource(tbl.filter(mask)) { filtered => + GpuColumnVector.from(filtered, colTypes) + } + } + } + } finally { + if (mask != null) { + mask.close() + mask = null + } + } + } +} + +/** + * Generic trait for all join gather instances. + * All instances should be spillable. + * The life cycle of this assumes that when it is created that the data and + * gather maps will be used shortly. + * If you are not going to use these for a while, like when returning from an iterator, + * then allowSpilling should be called so that the cached data is released and spilling + * can be allowed. If you need/want to use the data again, just start using it, and it + * will be cached yet again until allowSpilling is called. + * When you are completely done with this object call close on it. + */ +trait JoinGatherer extends AutoCloseable with Arm { + /** + * Gather the next n rows from the join gather maps. + * @param n how many rows to gather + * @return the gathered data as a ColumnarBatch + */ + def gatherNext(n: Int): ColumnarBatch + + /** + * Is all of the data gathered so far. + */ + def isDone: Boolean + + /** + * Number of rows left to gather + */ + def numRowsLeft: Long + + /** + * Indicate that we are done messing with the data for now and it can be spilled. + */ + def allowSpilling(): Unit + + /** + * A really fast and dirty way to estimate the size of each row in the join output + */ + def realCheapPerRowSizeEstimate: Double + + /** + * Get the bit count size map for the next n rows to be gathered. + */ + def getBitSizeMap(n: Int): ColumnView + + /** + * If the data is all fixed width return the size of each row, otherwise return null. + */ + def getFixedWidthBitSize: Option[Int] + + /** + * Do a complete/expensive job to get the number of rows that can be gathered to get close + * to the targetSize for the final output. + */ + def gatherRowEstimate(targetSize: Long): Int = { + val bitSizePerRow = getFixedWidthBitSize + if (bitSizePerRow.isDefined) { + Math.min(Math.min((targetSize/bitSizePerRow.get) / 8, numRowsLeft), Integer.MAX_VALUE).toInt + } else { + val estimatedRows = Math.min( + ((targetSize / realCheapPerRowSizeEstimate) * 1.1).toLong, + numRowsLeft) + val numRowsToProbe = Math.min(estimatedRows, Integer.MAX_VALUE).toInt + if (numRowsToProbe <= 0) { + 1 + } else { + val sum = withResource(getBitSizeMap(numRowsToProbe)) { bitSizes => + bitSizes.prefixSum() + } + val cutoff = withResource(sum) { sum => + withResource(new Table(sum)) { sumTable => + withResource(ai.rapids.cudf.ColumnVector.fromLongs(targetSize * 8)) { bound => + withResource(new Table(bound)) { boundTab => + sumTable.lowerBound(boundTab, OrderByArg.asc(0)) + } + } + } + } + withResource(cutoff) { cutoff => + withResource(cutoff.copyToHost()) { hostCutoff => + hostCutoff.getInt(0) + hostCutoff.getInt(0) + } + } + } + } + } +} + +/** + * Holds a columnar batch that is cached until it is marked that it can be spilled. + */ +class LazySpillableColumnarBatch( + cb: ColumnarBatch, + spillCallback: SpillCallback) extends AutoCloseable with Arm { + + private var cached: Option[ColumnarBatch] = Some(GpuColumnVector.incRefCounts(cb)) + private var spill: Option[SpillableColumnarBatch] = None + val numRows: Int = cb.numRows() + val deviceMemorySize: Long = GpuColumnVector.getTotalDeviceMemoryUsed(cb) + val dataTypes: Array[DataType] = GpuColumnVector.extractTypes(cb) + val numCols: Int = dataTypes.length + + def getBatch: ColumnarBatch = synchronized { + if (cached.isEmpty) { + cached = Some(spill.get.getColumnarBatch()) + } + cached.get + } + + def allowSpilling(): Unit = synchronized { + if (spill.isEmpty && cached.isDefined) { + // First time we need to allow for spilling + spill = Some(SpillableColumnarBatch(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableColumnarBatch takes ownership of it. + cached = None + } + cached.foreach(_.close()) + cached = None + } + + override def close(): Unit = synchronized { + cached.foreach(_.close()) + cached = None + spill.foreach(_.close()) + spill = None + } +} + +object JoinGathererImpl { + + /** + * Calculate the row size in bits for a fixed width schema. If a type is encountered that is + * not fixed width, or is not known a None is returned. + */ + def fixedWidthRowSizeBits(dts: Seq[DataType]): Option[Int] = + sumRowSizesBits(dts, nullValueCalc = false) + + /** + * Calculate the null row size for a given schema in bits. If an unexpected type is enountered + * an exception is thrown + */ + def nullRowSizeBits(dts: Seq[DataType]): Int = + sumRowSizesBits(dts, nullValueCalc = true).get + + + /** + * Sum the row sizes for each data type passed in. If any one of the sizes is not available + * the entire result is considered to not be available. If nullValueCalc is true a result is + * guaranteed to be returned or an exception thrown. + */ + private def sumRowSizesBits(dts: Seq[DataType], nullValueCalc: Boolean): Option[Int] = { + val allOptions = dts.map(calcRowSizeBits(_, nullValueCalc)) + if (allOptions.exists(_.isEmpty)) { + None + } else { + Some(allOptions.map(_.get).sum + 1) + } + } + + /** + * Calculate the row bit size for the given data type. If nullValueCalc is false + * then variable width types and unexpected types will result in a None being returned. + * If it is true variable width types will have a value returned that corresponds to a + * null, and unknown types will throw an exception. + */ + private def calcRowSizeBits(dt: DataType, nullValueCalc: Boolean): Option[Int] = dt match { + case StructType(fields) => + sumRowSizesBits(fields.map(_.dataType), nullValueCalc) + case dt: DecimalType if dt.precision > DType.DECIMAL64_MAX_PRECISION => + if (nullValueCalc) { + throw new IllegalArgumentException(s"Found an unsupported type $dt") + } else { + None + } + case _: NumericType | DateType | TimestampType | BooleanType | NullType => + Some(GpuColumnVector.getNonNestedRapidsType(dt).getSizeInBytes * 8 + 1) + case StringType | BinaryType | ArrayType(_, _) if nullValueCalc => + // Single offset value and a validity value + Some((DType.INT32.getSizeInBytes * 8) + 1) + case x if nullValueCalc => + throw new IllegalArgumentException(s"Found an unsupported type $x") + case _ => None + } +} + +/** + * JoinGatherer for a single map/table + */ +class JoinGathererImpl( + // TODO need a way to spill/cache the GatherMap + private val gatherMap: GatherMap, + private val data: LazySpillableColumnarBatch, + private val closeData: Boolean) extends JoinGatherer { + + // How much of the gather map we have output so far + private var gatheredUpTo: Long = 0 + private val totalRows: Long = gatherMap.getRowCount + private val totalInputRows: Int = data.numRows + private val totalInputSize: Long = data.deviceMemorySize + private val (fixedWidthRowSizeBits, nullRowSizeBits) = { + val dts = data.dataTypes + val fw = JoinGathererImpl.fixedWidthRowSizeBits(dts) + val nullVal = JoinGathererImpl.nullRowSizeBits(dts) + (fw, nullVal) + } + + override def realCheapPerRowSizeEstimate: Double = { + // Avoid divide by 0 here and later on + if (totalInputRows > 0 && totalInputSize > 0) { + totalInputSize.toDouble / totalInputRows + } else { + 1.0 + } + } + + override def getFixedWidthBitSize: Option[Int] = fixedWidthRowSizeBits + + override def gatherNext(n: Int): ColumnarBatch = synchronized { + val start = gatheredUpTo + assert((start + n) <= totalRows) + val ret = withResource(gatherMap.toColumnView(start, n)) { gatherView => + val batch = data.getBatch + val gatheredTab = withResource(GpuColumnVector.from(batch)) { table => + table.gather(gatherView) + } + withResource(gatheredTab) { gt => + GpuColumnVector.from(gt, GpuColumnVector.extractTypes(batch)) + } + } + gatheredUpTo += n + ret + } + + override def isDone: Boolean = synchronized { + gatheredUpTo >= totalRows + } + + override def numRowsLeft: Long = totalRows - gatheredUpTo + + override def allowSpilling(): Unit = { + data.allowSpilling() + } + + override def getBitSizeMap(n: Int): ColumnView = synchronized { + val cb = data.getBatch + val inputBitCounts = withResource(GpuColumnVector.from(cb)) { table => + withResource(table.rowBitCount()) { bits => + bits.castTo(DType.INT64) + } + } + // Gather the bit counts so we know what the output table will look like + val gatheredBitCount = withResource(inputBitCounts) { inputBitCounts => + withResource(gatherMap.toColumnView(gatheredUpTo, n)) { gatherView => + // Gather only works on a table so wrap the single column + val gatheredTab = withResource(new Table(inputBitCounts)) { table => + table.gather(gatherView) + } + withResource(gatheredTab) { gatheredTab => + gatheredTab.getColumn(0).incRefCount() + } + } + } + // The gather could have introduced nulls in the case of outer joins. Because of that + // we need to replace them with an appropriate size + if (gatheredBitCount.hasNulls) { + withResource(gatheredBitCount) { gatheredBitCount => + withResource(Scalar.fromLong(nullRowSizeBits.toLong)) { nullSize => + withResource(gatheredBitCount.isNull) { nullMask => + nullMask.ifElse(nullSize, gatheredBitCount) + } + } + } + } else { + gatheredBitCount + } + } + + override def close(): Unit = synchronized { + gatherMap.close() + if (closeData) { + data.close() + } else { + data.allowSpilling() + } } } +/** + * Join Gatherer for a left table and a right table + */ +case class MultiJoinGather(left: JoinGatherer, right: JoinGatherer) extends JoinGatherer { + assert(left.numRowsLeft == right.numRowsLeft, + "all gatherers much have the same number of rows to gather") + + override def gatherNext(n: Int): ColumnarBatch = { + withResource(left.gatherNext(n)) { leftGathered => + withResource(right.gatherNext(n)) { rightGathered => + val vectors = Seq(leftGathered, rightGathered).flatMap { batch => + (0 until batch.numCols()).map { i => + val col = batch.column(i) + col.asInstanceOf[GpuColumnVector].incRefCount() + col + } + }.toArray + new ColumnarBatch(vectors, n) + } + } + } + + override def isDone: Boolean = left.isDone + + override def numRowsLeft: Long = left.numRowsLeft + + override def allowSpilling(): Unit = { + left.allowSpilling() + right.allowSpilling() + } + + override def realCheapPerRowSizeEstimate: Double = + left.realCheapPerRowSizeEstimate + right.realCheapPerRowSizeEstimate + + override def getBitSizeMap(n: Int): ColumnView = { + (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { + case (Some(l), Some(r)) => // This should never happen, but just in case + withResource(GpuScalar.from(l.toLong + r.toLong, LongType)) { s => + ai.rapids.cudf.ColumnVector.fromScalar(s, n) + } + case (Some(l), None) => + withResource(GpuScalar.from(l.toLong, LongType)) { ls => + withResource(right.getBitSizeMap(n)) { rightBits => + ls.add(rightBits, DType.INT64) + } + } + case (None, Some(r)) => + withResource(GpuScalar.from(r.toLong, LongType)) { rs => + withResource(left.getBitSizeMap(n)) { leftBits => + rs.add(leftBits, DType.INT64) + } + } + case _ => + withResource(left.getBitSizeMap(n)) { leftBits => + withResource(right.getBitSizeMap(n)) { rightBits => + leftBits.add(rightBits, DType.INT64) + } + } + } + } + + override def getFixedWidthBitSize: Option[Int] = { + (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { + case (Some(l), Some(r)) => Some(l + r) + case _ => None + } + } + + override def close(): Unit = { + left.close() + right.close() + } +} + +object JoinGatherer extends Arm { + def apply(gatherMap: GatherMap, + inputData: LazySpillableColumnarBatch, + closeData: Boolean): JoinGatherer = + new JoinGathererImpl(gatherMap, inputData, closeData) + + def apply(leftMap: GatherMap, + leftData: LazySpillableColumnarBatch, + closeLeftData: Boolean, + rightMap: GatherMap, + rightData: LazySpillableColumnarBatch, + closeRightData: Boolean): JoinGatherer = { + val left = JoinGatherer(leftMap, leftData, closeLeftData) + val right = JoinGatherer(rightMap, rightData, closeRightData) + MultiJoinGather(left, right) + } + + def getRowsInNextBatch(gatherer: JoinGatherer, targetSize: Long): Int = { + withResource(new NvtxRange("calc gather size", NvtxColor.YELLOW)) { _ => + val rowsLeft = gatherer.numRowsLeft + val rowEstimate: Long = gatherer.getFixedWidthBitSize match { + case Some(fixedSize) => + // Odd corner cases for tests, make sure we do at least one row + Math.max(1, (targetSize / fixedSize) / 8) + case None => + // Heuristic to see if we need to do the expensive calculation + if (rowsLeft * gatherer.realCheapPerRowSizeEstimate <= targetSize * 0.75) { + rowsLeft + } else { + gatherer.gatherRowEstimate(targetSize) + } + } + Math.min(Math.min(rowEstimate, rowsLeft), Integer.MAX_VALUE).toInt + } + } +} + +class HashJoinIterator( + inputBuiltKeys: ColumnarBatch, + inputBuiltData: ColumnarBatch, + val stream: Iterator[ColumnarBatch], + val boundStreamKeys: Seq[Expression], + val boundStreamData: Seq[Expression], + val streamAttributes: Seq[Attribute], + val targetSize: Long, + val joinType: JoinType, + val buildSide: GpuBuildSide, + val spillCallback: SpillCallback, + streamTime: GpuMetric, + joinTime: GpuMetric, + totalTime: GpuMetric) extends Iterator[ColumnarBatch] with Arm { + import scala.collection.JavaConverters._ + + // For some join types even if there is no stream data we might output something + private var initialJoin = true + private var nextCb: Option[ColumnarBatch] = None + private var gathererStore: Option[JoinGatherer] = None + private val builtKeys = { + val tmp = new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback) + // Close the input keys, the lazy spillable batch now owns it. + inputBuiltKeys.close() + tmp + } + private val builtData = { + val tmp = new LazySpillableColumnarBatch(inputBuiltData, spillCallback) + // Close the input data, the lazy spillable batch now owns it. + inputBuiltData.close() + tmp + } + + def close(): Unit = { + builtKeys.close() + builtData.close() + nextCb.foreach(_.close()) + nextCb = None + gathererStore.foreach(_.close()) + gathererStore = None + } + + TaskContext.get().addTaskCompletionListener[Unit](_ => close()) + + private def nextCbFromGatherer(): Option[ColumnarBatch] = { + withResource(new NvtxWithMetrics("hash join gather", NvtxColor.DARK_GREEN, joinTime)) { _ => + val ret = gathererStore.map { gather => + val nextRows = JoinGatherer.getRowsInNextBatch(gather, targetSize) + gather.gatherNext(nextRows) + } + if (gathererStore.exists(_.isDone)) { + gathererStore.foreach(_.close()) + gathererStore = None + } + + if (ret.isDefined && gathererStore.isDefined) { + // We are about to return something. We got everything we need from it so now let it spill + // if there is more to be gathered later on. + gathererStore.foreach(_.allowSpilling()) + } + ret + } + } + + private def makeGatherer( + maps: Array[GatherMap], + leftData: LazySpillableColumnarBatch, + rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { + // The joiner should own/close the data that is on the stream side + // the build side is owned by the iterator. + val (joinerOwnsLeftData, joinerOwnsRightData) = buildSide match { + case GpuBuildRight => (true, false) + case GpuBuildLeft => (false, true) + } + val gatherer = maps.length match { + case 1 => + if (joinerOwnsRightData) { + rightData.close() + } + JoinGatherer(maps(0), leftData, joinerOwnsLeftData) + case 2 => if (rightData.numCols == 0) { + maps(1).close() + if (joinerOwnsRightData) { + rightData.close() + } + JoinGatherer(maps(0), leftData, joinerOwnsLeftData) + } else { + JoinGatherer(maps(0), leftData, joinerOwnsLeftData, + maps(1), rightData, joinerOwnsRightData) + } + case other => + throw new IllegalArgumentException(s"Got back unexpected number of gather maps $other") + } + if (gatherer.isDone) { + gatherer.close() + None + } else { + Some(gatherer) + } + } + + private def joinGatherMapLeftRight( + leftKeys: Table, + leftData: LazySpillableColumnarBatch, + rightKeys: Table, + rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { + withResource(new NvtxWithMetrics("hash join gather map", NvtxColor.ORANGE, joinTime)) { _ => + val maps = joinType match { + case LeftOuter => leftKeys.leftJoinGatherMaps(rightKeys, false) + case RightOuter => + // Reverse the output of the join, because we expect the right gather map to + // always be on the right + rightKeys.leftJoinGatherMaps(leftKeys, false).reverse + case _: InnerLike => leftKeys.innerJoinGatherMaps(rightKeys, false) + case LeftSemi => Array(leftKeys.leftSemiJoinGatherMap(rightKeys, false)) + case LeftAnti => Array(leftKeys.leftAntiJoinGatherMap(rightKeys, false)) + case FullOuter => leftKeys.fullJoinGatherMaps(rightKeys, false) + case _ => + throw new NotImplementedError(s"Joint Type ${joinType.getClass} is not currently" + + s" supported") + } + makeGatherer(maps, leftData, rightData) + } + } + + private def joinGatherMapLeftRight( + leftKeys: ColumnarBatch, + leftData: LazySpillableColumnarBatch, + rightKeys: ColumnarBatch, + rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { + withResource(GpuColumnVector.from(leftKeys)) { leftKeysTab => + withResource(GpuColumnVector.from(rightKeys)) { rightKeysTab => + joinGatherMapLeftRight(leftKeysTab, leftData, rightKeysTab, rightData) + } + } + } + + private def joinGatherMap( + buildKeys: ColumnarBatch, + buildData: LazySpillableColumnarBatch, + streamKeys: ColumnarBatch, + streamData: LazySpillableColumnarBatch): Option[JoinGatherer] = { + buildSide match { + case GpuBuildLeft => + joinGatherMapLeftRight(buildKeys, buildData, streamKeys, streamData) + case GpuBuildRight => + joinGatherMapLeftRight(streamKeys, streamData, buildKeys, buildData) + } + } + + private def joinGatherMap( + buildKeys: ColumnarBatch, + buildData: LazySpillableColumnarBatch, + streamCb: ColumnarBatch): Option[JoinGatherer] = { + withResource(GpuProjectExec.project(streamCb, boundStreamKeys)) { streamKeys => + withResource(GpuProjectExec.project(streamCb, boundStreamData)) { streamData => + joinGatherMap(buildKeys, buildData, + streamKeys, new LazySpillableColumnarBatch(streamData, spillCallback)) + } + } + } + + override def hasNext: Boolean = { + var mayContinue = true + while (nextCb.isEmpty && mayContinue) { + val startTime = System.nanoTime() + if (gathererStore.exists(!_.isDone)) { + nextCb = nextCbFromGatherer() + } else if (stream.hasNext) { + // Need to refill the gatherer + gathererStore.foreach(_.close()) + gathererStore = None + withResource(stream.next()) { cb => + streamTime += (System.nanoTime() - startTime) + gathererStore = joinGatherMap(builtKeys.getBatch, builtData, cb) + } + nextCb = nextCbFromGatherer() + } else if (initialJoin) { + withResource(GpuColumnVector.emptyBatch(streamAttributes.asJava)) { cb => + gathererStore = joinGatherMap(builtKeys.getBatch, builtData, cb) + } + nextCb = nextCbFromGatherer() + } else { + mayContinue = false + } + totalTime += (System.nanoTime() - startTime) + initialJoin = false + } + if (nextCb.isEmpty) { + // Nothing is left to return so close ASAP. + close() + } else { + builtKeys.allowSpilling() + } + nextCb.isDefined + } + + override def next(): ColumnarBatch = { + if (!hasNext) { + throw new NoSuchElementException() + } + val ret = nextCb.get + nextCb = None + ret + } +} + + trait GpuHashJoin extends GpuExec { def left: SparkPlan def right: SparkPlan @@ -145,57 +804,111 @@ trait GpuHashJoin extends GpuExec { } } - protected lazy val (gpuBuildKeys, gpuStreamedKeys) = { - require(leftKeys.map(_.dataType) == rightKeys.map(_.dataType), - "Join keys from two sides should have same types") - val lkeys = GpuBindReferences.bindGpuReferences(leftKeys, left.output) - val rkeys = GpuBindReferences.bindGpuReferences(rightKeys, right.output) - buildSide match { - case GpuBuildLeft => (lkeys, rkeys) - case GpuBuildRight => (rkeys, lkeys) + def dedupDataFromKeys( + rightOutput: Seq[Attribute], + rightKeys: Seq[Expression], + leftKeys: Seq[Expression]): (Seq[Attribute], Seq[NamedExpression]) = { + // This means that we need a mapping from what we remove on the right to what in leftData can + // provide it. These are still symbolic references, so we are going to convert everything into + // attributes, and use it to make out mapping. + val leftKeyAttributes = GpuHashJoin.extractTopLevelAttributes(leftKeys, includeAlias = true) + val rightKeyAttributes = GpuHashJoin.extractTopLevelAttributes(rightKeys, includeAlias = false) + val zippedKeysMapping = rightKeyAttributes.zip(leftKeyAttributes) + val rightToLeftKeyMap = zippedKeysMapping.filter { + case (Some(_), Some(_: AttributeReference)) => true + case _ => false + }.map { + case (Some(right), Some(left)) => (right.exprId, left) + case _ => throw new IllegalStateException("INTERNAL ERROR THIS SHOULD NOT BE REACHABLE") + }.toMap + + val rightData = rightOutput.filterNot(att => rightToLeftKeyMap.contains(att.exprId)) + val remappedRightOutput = rightOutput.map { att => + rightToLeftKeyMap.get(att.exprId) + .map(leftAtt => GpuAlias(leftAtt, att.name)(att.exprId)) + .getOrElse(att) } + (rightData, remappedRightOutput) } /** - * Place the columns in left and the columns in right into a single ColumnarBatch + * Spark does joins rather simply. They do it row by row, and as such don't really worry + * about how much space is being taken up. We are doing this in batches, and have the option to + * deduplicate columns that we know are the same to save even more memory. + * + * As such we do the join in a few different stages. + * + * 1. We separate out the join keys from the data that will be gathered. The join keys are used + * to produce a gather map, and then can be released. The data needs to stay until it has been + * gathered. Depending on the type of join and what is being done the join output is likely to + * contain the join keys twice. We don't want to do this because it takes up too much memory + * so we remove the keys from the data for one side of the join. + * + * 2. After this we will do the join. We can produce multiple batches from a single + * pair of input batches. The output of this stage is called the intermediate output and is the + * data columns each side of the join smashed together. + * + * 3. In some cases there is a condition that filters out data from the join that should not be + * included. In the CPU code the condition will operate on the intermediate output. In some cases + * the condition may need to be rewritten to point to the deduplicated key column. + * + * 4. Finally we need to fix up the data to produce the correct output. This should be a simple + * projection that puts the deduplicated keys back to where they need to be. */ - def combine(left: ColumnarBatch, right: ColumnarBatch): ColumnarBatch = { - val l = GpuColumnVector.extractColumns(left) - val r = GpuColumnVector.extractColumns(right) - val c = l ++ r - new ColumnarBatch(c.asInstanceOf[Array[ColumnVector]], left.numRows()) + protected lazy val (leftData, rightData, intermediateOutput, finalProject) = { + require(leftKeys.map(_.dataType) == rightKeys.map(_.dataType), + "Join keys from two sides should have same types") + val (leftData, remappedLeftOutput, rightData, remappedRightOutput) = joinType match { + case FullOuter | RightOuter | LeftOuter => + // We cannot dedupe anything here because the we can get nulls in the key columns + // at least one side + (left.output, left.output, right.output, right.output) + case _: InnerLike | LeftSemi | LeftAnti => + val (rightData, remappedRightData) = dedupDataFromKeys(right.output, rightKeys, leftKeys) + (left.output, left.output, rightData, remappedRightData) + case x => + throw new IllegalArgumentException(s"GpuHashJoin should not take $x as the JoinType") + } + + val intermediateOutput = leftData ++ rightData + + val finalProject: Seq[Expression] = joinType match { + case _: InnerLike | LeftOuter | RightOuter | FullOuter => + remappedLeftOutput ++ remappedRightOutput +// case j: ExistenceJoin => +// remappedLeftOutput :+ j.exists + case LeftExistence(_) => + remappedLeftOutput + case x => + throw new IllegalArgumentException(s"GpuHashJoin should not take $x as the JoinType") + } + (leftData, rightData, intermediateOutput, finalProject) } - // TODO eventually dedupe the keys - lazy val joinKeyIndices: Range = gpuBuildKeys.indices + protected lazy val (boundBuildKeys, boundBuildData, + boundStreamKeys, boundStreamData, + boundCondition, boundFinalProject) = { + val lkeys = GpuBindReferences.bindGpuReferences(leftKeys, left.output) + val ldata = GpuBindReferences.bindGpuReferences(leftData, left.output) + val rkeys = GpuBindReferences.bindGpuReferences(rightKeys, right.output) + val rdata = GpuBindReferences.bindGpuReferences(rightData, right.output) + val boundCondition = + condition.map(c => GpuBindReferences.bindGpuReference(c, intermediateOutput)) + val boundFinalProject = GpuBindReferences.bindGpuReferences(finalProject, intermediateOutput) - val localBuildOutput: Seq[Attribute] = buildPlan.output - // The first columns are the ones we joined on and need to remove - lazy val joinIndices: Seq[Int] = joinType match { - case RightOuter => - // The left table and right table are switched in the output - // because we don't support a right join, only left - val numRight = right.output.length - val numLeft = left.output.length - val joinLength = joinKeyIndices.length - def remap(index: Int): Int = { - if (index < numLeft) { - // part of the left table, but is on the right side of the tmp output - index + joinLength + numRight - } else { - // part of the right table, but is on the left side of the tmp output - index + joinLength - numLeft - } - } - output.indices.map (remap) - case _ => - val joinLength = joinKeyIndices.length - output.indices.map (v => v + joinLength) + buildSide match { + case GpuBuildLeft => (lkeys, ldata, rkeys, rdata, boundCondition, boundFinalProject) + case GpuBuildRight => (rkeys, rdata, lkeys, ldata, boundCondition, boundFinalProject) + } } - def doJoin(builtTable: Table, + val localBuildOutput: Seq[Attribute] = buildPlan.output + + def doJoin( + builtBatch: ColumnarBatch, stream: Iterator[ColumnarBatch], - boundCondition: Option[Expression], + targetSize: Long, + spillCallback: SpillCallback, numOutputRows: GpuMetric, joinOutputRows: GpuMetric, numOutputBatches: GpuMetric, @@ -203,191 +916,53 @@ trait GpuHashJoin extends GpuExec { joinTime: GpuMetric, filterTime: GpuMetric, totalTime: GpuMetric): Iterator[ColumnarBatch] = { - new Iterator[ColumnarBatch] { - import scala.collection.JavaConverters._ - var nextCb: Option[ColumnarBatch] = None - var first: Boolean = true - - TaskContext.get().addTaskCompletionListener[Unit](_ => closeCb()) + val realTarget = Math.max(targetSize, 10 * 1024) - def closeCb(): Unit = { - nextCb.foreach(_.close()) - nextCb = None - } + val (builtKeys, builtData) = { + val builtAnyNullable = + (joinType == LeftSemi || joinType == LeftAnti) && boundBuildKeys.forall(_.nullable) - override def hasNext: Boolean = { - var mayContinue = true - while (nextCb.isEmpty && mayContinue) { - val startTime = System.nanoTime() - if (stream.hasNext) { - val cb = stream.next() - streamTime += (System.nanoTime() - startTime) - nextCb = doJoin(builtTable, cb, boundCondition, joinOutputRows, numOutputRows, - numOutputBatches, joinTime, filterTime) - totalTime += (System.nanoTime() - startTime) - } else if (first) { - // We have to at least try one in some cases - val cb = GpuColumnVector.emptyBatch(streamedPlan.output.asJava) - streamTime += (System.nanoTime() - startTime) - nextCb = doJoin(builtTable, cb, boundCondition, joinOutputRows, numOutputRows, - numOutputBatches, joinTime, filterTime) - totalTime += (System.nanoTime() - startTime) - } else { - mayContinue = false - } - first = false - } - nextCb.isDefined + val cb = if (builtAnyNullable) { + GpuHashJoin.filterNulls(builtBatch, boundBuildKeys) + } else { + GpuColumnVector.incRefCounts(builtBatch) } - override def next(): ColumnarBatch = { - if (!hasNext) { - throw new NoSuchElementException() + withResource(cb) { cb => + closeOnExcept(GpuProjectExec.project(cb, boundBuildKeys)) { builtKeys => + (builtKeys, GpuProjectExec.project(cb, boundBuildData)) } - val ret = nextCb.get - nextCb = None - ret } } - } - private[this] def doJoin(builtTable: Table, - streamedBatch: ColumnarBatch, - boundCondition: Option[Expression], - numOutputRows: GpuMetric, - numJoinOutputRows: GpuMetric, - numOutputBatches: GpuMetric, - joinTime: GpuMetric, - filterTime: GpuMetric): Option[ColumnarBatch] = { - - val combined = withResource(streamedBatch) { streamedBatch => - withResource(GpuProjectExec.project(streamedBatch, gpuStreamedKeys)) { - streamedKeysBatch => - GpuHashJoin.incRefCount(combine(streamedKeysBatch, streamedBatch)) - } - } - val streamedTable = withResource(combined) { cb => - GpuColumnVector.from(cb) - } - - val joined = - withResource(new NvtxWithMetrics("hash join", NvtxColor.ORANGE, joinTime)) { _ => - // `doJoinLeftRight` closes the right table if the last argument (`closeRightTable`) - // is true, but never closes the left table. - buildSide match { - case GpuBuildLeft => - // tell `doJoinLeftRight` it is ok to close the `streamedTable`, this can help - // in order to close temporary/intermediary data after a filter in some scenarios. - doJoinLeftRight(builtTable, streamedTable, true) - case GpuBuildRight => - // tell `doJoinLeftRight` to not close `builtTable`, as it is owned by our caller, - // here we close the left table as that one is never closed by `doJoinLeftRight`. - withResource(streamedTable) { _ => - doJoinLeftRight(streamedTable, builtTable, false) - } + // The HashJoinIterator takes ownership of the built keys and built data. It will close + // them when it is done + val joinIterator = + new HashJoinIterator(builtKeys, builtData, stream, boundStreamKeys, boundStreamData, + streamedPlan.output, realTarget, joinType, buildSide, spillCallback, + streamTime, joinTime, totalTime) + val boundFinal = boundFinalProject + if (boundCondition.isDefined) { + val condition = boundCondition.get + joinIterator.flatMap { cb => + joinOutputRows += cb.numRows() + val tmp = GpuFilter(cb, condition, numOutputRows, numOutputBatches, filterTime) + if (tmp.numRows == 0) { + // Not sure if there is a better way to work around this + numOutputBatches.set(numOutputBatches.value - 1) + tmp.close() + None + } else { + Some(GpuProjectExec.projectAndClose(tmp, boundFinal, NoopMetric)) } } - - numJoinOutputRows += joined.numRows() - - val tmp = if (boundCondition.isDefined) { - GpuFilter(joined, boundCondition.get, numOutputRows, numOutputBatches, filterTime) - } else { - numOutputRows += joined.numRows() - numOutputBatches += 1 - joined - } - if (tmp.numRows() == 0) { - // Not sure if there is a better way to work around this - numOutputBatches.set(numOutputBatches.value - 1) - tmp.close() - None } else { - Some(tmp) - } - } - - // This is a work around added in response to https://github.com/NVIDIA/spark-rapids/issues/1643. - // to deal with slowness arising from many nulls in the build-side of the join. The work around - // should be removed when https://github.com/rapidsai/cudf/issues/7300 is addressed. - private[this] def filterNulls(table: Table, joinKeyIndices: Range, closeTable: Boolean): Table = { - var mask: ai.rapids.cudf.ColumnVector = null - try { - joinKeyIndices.indices.foreach { c => - mask = withResource(table.getColumn(c).isNotNull) { nn => - if (mask == null) { - nn.incRefCount() - } else { - withResource(mask) { _ => - mask.and(nn) - } - } - } + joinIterator.map { cb => + joinOutputRows += cb.numRows() + numOutputRows += cb.numRows() + numOutputBatches += 1 + GpuProjectExec.projectAndClose(cb, boundFinal, NoopMetric) } - table.filter(mask) - } finally { - if (mask != null) { - mask.close() - } - - // in some cases, we cannot close the table since it was the build table and is - // reused. - if (closeTable) { - table.close() - } - } - } - - private[this] def doJoinLeftRight( - leftTable: Table, rightTable: Table, closeRightTable: Boolean): ColumnarBatch = { - - def withRightTable(body: Table => Table): Table = { - val builtAnyNullable = - (joinType == LeftSemi || joinType == LeftAnti) && gpuBuildKeys.exists(_.nullable) - - if (builtAnyNullable) { - withResource(filterNulls(rightTable, joinKeyIndices, closeRightTable)) { filtered => - body(filtered) - } - } else { - try { - body(rightTable) - } finally { - if (closeRightTable) { - rightTable.close() - } - } - } - } - - val joinedTable = withRightTable { rt => - joinType match { - case LeftOuter => leftTable.onColumns(joinKeyIndices: _*) - .leftJoin(rt.onColumns(joinKeyIndices: _*), false) - case RightOuter => rt.onColumns(joinKeyIndices: _*) - .leftJoin(leftTable.onColumns(joinKeyIndices: _*), false) - case _: InnerLike => leftTable.onColumns(joinKeyIndices: _*) - .innerJoin(rt.onColumns(joinKeyIndices: _*), false) - case LeftSemi => leftTable.onColumns(joinKeyIndices: _*) - .leftSemiJoin(rt.onColumns(joinKeyIndices: _*), false) - case LeftAnti => leftTable.onColumns(joinKeyIndices: _*) - .leftAntiJoin(rt.onColumns(joinKeyIndices: _*), false) - case FullOuter => leftTable.onColumns(joinKeyIndices: _*) - .fullJoin(rt.onColumns(joinKeyIndices: _*), false) - case _ => - throw new NotImplementedError(s"Joint Type ${joinType.getClass} is not currently" + - s" supported") - } - } - - try { - val result = joinIndices.zip(output).map { case (joinIndex, outAttr) => - GpuColumnVector.from(joinedTable.getColumn(joinIndex).incRefCount(), outAttr.dataType) - }.toArray[ColumnVector] - - new ColumnarBatch(result, joinedTable.getRowCount.toInt) - } finally { - joinedTable.close() } } } From 7483cc2d31bfac22bfcdbc886ee5726ff5851c66 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 30 Apr 2021 06:49:28 -0500 Subject: [PATCH 2/9] Allow spilling of gather maps --- .../com/nvidia/spark/rapids/MetaUtils.scala | 12 ++ .../spark/rapids/SpillableColumnarBatch.scala | 70 +++++++++++- .../sql/rapids/execution/GpuHashJoin.scala | 105 +++++++++++++----- 3 files changed, 158 insertions(+), 29 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/MetaUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/MetaUtils.scala index 300fa1bceed..afa5fdf39e9 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/MetaUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/MetaUtils.scala @@ -146,6 +146,18 @@ object MetaUtils extends Arm { } } + /** + * This is a hack to create a table meta that passed muster, but is not really going to be used + */ + lazy val ignoreTableMeta: TableMeta = { + val fbb = new FlatBufferBuilder(1024) + TableMeta.startTableMeta(fbb) + TableMeta.addRowCount(fbb, 0) + fbb.finish(TableMeta.endTableMeta(fbb)) + // copy the message to trim the backing array to only what is needed + TableMeta.getRootAsTableMeta(ByteBuffer.wrap(fbb.sizedByteArray())) + } + /** * Construct a table from a contiguous device buffer and a * `TableMeta` message describing the schema of the buffer data. diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SpillableColumnarBatch.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SpillableColumnarBatch.scala index c1b17d1e40c..8164f8b8595 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SpillableColumnarBatch.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SpillableColumnarBatch.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids -import ai.rapids.cudf.ContiguousTable +import ai.rapids.cudf.{ContiguousTable, DeviceMemoryBuffer} import org.apache.spark.TaskContext import org.apache.spark.sql.rapids.TempSpillBufferId @@ -208,4 +208,72 @@ object SpillableColumnarBatch extends Arm { } } } +} + + +/** + * Just like a SpillableColumnarBatch but for buffers. + */ +class SpillableBuffer (id: TempSpillBufferId) extends AutoCloseable with Arm { + private var closed = false + + /** + * The ID that this is stored under. + * @note Use with caution because if this has been closed the id is no longer valid. + */ + def spillId: TempSpillBufferId = id + + lazy val sizeInBytes: Long = + withResource(RapidsBufferCatalog.acquireBuffer(id)) { buff => + buff.size + } + + /** + * Set a new spill priority. + */ + def setSpillPriority(priority: Long): Unit = { + withResource(RapidsBufferCatalog.acquireBuffer(id)) { rapidsBuffer => + rapidsBuffer.setSpillPriority(priority) + } + } + + /** + * Get the device buffer. + * @note It is the responsibility of the caller to close the buffer. + */ + def getDeviceBuffer(): DeviceMemoryBuffer = { + withResource(RapidsBufferCatalog.acquireBuffer(id)) { rapidsBuffer => + GpuSemaphore.acquireIfNecessary(TaskContext.get()) + rapidsBuffer.getDeviceMemoryBuffer + } + } + + /** + * Remove the buffer from the cache. + */ + override def close(): Unit = { + if (!closed) { + RapidsBufferCatalog.removeBuffer(id) + closed = true + } + } +} + +object SpillableBuffer extends Arm { + + /** + * Create a new SpillableBuffer. + * @note This takes over ownership of buffer, and buffer should not be used after this. + * @param buffer the buffer to make spillable + * @param priority the initial spill priority of this buffer + * @param spillCallback a callback when the buffer is spilled. This should be very light weight. + * It should never allocate GPU memory and really just be used for metrics. + */ + def apply(buffer: DeviceMemoryBuffer, + priority: Long, + spillCallback: RapidsBuffer.SpillCallback): SpillableBuffer = { + val id = TempSpillBufferId() + RapidsBufferCatalog.addBuffer(id, buffer, MetaUtils.ignoreTableMeta, priority, spillCallback) + new SpillableBuffer(id) + } } \ No newline at end of file diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index bff755aaaa6..c47c12400e4 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -15,7 +15,7 @@ */ package org.apache.spark.sql.rapids.execution -import ai.rapids.cudf.{ColumnView, DType, GatherMap, NvtxColor, NvtxRange, OrderByArg, Scalar, Table} +import ai.rapids.cudf.{ColumnView, DeviceMemoryBuffer, DType, GatherMap, NvtxColor, NvtxRange, OrderByArg, Scalar, Table} import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.RapidsBuffer.SpillCallback @@ -266,6 +266,47 @@ class LazySpillableColumnarBatch( } } +class LazySpillableGatherMap( + map: GatherMap, + spillCallback: SpillCallback) extends AutoCloseable with Arm { + + val getRowCount: Long = map.getRowCount + + private var cached: Option[DeviceMemoryBuffer] = Some(map.releaseBuffer()) + private var spill: Option[SpillableBuffer] = None + + def toColumnView(startRow: Long, numRows: Int): ColumnView = { + ColumnView.fromDeviceBuffer(getBuffer, startRow * 4L, DType.INT32, numRows) + } + + private def getBuffer = synchronized { + if (cached.isEmpty) { + cached = Some(spill.get.getDeviceBuffer()) + } + cached.get + } + + def allowSpilling(): Unit = synchronized { + if (spill.isEmpty && cached.isDefined) { + // First time we need to allow for spilling + spill = Some(SpillableBuffer(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableBuffer takes ownership of it. + cached = None + } + cached.foreach(_.close()) + cached = None + } + + override def close(): Unit = synchronized { + cached.foreach(_.close()) + cached = None + spill.foreach(_.close()) + spill = None + } +} + object JoinGathererImpl { /** @@ -327,8 +368,7 @@ object JoinGathererImpl { * JoinGatherer for a single map/table */ class JoinGathererImpl( - // TODO need a way to spill/cache the GatherMap - private val gatherMap: GatherMap, + private val gatherMap: LazySpillableGatherMap, private val data: LazySpillableColumnarBatch, private val closeData: Boolean) extends JoinGatherer { @@ -379,6 +419,7 @@ class JoinGathererImpl( override def allowSpilling(): Unit = { data.allowSpilling() + gatherMap.allowSpilling() } override def getBitSizeMap(n: Int): ColumnView = synchronized { @@ -500,15 +541,15 @@ case class MultiJoinGather(left: JoinGatherer, right: JoinGatherer) extends Join } object JoinGatherer extends Arm { - def apply(gatherMap: GatherMap, + def apply(gatherMap: LazySpillableGatherMap, inputData: LazySpillableColumnarBatch, closeData: Boolean): JoinGatherer = new JoinGathererImpl(gatherMap, inputData, closeData) - def apply(leftMap: GatherMap, + def apply(leftMap: LazySpillableGatherMap, leftData: LazySpillableColumnarBatch, closeLeftData: Boolean, - rightMap: GatherMap, + rightMap: LazySpillableGatherMap, rightData: LazySpillableColumnarBatch, closeRightData: Boolean): JoinGatherer = { val left = JoinGatherer(leftMap, leftData, closeLeftData) @@ -610,30 +651,38 @@ class HashJoinIterator( case GpuBuildRight => (true, false) case GpuBuildLeft => (false, true) } - val gatherer = maps.length match { - case 1 => - if (joinerOwnsRightData) { - rightData.close() - } - JoinGatherer(maps(0), leftData, joinerOwnsLeftData) - case 2 => if (rightData.numCols == 0) { - maps(1).close() - if (joinerOwnsRightData) { - rightData.close() - } - JoinGatherer(maps(0), leftData, joinerOwnsLeftData) + try { + val gatherer = maps.length match { + case 1 => + if (joinerOwnsRightData) { + rightData.close() + } + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + leftData, joinerOwnsLeftData) + case 2 => + if (rightData.numCols == 0) { + if (joinerOwnsRightData) { + rightData.close() + } + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + leftData, joinerOwnsLeftData) + } else { + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + leftData, joinerOwnsLeftData, + new LazySpillableGatherMap(maps(1), spillCallback), + rightData, joinerOwnsRightData) + } + case other => + throw new IllegalArgumentException(s"Got back unexpected number of gather maps $other") + } + if (gatherer.isDone) { + gatherer.close() + None } else { - JoinGatherer(maps(0), leftData, joinerOwnsLeftData, - maps(1), rightData, joinerOwnsRightData) + Some(gatherer) } - case other => - throw new IllegalArgumentException(s"Got back unexpected number of gather maps $other") - } - if (gatherer.isDone) { - gatherer.close() - None - } else { - Some(gatherer) + } finally { + maps.foreach(_.close()) } } From a39ff12a2701015c12bb56c6909fda8d57ab3d50 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 30 Apr 2021 15:16:21 -0500 Subject: [PATCH 3/9] Some fixes and cleanup --- integration_tests/src/main/python/asserts.py | 2 +- .../shims/spark300/GpuBroadcastHashJoinExec.scala | 9 +++++---- .../shims/spark301/GpuBroadcastHashJoinExec.scala | 9 +++++---- .../shims/spark301db/GpuBroadcastHashJoinExec.scala | 9 +++++---- .../shims/spark311/GpuBroadcastHashJoinExec.scala | 9 +++++---- .../java/com/nvidia/spark/rapids/GpuColumnVector.java | 3 ++- .../com/nvidia/spark/rapids/GpuBoundAttribute.scala | 10 +++++++++- 7 files changed, 32 insertions(+), 19 deletions(-) diff --git a/integration_tests/src/main/python/asserts.py b/integration_tests/src/main/python/asserts.py index 1f50028a817..c0464a476aa 100644 --- a/integration_tests/src/main/python/asserts.py +++ b/integration_tests/src/main/python/asserts.py @@ -36,7 +36,7 @@ def _assert_equal(cpu, gpu, float_check, path): for index in range(len(cpu)): _assert_equal(cpu[index], gpu[index], float_check, path + [index]) elif (t is list): - assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu)) + assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu)) for index in range(len(cpu)): _assert_equal(cpu[index], gpu[index], float_check, path + [index]) elif (t is pytypes.GeneratorType): diff --git a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala index 92156e550e1..c62d417e158 100644 --- a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala +++ b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala @@ -148,12 +148,13 @@ case class GpuBroadcastHashJoinExec( val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - lazy val builtBatch = broadcastRelation.value.batch - val rdd = streamedPlan.executeColumnar() - rdd.mapPartitions(it => + rdd.mapPartitions { it => + val builtBatch = broadcastRelation.value.batch + GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected()) doJoin(builtBatch, it, targetSize, spillCallback, numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, - filterTime, totalTime)) + filterTime, totalTime) + } } } diff --git a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala index 8e5b1f34978..ffc51cc8ff2 100644 --- a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala @@ -146,12 +146,13 @@ case class GpuBroadcastHashJoinExec( val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - lazy val builtBatch = broadcastRelation.value.batch - val rdd = streamedPlan.executeColumnar() - rdd.mapPartitions(it => + rdd.mapPartitions { it => + val builtBatch = broadcastRelation.value.batch + GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected()) doJoin(builtBatch, it, targetSize, spillCallback, numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, - filterTime, totalTime)) + filterTime, totalTime) + } } } diff --git a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala index 7e7c2310559..b906663f95a 100644 --- a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala @@ -145,12 +145,13 @@ case class GpuBroadcastHashJoinExec( val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - lazy val builtBatch = broadcastRelation.value.batch - val rdd = streamedPlan.executeColumnar() - rdd.mapPartitions(it => + rdd.mapPartitions { it => + val builtBatch = broadcastRelation.value.batch + GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected()) doJoin(builtBatch, it, targetSize, spillCallback, numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, - filterTime, totalTime)) + filterTime, totalTime) + } } } diff --git a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala index a2ca9926f28..204e0cc06e1 100644 --- a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala +++ b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala @@ -150,12 +150,13 @@ case class GpuBroadcastHashJoinExec( val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() - lazy val builtBatch = broadcastRelation.value.batch - val rdd = streamedPlan.executeColumnar() - rdd.mapPartitions(it => + rdd.mapPartitions { it => + val builtBatch = broadcastRelation.value.batch + GpuColumnVector.extractBases(builtBatch).foreach(_.noWarnLeakExpected()) doJoin(builtBatch, it, targetSize, spillCallback, numOutputRows, joinOutputRows, numOutputBatches, streamTime, joinTime, - filterTime, totalTime)) + filterTime, totalTime) + } } } diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java index 003d4bc57bf..7ea5eaf0dbb 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVector.java @@ -671,7 +671,8 @@ static boolean typeConversionAllowed(Table table, DataType[] colTypes, int start */ static boolean typeConversionAllowed(Table table, DataType[] colTypes) { final int numColumns = table.getNumberOfColumns(); - assert numColumns == colTypes.length: "The number of columns and the number of types don't match"; + assert numColumns == colTypes.length: "The number of columns and the number of types don't " + + "match " + table + " " + Arrays.toString(colTypes); boolean ret = true; for (int colIndex = 0; colIndex < numColumns; colIndex++) { ret = ret && typeConversionAllowed(table.getColumn(colIndex), colTypes[colIndex]); diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala index f01f63f3e80..c8101792d7e 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala @@ -96,6 +96,14 @@ case class GpuBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" override def columnarEval(batch: ColumnarBatch): Any = { - batch.column(ordinal).asInstanceOf[GpuColumnVector].incRefCount() + batch.column(ordinal) match { + case fb: GpuColumnVectorFromBuffer => + // When doing a project we might re-order columns or do other things that make it + // so this no loner looks like the original contiguous buffer it came from + // so to avoid it appearing to down stream processing as the same buffer we change + // the type here. + new GpuColumnVector(fb.dataType(), fb.getBase.incRefCount()) + case cv: GpuColumnVector => cv.incRefCount() + } } } From 0ecd1b7fc6d878aab263214db05fd2444c6e97e7 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 30 Apr 2021 15:35:28 -0500 Subject: [PATCH 4/9] Some better profiling --- .../sql/rapids/execution/GpuHashJoin.scala | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index c47c12400e4..ff233e4e173 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -229,7 +229,8 @@ trait JoinGatherer extends AutoCloseable with Arm { */ class LazySpillableColumnarBatch( cb: ColumnarBatch, - spillCallback: SpillCallback) extends AutoCloseable with Arm { + spillCallback: SpillCallback, + name: String) extends AutoCloseable with Arm { private var cached: Option[ColumnarBatch] = Some(GpuColumnVector.incRefCounts(cb)) private var spill: Option[SpillableColumnarBatch] = None @@ -240,19 +241,23 @@ class LazySpillableColumnarBatch( def getBatch: ColumnarBatch = synchronized { if (cached.isEmpty) { - cached = Some(spill.get.getColumnarBatch()) + withResource(new NvtxRange("get batch " + name, NvtxColor.RED)) { _ => + cached = Some(spill.get.getColumnarBatch()) + } } cached.get } def allowSpilling(): Unit = synchronized { if (spill.isEmpty && cached.isDefined) { - // First time we need to allow for spilling - spill = Some(SpillableColumnarBatch(cached.get, - SpillPriorities.ACTIVE_ON_DECK_PRIORITY, - spillCallback)) - // Putting data in a SpillableColumnarBatch takes ownership of it. - cached = None + withResource(new NvtxRange("spill batch " + name, NvtxColor.RED)) { _ => + // First time we need to allow for spilling + spill = Some(SpillableColumnarBatch(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableColumnarBatch takes ownership of it. + cached = None + } } cached.foreach(_.close()) cached = None @@ -268,7 +273,8 @@ class LazySpillableColumnarBatch( class LazySpillableGatherMap( map: GatherMap, - spillCallback: SpillCallback) extends AutoCloseable with Arm { + spillCallback: SpillCallback, + name: String) extends AutoCloseable with Arm { val getRowCount: Long = map.getRowCount @@ -281,19 +287,23 @@ class LazySpillableGatherMap( private def getBuffer = synchronized { if (cached.isEmpty) { - cached = Some(spill.get.getDeviceBuffer()) + withResource(new NvtxRange("get map " + name, NvtxColor.RED)) { _ => + cached = Some(spill.get.getDeviceBuffer()) + } } cached.get } def allowSpilling(): Unit = synchronized { if (spill.isEmpty && cached.isDefined) { - // First time we need to allow for spilling - spill = Some(SpillableBuffer(cached.get, - SpillPriorities.ACTIVE_ON_DECK_PRIORITY, - spillCallback)) - // Putting data in a SpillableBuffer takes ownership of it. - cached = None + withResource(new NvtxRange("spill map " + name, NvtxColor.RED)) { _ => + // First time we need to allow for spilling + spill = Some(SpillableBuffer(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableBuffer takes ownership of it. + cached = None + } } cached.foreach(_.close()) cached = None @@ -598,13 +608,13 @@ class HashJoinIterator( private var nextCb: Option[ColumnarBatch] = None private var gathererStore: Option[JoinGatherer] = None private val builtKeys = { - val tmp = new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback) + val tmp = new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback, "build_keys") // Close the input keys, the lazy spillable batch now owns it. inputBuiltKeys.close() tmp } private val builtData = { - val tmp = new LazySpillableColumnarBatch(inputBuiltData, spillCallback) + val tmp = new LazySpillableColumnarBatch(inputBuiltData, spillCallback, "build_data") // Close the input data, the lazy spillable batch now owns it. inputBuiltData.close() tmp @@ -657,19 +667,19 @@ class HashJoinIterator( if (joinerOwnsRightData) { rightData.close() } - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), leftData, joinerOwnsLeftData) case 2 => if (rightData.numCols == 0) { if (joinerOwnsRightData) { rightData.close() } - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), leftData, joinerOwnsLeftData) } else { - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback), + JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), leftData, joinerOwnsLeftData, - new LazySpillableGatherMap(maps(1), spillCallback), + new LazySpillableGatherMap(maps(1), spillCallback, "right_map"), rightData, joinerOwnsRightData) } case other => @@ -742,7 +752,7 @@ class HashJoinIterator( withResource(GpuProjectExec.project(streamCb, boundStreamKeys)) { streamKeys => withResource(GpuProjectExec.project(streamCb, boundStreamData)) { streamData => joinGatherMap(buildKeys, buildData, - streamKeys, new LazySpillableColumnarBatch(streamData, spillCallback)) + streamKeys, new LazySpillableColumnarBatch(streamData, spillCallback, "stream_data")) } } } From b6a0c79de1b83f1c33b1ba75c7613c3797b8fcc9 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 30 Apr 2021 16:45:47 -0500 Subject: [PATCH 5/9] Addressed some review comments --- .../sql/rapids/execution/GpuHashJoin.scala | 125 ++++++++++-------- 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index ff233e4e173..5e6ba7a7fb3 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -130,7 +130,6 @@ object GpuHashJoin extends Arm { } finally { if (mask != null) { mask.close() - mask = null } } } @@ -176,12 +175,13 @@ trait JoinGatherer extends AutoCloseable with Arm { def realCheapPerRowSizeEstimate: Double /** - * Get the bit count size map for the next n rows to be gathered. + * Get the bit count size map for the next n rows to be gathered. The returned value is + * an INT64 for each row in the n rows requested. */ def getBitSizeMap(n: Int): ColumnView /** - * If the data is all fixed width return the size of each row, otherwise return null. + * If the data is all fixed width return the size of each row, otherwise return None. */ def getFixedWidthBitSize: Option[Int] @@ -194,8 +194,12 @@ trait JoinGatherer extends AutoCloseable with Arm { if (bitSizePerRow.isDefined) { Math.min(Math.min((targetSize/bitSizePerRow.get) / 8, numRowsLeft), Integer.MAX_VALUE).toInt } else { + // WARNING magic number below. The rowEstimateMultiplier is arbitrary, we want to get + // enough rows that we include that we go over the target size, but not too much so we + // waste memory. It could probably be tuned better. + val rowEstimateMultiplier = 1.1 val estimatedRows = Math.min( - ((targetSize / realCheapPerRowSizeEstimate) * 1.1).toLong, + ((targetSize / realCheapPerRowSizeEstimate) * rowEstimateMultiplier).toLong, numRowsLeft) val numRowsToProbe = Math.min(estimatedRows, Integer.MAX_VALUE).toInt if (numRowsToProbe <= 0) { @@ -215,8 +219,7 @@ trait JoinGatherer extends AutoCloseable with Arm { } withResource(cutoff) { cutoff => withResource(cutoff.copyToHost()) { hostCutoff => - hostCutoff.getInt(0) - hostCutoff.getInt(0) + Math.max(1, hostCutoff.getInt(0)) } } } @@ -239,7 +242,7 @@ class LazySpillableColumnarBatch( val dataTypes: Array[DataType] = GpuColumnVector.extractTypes(cb) val numCols: Int = dataTypes.length - def getBatch: ColumnarBatch = synchronized { + def getBatch: ColumnarBatch = { if (cached.isEmpty) { withResource(new NvtxRange("get batch " + name, NvtxColor.RED)) { _ => cached = Some(spill.get.getColumnarBatch()) @@ -248,7 +251,7 @@ class LazySpillableColumnarBatch( cached.get } - def allowSpilling(): Unit = synchronized { + def allowSpilling(): Unit = { if (spill.isEmpty && cached.isDefined) { withResource(new NvtxRange("spill batch " + name, NvtxColor.RED)) { _ => // First time we need to allow for spilling @@ -263,7 +266,7 @@ class LazySpillableColumnarBatch( cached = None } - override def close(): Unit = synchronized { + override def close(): Unit = { cached.foreach(_.close()) cached = None spill.foreach(_.close()) @@ -285,7 +288,7 @@ class LazySpillableGatherMap( ColumnView.fromDeviceBuffer(getBuffer, startRow * 4L, DType.INT32, numRows) } - private def getBuffer = synchronized { + private def getBuffer = { if (cached.isEmpty) { withResource(new NvtxRange("get map " + name, NvtxColor.RED)) { _ => cached = Some(spill.get.getDeviceBuffer()) @@ -294,7 +297,7 @@ class LazySpillableGatherMap( cached.get } - def allowSpilling(): Unit = synchronized { + def allowSpilling(): Unit = { if (spill.isEmpty && cached.isDefined) { withResource(new NvtxRange("spill map " + name, NvtxColor.RED)) { _ => // First time we need to allow for spilling @@ -309,7 +312,7 @@ class LazySpillableGatherMap( cached = None } - override def close(): Unit = synchronized { + override def close(): Unit = { cached.foreach(_.close()) cached = None spill.foreach(_.close()) @@ -385,8 +388,6 @@ class JoinGathererImpl( // How much of the gather map we have output so far private var gatheredUpTo: Long = 0 private val totalRows: Long = gatherMap.getRowCount - private val totalInputRows: Int = data.numRows - private val totalInputSize: Long = data.deviceMemorySize private val (fixedWidthRowSizeBits, nullRowSizeBits) = { val dts = data.dataTypes val fw = JoinGathererImpl.fixedWidthRowSizeBits(dts) @@ -395,6 +396,8 @@ class JoinGathererImpl( } override def realCheapPerRowSizeEstimate: Double = { + val totalInputRows: Int = data.numRows + val totalInputSize: Long = data.deviceMemorySize // Avoid divide by 0 here and later on if (totalInputRows > 0 && totalInputSize > 0) { totalInputSize.toDouble / totalInputRows @@ -405,15 +408,15 @@ class JoinGathererImpl( override def getFixedWidthBitSize: Option[Int] = fixedWidthRowSizeBits - override def gatherNext(n: Int): ColumnarBatch = synchronized { + override def gatherNext(n: Int): ColumnarBatch = { val start = gatheredUpTo assert((start + n) <= totalRows) val ret = withResource(gatherMap.toColumnView(start, n)) { gatherView => val batch = data.getBatch - val gatheredTab = withResource(GpuColumnVector.from(batch)) { table => + val gatheredTable = withResource(GpuColumnVector.from(batch)) { table => table.gather(gatherView) } - withResource(gatheredTab) { gt => + withResource(gatheredTable) { gt => GpuColumnVector.from(gt, GpuColumnVector.extractTypes(batch)) } } @@ -421,9 +424,8 @@ class JoinGathererImpl( ret } - override def isDone: Boolean = synchronized { + override def isDone: Boolean = gatheredUpTo >= totalRows - } override def numRowsLeft: Long = totalRows - gatheredUpTo @@ -432,7 +434,7 @@ class JoinGathererImpl( gatherMap.allowSpilling() } - override def getBitSizeMap(n: Int): ColumnView = synchronized { + override def getBitSizeMap(n: Int): ColumnView = { val cb = data.getBatch val inputBitCounts = withResource(GpuColumnVector.from(cb)) { table => withResource(table.rowBitCount()) { bits => @@ -466,7 +468,7 @@ class JoinGathererImpl( } } - override def close(): Unit = synchronized { + override def close(): Unit = { gatherMap.close() if (closeData) { data.close() @@ -512,7 +514,9 @@ case class MultiJoinGather(left: JoinGatherer, right: JoinGatherer) extends Join override def getBitSizeMap(n: Int): ColumnView = { (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { - case (Some(l), Some(r)) => // This should never happen, but just in case + case (Some(l), Some(r)) => + // This should never happen because all fixed width should be covered by + // a faster code path. But just in case we provide it anyways. withResource(GpuScalar.from(l.toLong + r.toLong, LongType)) { s => ai.rapids.cudf.ColumnVector.fromScalar(s, n) } @@ -607,26 +611,26 @@ class HashJoinIterator( private var initialJoin = true private var nextCb: Option[ColumnarBatch] = None private var gathererStore: Option[JoinGatherer] = None - private val builtKeys = { - val tmp = new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback, "build_keys") - // Close the input keys, the lazy spillable batch now owns it. - inputBuiltKeys.close() - tmp + // Close the input keys, the lazy spillable batch now owns it. + private val builtKeys = withResource(inputBuiltKeys) { inputBuiltKeys => + new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback, "build_keys") } - private val builtData = { - val tmp = new LazySpillableColumnarBatch(inputBuiltData, spillCallback, "build_data") - // Close the input data, the lazy spillable batch now owns it. - inputBuiltData.close() - tmp + // Close the input data, the lazy spillable batch now owns it. + private val builtData = withResource(inputBuiltData) { inputBuiltData => + new LazySpillableColumnarBatch(inputBuiltData, spillCallback, "build_data") } + private var closed = false def close(): Unit = { - builtKeys.close() - builtData.close() - nextCb.foreach(_.close()) - nextCb = None - gathererStore.foreach(_.close()) - gathererStore = None + if (!closed) { + builtKeys.close() + builtData.close() + nextCb.foreach(_.close()) + nextCb = None + gathererStore.foreach(_.close()) + gathererStore = None + closed = true + } } TaskContext.get().addTaskCompletionListener[Unit](_ => close()) @@ -642,7 +646,7 @@ class HashJoinIterator( gathererStore = None } - if (ret.isDefined && gathererStore.isDefined) { + if (ret.isDefined) { // We are about to return something. We got everything we need from it so now let it spill // if there is more to be gathered later on. gathererStore.foreach(_.allowSpilling()) @@ -662,28 +666,30 @@ class HashJoinIterator( case GpuBuildLeft => (false, true) } try { - val gatherer = maps.length match { - case 1 => + val leftMap = maps.head + val rightMap = if (maps.length > 1) { + if (rightData.numCols == 0) { + // No data so don't both with it + None + } else { + Some(maps(1)) + } + } else { + None + } + + val gatherer = rightMap match { + case None => if (joinerOwnsRightData) { rightData.close() } - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), + JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData, joinerOwnsLeftData) - case 2 => - if (rightData.numCols == 0) { - if (joinerOwnsRightData) { - rightData.close() - } - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), - leftData, joinerOwnsLeftData) - } else { - JoinGatherer(new LazySpillableGatherMap(maps(0), spillCallback, "left_map"), + case Some(right) => + JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData, joinerOwnsLeftData, - new LazySpillableGatherMap(maps(1), spillCallback, "right_map"), + new LazySpillableGatherMap(right, spillCallback, "right_map"), rightData, joinerOwnsRightData) - } - case other => - throw new IllegalArgumentException(s"Got back unexpected number of gather maps $other") } if (gatherer.isDone) { gatherer.close() @@ -922,7 +928,12 @@ trait GpuHashJoin extends GpuExec { // We cannot dedupe anything here because the we can get nulls in the key columns // at least one side (left.output, left.output, right.output, right.output) - case _: InnerLike | LeftSemi | LeftAnti => + case LeftSemi | LeftAnti => + // These only need the keys from the right hand side, in fact there should only be keys on + // the right hand side, except if there is a condition, but we don't support conditions for + // these joins, so it is OK + (left.output, left.output, Seq.empty, Seq.empty) + case _: InnerLike => val (rightData, remappedRightData) = dedupDataFromKeys(right.output, rightKeys, leftKeys) (left.output, left.output, rightData, remappedRightData) case x => @@ -934,8 +945,6 @@ trait GpuHashJoin extends GpuExec { val finalProject: Seq[Expression] = joinType match { case _: InnerLike | LeftOuter | RightOuter | FullOuter => remappedLeftOutput ++ remappedRightOutput -// case j: ExistenceJoin => -// remappedLeftOutput :+ j.exists case LeftExistence(_) => remappedLeftOutput case x => From 43fe63173ded27f49d4304a220addbc16dd35fea Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Sat, 1 May 2021 09:31:08 -0500 Subject: [PATCH 6/9] Refactored JoinGatherer to new file and cleanup of ownership --- .../nvidia/spark/rapids/JoinGatherer.scala | 566 ++++++++++++++++++ .../sql/rapids/execution/GpuHashJoin.scala | 500 +--------------- 2 files changed, 585 insertions(+), 481 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala new file mode 100644 index 00000000000..530b3d3cc78 --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids + +import ai.rapids.cudf.{ColumnView, DeviceMemoryBuffer, DType, GatherMap, NvtxColor, NvtxRange, OrderByArg, Scalar, Table} +import com.nvidia.spark.rapids.RapidsBuffer.SpillCallback + +import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataType, DateType, DecimalType, LongType, NullType, NumericType, StringType, StructType, TimestampType} +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Holds something that can be spilled if it is marked as such, but it does not modify the + * data until it is ready to be spilled. This avoids the performance penalty of making reformatting + * the underlying data so it is ready to be spilled. + * + * Call `allowSpilling` to indicate that the data can be released for spilling and call `close` + * to indicate that the data is not needed any longer. + * + * If the data is needed after `allowSpilling` is called the implementations should get the data + * back and cache it again until allowSpilling is called once more. + */ +trait LazySpillable extends AutoCloseable { + + /** + * Indicate that we are done messing with the data for now and it can be spilled. + */ + def allowSpilling(): Unit +} + +/** + * Generic trait for all join gather instances. A JoinGatherer takes the gather maps that are the + * result of a cudf join call along with the data batches that need to be gathered and allow + * someone to materialize the join in batches. It also provides APIs to help decide on how + * many rows to gather. + * + * This is a LazySpillable instance so the life cycle follows that too. + */ +trait JoinGatherer extends LazySpillable with Arm { + /** + * Gather the next n rows from the join gather maps. + * + * @param n how many rows to gather + * @return the gathered data as a ColumnarBatch + */ + def gatherNext(n: Int): ColumnarBatch + + /** + * Is all of the data gathered so far. + */ + def isDone: Boolean + + /** + * Number of rows left to gather + */ + def numRowsLeft: Long + + /** + * A really fast and dirty way to estimate the size of each row in the join output measured as in + * bytes. + */ + def realCheapPerRowSizeEstimate: Double + + /** + * Get the bit count size map for the next n rows to be gathered. It returns a column of + * INT64 values. One for each of the next n rows requested. This is a bit count to deal with + * validity bits, etc. This is an INT64 to allow a prefix sum (running total) to be done on + * it without overflowing so we can compute an accurate cuttoff point for a batch size limit. + */ + def getBitSizeMap(n: Int): ColumnView + + /** + * If the data is all fixed width return the size of each row, otherwise return None. + */ + def getFixedWidthBitSize: Option[Int] + + /** + * Do a complete/expensive job to get the number of rows that can be gathered to get close + * to the targetSize for the final output. + * + * @param targetSize The target size in bytes for the final output batch. + */ + def gatherRowEstimate(targetSize: Long): Int = { + val bitSizePerRow = getFixedWidthBitSize + if (bitSizePerRow.isDefined) { + Math.min(Math.min((targetSize / bitSizePerRow.get) / 8, numRowsLeft), Integer.MAX_VALUE).toInt + } else { + // WARNING magic number below. The rowEstimateMultiplier is arbitrary, we want to get + // enough rows that we include that we go over the target size, but not too much so we + // waste memory. It could probably be tuned better. + val rowEstimateMultiplier = 1.1 + val estimatedRows = Math.min( + ((targetSize / realCheapPerRowSizeEstimate) * rowEstimateMultiplier).toLong, + numRowsLeft) + val numRowsToProbe = Math.min(estimatedRows, Integer.MAX_VALUE).toInt + if (numRowsToProbe <= 0) { + 1 + } else { + val sum = withResource(getBitSizeMap(numRowsToProbe)) { bitSizes => + bitSizes.prefixSum() + } + val cutoff = withResource(sum) { sum => + // Lower bound needs tables, so we have to wrap everything in tables... + withResource(new Table(sum)) { sumTable => + withResource(ai.rapids.cudf.ColumnVector.fromLongs(targetSize * 8)) { bound => + withResource(new Table(bound)) { boundTab => + sumTable.lowerBound(boundTab, OrderByArg.asc(0)) + } + } + } + } + withResource(cutoff) { cutoff => + withResource(cutoff.copyToHost()) { hostCutoff => + Math.max(1, hostCutoff.getInt(0)) + } + } + } + } + } +} + +object JoinGatherer extends Arm { + def apply(gatherMap: LazySpillableGatherMap, + inputData: LazySpillableColumnarBatch): JoinGatherer = + new JoinGathererImpl(gatherMap, inputData) + + def apply(leftMap: LazySpillableGatherMap, + leftData: LazySpillableColumnarBatch, + rightMap: LazySpillableGatherMap, + rightData: LazySpillableColumnarBatch): JoinGatherer = { + val left = JoinGatherer(leftMap, leftData) + val right = JoinGatherer(rightMap, rightData) + MultiJoinGather(left, right) + } + + def getRowsInNextBatch(gatherer: JoinGatherer, targetSize: Long): Int = { + withResource(new NvtxRange("calc gather size", NvtxColor.YELLOW)) { _ => + val rowsLeft = gatherer.numRowsLeft + val rowEstimate: Long = gatherer.getFixedWidthBitSize match { + case Some(fixedSize) => + // Odd corner cases for tests, make sure we do at least one row + Math.max(1, (targetSize / fixedSize) / 8) + case None => + // Heuristic to see if we need to do the expensive calculation + if (rowsLeft * gatherer.realCheapPerRowSizeEstimate <= targetSize * 0.75) { + rowsLeft + } else { + gatherer.gatherRowEstimate(targetSize) + } + } + Math.min(Math.min(rowEstimate, rowsLeft), Integer.MAX_VALUE).toInt + } + } +} + + +/** + * Holds a Columnar batch that is LazySpillable. + */ +trait LazySpillableColumnarBatch extends LazySpillable { + /** + * How many rows are in the underlying batch. Should not unspill the batch to get this into. + */ + def numRows: Int + + /** + * How many columns are in the underlying batch. Should not unspill the batch to get this info. + */ + def numCols: Int + + /** + * The amount of device memory in bytes that the underlying batch uses. Should not unspill the + * batch to get this info. + */ + def deviceMemorySize: Long + + /** + * The data types of the underlying batches columns. Should not unspill the batch to get this + * info. + */ + def dataTypes: Array[DataType] + + + /** + * Get the batch that this wraps and unspill it if needed. + */ + def getBatch: ColumnarBatch + +} + +object LazySpillableColumnarBatch { + def apply(cb: ColumnarBatch, + spillCallback: SpillCallback, + name: String): LazySpillableColumnarBatch = + new LazySpillableColumnarBatchImpl(cb, spillCallback, name) + + def spillOnly(wrapped: LazySpillableColumnarBatch): LazySpillableColumnarBatch = wrapped match { + case alreadyGood: AllowSpillOnlyLazySpillableColumnarBatchImpl => alreadyGood + case anythingElse => AllowSpillOnlyLazySpillableColumnarBatchImpl(anythingElse) + } +} + +/** + * A version of `LazySpillableColumnarBatch` where instead of closing the underlying + * batch it is only spilled. This is used for cases, like with a streaming hash join + * where the data itself needs to out live the JoinGatherer it is haded off to. + */ +case class AllowSpillOnlyLazySpillableColumnarBatchImpl(wrapped: LazySpillableColumnarBatch) + extends LazySpillableColumnarBatch { + override def getBatch: ColumnarBatch = + wrapped.getBatch + + override def numRows: Int = wrapped.numRows + override def numCols: Int = wrapped.numCols + override def deviceMemorySize: Long = wrapped.deviceMemorySize + override def dataTypes: Array[DataType] = wrapped.dataTypes + + override def allowSpilling(): Unit = + wrapped.allowSpilling() + + override def close(): Unit = { + // Don't actually close it, we don't own it, just allow it to be spilled. + wrapped.allowSpilling() + } +} + +/** + * Holds a columnar batch that is cached until it is marked that it can be spilled. + */ +class LazySpillableColumnarBatchImpl( + cb: ColumnarBatch, + spillCallback: SpillCallback, + name: String) extends LazySpillableColumnarBatch with Arm { + + private var cached: Option[ColumnarBatch] = Some(GpuColumnVector.incRefCounts(cb)) + private var spill: Option[SpillableColumnarBatch] = None + override val numRows: Int = cb.numRows() + override val deviceMemorySize: Long = GpuColumnVector.getTotalDeviceMemoryUsed(cb) + override val dataTypes: Array[DataType] = GpuColumnVector.extractTypes(cb) + override val numCols: Int = dataTypes.length + + override def getBatch: ColumnarBatch = { + if (cached.isEmpty) { + withResource(new NvtxRange("get batch " + name, NvtxColor.RED)) { _ => + cached = Some(spill.get.getColumnarBatch()) + } + } + cached.get + } + + override def allowSpilling(): Unit = { + if (spill.isEmpty && cached.isDefined) { + withResource(new NvtxRange("spill batch " + name, NvtxColor.RED)) { _ => + // First time we need to allow for spilling + spill = Some(SpillableColumnarBatch(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableColumnarBatch takes ownership of it. + cached = None + } + } + cached.foreach(_.close()) + cached = None + } + + override def close(): Unit = { + cached.foreach(_.close()) + cached = None + spill.foreach(_.close()) + spill = None + } +} + +/** + * Holds a gather map that is also lazy spillable. + */ +class LazySpillableGatherMap( + map: GatherMap, + spillCallback: SpillCallback, + name: String) extends LazySpillable with Arm { + + val getRowCount: Long = map.getRowCount + + private var cached: Option[DeviceMemoryBuffer] = Some(map.releaseBuffer()) + private var spill: Option[SpillableBuffer] = None + + /** + * Get a ColumnView that can be used to do a cudf gather. + */ + def toColumnView(startRow: Long, numRows: Int): ColumnView = { + ColumnView.fromDeviceBuffer(getBuffer, startRow * 4L, DType.INT32, numRows) + } + + private def getBuffer = { + if (cached.isEmpty) { + withResource(new NvtxRange("get map " + name, NvtxColor.RED)) { _ => + cached = Some(spill.get.getDeviceBuffer()) + } + } + cached.get + } + + def allowSpilling(): Unit = { + if (spill.isEmpty && cached.isDefined) { + withResource(new NvtxRange("spill map " + name, NvtxColor.RED)) { _ => + // First time we need to allow for spilling + spill = Some(SpillableBuffer(cached.get, + SpillPriorities.ACTIVE_ON_DECK_PRIORITY, + spillCallback)) + // Putting data in a SpillableBuffer takes ownership of it. + cached = None + } + } + cached.foreach(_.close()) + cached = None + } + + override def close(): Unit = { + cached.foreach(_.close()) + cached = None + spill.foreach(_.close()) + spill = None + } +} + +object JoinGathererImpl { + + /** + * Calculate the row size in bits for a fixed width schema. If a type is encountered that is + * not fixed width, or is not known a None is returned. + */ + def fixedWidthRowSizeBits(dts: Seq[DataType]): Option[Int] = + sumRowSizesBits(dts, nullValueCalc = false) + + /** + * Calculate the null row size for a given schema in bits. If an unexpected type is encountered + * an exception is thrown + */ + def nullRowSizeBits(dts: Seq[DataType]): Int = + sumRowSizesBits(dts, nullValueCalc = true).get + + + /** + * Sum the row sizes for each data type passed in. If any one of the sizes is not available + * the entire result is considered to not be available. If nullValueCalc is true a result is + * guaranteed to be returned or an exception thrown. + */ + private def sumRowSizesBits(dts: Seq[DataType], nullValueCalc: Boolean): Option[Int] = { + val allOptions = dts.map(calcRowSizeBits(_, nullValueCalc)) + if (allOptions.exists(_.isEmpty)) { + None + } else { + Some(allOptions.map(_.get).sum + 1) + } + } + + /** + * Calculate the row bit size for the given data type. If nullValueCalc is false + * then variable width types and unexpected types will result in a None being returned. + * If it is true variable width types will have a value returned that corresponds to a + * null, and unknown types will throw an exception. + */ + private def calcRowSizeBits(dt: DataType, nullValueCalc: Boolean): Option[Int] = dt match { + case StructType(fields) => + sumRowSizesBits(fields.map(_.dataType), nullValueCalc) + case dt: DecimalType if dt.precision > DType.DECIMAL64_MAX_PRECISION => + if (nullValueCalc) { + throw new IllegalArgumentException(s"Found an unsupported type $dt") + } else { + None + } + case _: NumericType | DateType | TimestampType | BooleanType | NullType => + Some(GpuColumnVector.getNonNestedRapidsType(dt).getSizeInBytes * 8 + 1) + case StringType | BinaryType | ArrayType(_, _) if nullValueCalc => + // Single offset value and a validity value + Some((DType.INT32.getSizeInBytes * 8) + 1) + case x if nullValueCalc => + throw new IllegalArgumentException(s"Found an unsupported type $x") + case _ => None + } +} + +/** + * JoinGatherer for a single map/table + */ +class JoinGathererImpl( + private val gatherMap: LazySpillableGatherMap, + private val data: LazySpillableColumnarBatch) extends JoinGatherer { + + // How much of the gather map we have output so far + private var gatheredUpTo: Long = 0 + private val totalRows: Long = gatherMap.getRowCount + private val (fixedWidthRowSizeBits, nullRowSizeBits) = { + val dts = data.dataTypes + val fw = JoinGathererImpl.fixedWidthRowSizeBits(dts) + val nullVal = JoinGathererImpl.nullRowSizeBits(dts) + (fw, nullVal) + } + + override def realCheapPerRowSizeEstimate: Double = { + val totalInputRows: Int = data.numRows + val totalInputSize: Long = data.deviceMemorySize + // Avoid divide by 0 here and later on + if (totalInputRows > 0 && totalInputSize > 0) { + totalInputSize.toDouble / totalInputRows + } else { + 1.0 + } + } + + override def getFixedWidthBitSize: Option[Int] = fixedWidthRowSizeBits + + override def gatherNext(n: Int): ColumnarBatch = { + val start = gatheredUpTo + assert((start + n) <= totalRows) + val ret = withResource(gatherMap.toColumnView(start, n)) { gatherView => + val batch = data.getBatch + val gatheredTable = withResource(GpuColumnVector.from(batch)) { table => + table.gather(gatherView) + } + withResource(gatheredTable) { gt => + GpuColumnVector.from(gt, GpuColumnVector.extractTypes(batch)) + } + } + gatheredUpTo += n + ret + } + + override def isDone: Boolean = + gatheredUpTo >= totalRows + + override def numRowsLeft: Long = totalRows - gatheredUpTo + + override def allowSpilling(): Unit = { + data.allowSpilling() + gatherMap.allowSpilling() + } + + override def getBitSizeMap(n: Int): ColumnView = { + val cb = data.getBatch + val inputBitCounts = withResource(GpuColumnVector.from(cb)) { table => + withResource(table.rowBitCount()) { bits => + bits.castTo(DType.INT64) + } + } + // Gather the bit counts so we know what the output table will look like + val gatheredBitCount = withResource(inputBitCounts) { inputBitCounts => + withResource(gatherMap.toColumnView(gatheredUpTo, n)) { gatherView => + // Gather only works on a table so wrap the single column + val gatheredTab = withResource(new Table(inputBitCounts)) { table => + table.gather(gatherView) + } + withResource(gatheredTab) { gatheredTab => + gatheredTab.getColumn(0).incRefCount() + } + } + } + // The gather could have introduced nulls in the case of outer joins. Because of that + // we need to replace them with an appropriate size + if (gatheredBitCount.hasNulls) { + withResource(gatheredBitCount) { gatheredBitCount => + withResource(Scalar.fromLong(nullRowSizeBits.toLong)) { nullSize => + withResource(gatheredBitCount.isNull) { nullMask => + nullMask.ifElse(nullSize, gatheredBitCount) + } + } + } + } else { + gatheredBitCount + } + } + + override def close(): Unit = { + gatherMap.close() + data.close() + } +} + +/** + * Join Gatherer for a left table and a right table + */ +case class MultiJoinGather(left: JoinGatherer, right: JoinGatherer) extends JoinGatherer { + assert(left.numRowsLeft == right.numRowsLeft, + "all gatherers much have the same number of rows to gather") + + override def gatherNext(n: Int): ColumnarBatch = { + withResource(left.gatherNext(n)) { leftGathered => + withResource(right.gatherNext(n)) { rightGathered => + val vectors = Seq(leftGathered, rightGathered).flatMap { batch => + (0 until batch.numCols()).map { i => + val col = batch.column(i) + col.asInstanceOf[GpuColumnVector].incRefCount() + col + } + }.toArray + new ColumnarBatch(vectors, n) + } + } + } + + override def isDone: Boolean = left.isDone + + override def numRowsLeft: Long = left.numRowsLeft + + override def allowSpilling(): Unit = { + left.allowSpilling() + right.allowSpilling() + } + + override def realCheapPerRowSizeEstimate: Double = + left.realCheapPerRowSizeEstimate + right.realCheapPerRowSizeEstimate + + override def getBitSizeMap(n: Int): ColumnView = { + (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { + case (Some(l), Some(r)) => + // This should never happen because all fixed width should be covered by + // a faster code path. But just in case we provide it anyways. + withResource(GpuScalar.from(l.toLong + r.toLong, LongType)) { s => + ai.rapids.cudf.ColumnVector.fromScalar(s, n) + } + case (Some(l), None) => + withResource(GpuScalar.from(l.toLong, LongType)) { ls => + withResource(right.getBitSizeMap(n)) { rightBits => + ls.add(rightBits, DType.INT64) + } + } + case (None, Some(r)) => + withResource(GpuScalar.from(r.toLong, LongType)) { rs => + withResource(left.getBitSizeMap(n)) { leftBits => + rs.add(leftBits, DType.INT64) + } + } + case _ => + withResource(left.getBitSizeMap(n)) { leftBits => + withResource(right.getBitSizeMap(n)) { rightBits => + leftBits.add(rightBits, DType.INT64) + } + } + } + } + + override def getFixedWidthBitSize: Option[Int] = { + (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { + case (Some(l), Some(r)) => Some(l + r) + case _ => None + } + } + + override def close(): Unit = { + left.close() + right.close() + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index 5e6ba7a7fb3..42c63a70141 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -15,7 +15,7 @@ */ package org.apache.spark.sql.rapids.execution -import ai.rapids.cudf.{ColumnView, DeviceMemoryBuffer, DType, GatherMap, NvtxColor, NvtxRange, OrderByArg, Scalar, Table} +import ai.rapids.cudf.{GatherMap, NvtxColor, Table} import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.RapidsBuffer.SpillCallback @@ -136,475 +136,22 @@ object GpuHashJoin extends Arm { } /** - * Generic trait for all join gather instances. - * All instances should be spillable. - * The life cycle of this assumes that when it is created that the data and - * gather maps will be used shortly. - * If you are not going to use these for a while, like when returning from an iterator, - * then allowSpilling should be called so that the cached data is released and spilling - * can be allowed. If you need/want to use the data again, just start using it, and it - * will be cached yet again until allowSpilling is called. - * When you are completely done with this object call close on it. + * An iterator that does a hash join against a stream of batches. */ -trait JoinGatherer extends AutoCloseable with Arm { - /** - * Gather the next n rows from the join gather maps. - * @param n how many rows to gather - * @return the gathered data as a ColumnarBatch - */ - def gatherNext(n: Int): ColumnarBatch - - /** - * Is all of the data gathered so far. - */ - def isDone: Boolean - - /** - * Number of rows left to gather - */ - def numRowsLeft: Long - - /** - * Indicate that we are done messing with the data for now and it can be spilled. - */ - def allowSpilling(): Unit - - /** - * A really fast and dirty way to estimate the size of each row in the join output - */ - def realCheapPerRowSizeEstimate: Double - - /** - * Get the bit count size map for the next n rows to be gathered. The returned value is - * an INT64 for each row in the n rows requested. - */ - def getBitSizeMap(n: Int): ColumnView - - /** - * If the data is all fixed width return the size of each row, otherwise return None. - */ - def getFixedWidthBitSize: Option[Int] - - /** - * Do a complete/expensive job to get the number of rows that can be gathered to get close - * to the targetSize for the final output. - */ - def gatherRowEstimate(targetSize: Long): Int = { - val bitSizePerRow = getFixedWidthBitSize - if (bitSizePerRow.isDefined) { - Math.min(Math.min((targetSize/bitSizePerRow.get) / 8, numRowsLeft), Integer.MAX_VALUE).toInt - } else { - // WARNING magic number below. The rowEstimateMultiplier is arbitrary, we want to get - // enough rows that we include that we go over the target size, but not too much so we - // waste memory. It could probably be tuned better. - val rowEstimateMultiplier = 1.1 - val estimatedRows = Math.min( - ((targetSize / realCheapPerRowSizeEstimate) * rowEstimateMultiplier).toLong, - numRowsLeft) - val numRowsToProbe = Math.min(estimatedRows, Integer.MAX_VALUE).toInt - if (numRowsToProbe <= 0) { - 1 - } else { - val sum = withResource(getBitSizeMap(numRowsToProbe)) { bitSizes => - bitSizes.prefixSum() - } - val cutoff = withResource(sum) { sum => - withResource(new Table(sum)) { sumTable => - withResource(ai.rapids.cudf.ColumnVector.fromLongs(targetSize * 8)) { bound => - withResource(new Table(bound)) { boundTab => - sumTable.lowerBound(boundTab, OrderByArg.asc(0)) - } - } - } - } - withResource(cutoff) { cutoff => - withResource(cutoff.copyToHost()) { hostCutoff => - Math.max(1, hostCutoff.getInt(0)) - } - } - } - } - } -} - -/** - * Holds a columnar batch that is cached until it is marked that it can be spilled. - */ -class LazySpillableColumnarBatch( - cb: ColumnarBatch, - spillCallback: SpillCallback, - name: String) extends AutoCloseable with Arm { - - private var cached: Option[ColumnarBatch] = Some(GpuColumnVector.incRefCounts(cb)) - private var spill: Option[SpillableColumnarBatch] = None - val numRows: Int = cb.numRows() - val deviceMemorySize: Long = GpuColumnVector.getTotalDeviceMemoryUsed(cb) - val dataTypes: Array[DataType] = GpuColumnVector.extractTypes(cb) - val numCols: Int = dataTypes.length - - def getBatch: ColumnarBatch = { - if (cached.isEmpty) { - withResource(new NvtxRange("get batch " + name, NvtxColor.RED)) { _ => - cached = Some(spill.get.getColumnarBatch()) - } - } - cached.get - } - - def allowSpilling(): Unit = { - if (spill.isEmpty && cached.isDefined) { - withResource(new NvtxRange("spill batch " + name, NvtxColor.RED)) { _ => - // First time we need to allow for spilling - spill = Some(SpillableColumnarBatch(cached.get, - SpillPriorities.ACTIVE_ON_DECK_PRIORITY, - spillCallback)) - // Putting data in a SpillableColumnarBatch takes ownership of it. - cached = None - } - } - cached.foreach(_.close()) - cached = None - } - - override def close(): Unit = { - cached.foreach(_.close()) - cached = None - spill.foreach(_.close()) - spill = None - } -} - -class LazySpillableGatherMap( - map: GatherMap, - spillCallback: SpillCallback, - name: String) extends AutoCloseable with Arm { - - val getRowCount: Long = map.getRowCount - - private var cached: Option[DeviceMemoryBuffer] = Some(map.releaseBuffer()) - private var spill: Option[SpillableBuffer] = None - - def toColumnView(startRow: Long, numRows: Int): ColumnView = { - ColumnView.fromDeviceBuffer(getBuffer, startRow * 4L, DType.INT32, numRows) - } - - private def getBuffer = { - if (cached.isEmpty) { - withResource(new NvtxRange("get map " + name, NvtxColor.RED)) { _ => - cached = Some(spill.get.getDeviceBuffer()) - } - } - cached.get - } - - def allowSpilling(): Unit = { - if (spill.isEmpty && cached.isDefined) { - withResource(new NvtxRange("spill map " + name, NvtxColor.RED)) { _ => - // First time we need to allow for spilling - spill = Some(SpillableBuffer(cached.get, - SpillPriorities.ACTIVE_ON_DECK_PRIORITY, - spillCallback)) - // Putting data in a SpillableBuffer takes ownership of it. - cached = None - } - } - cached.foreach(_.close()) - cached = None - } - - override def close(): Unit = { - cached.foreach(_.close()) - cached = None - spill.foreach(_.close()) - spill = None - } -} - -object JoinGathererImpl { - - /** - * Calculate the row size in bits for a fixed width schema. If a type is encountered that is - * not fixed width, or is not known a None is returned. - */ - def fixedWidthRowSizeBits(dts: Seq[DataType]): Option[Int] = - sumRowSizesBits(dts, nullValueCalc = false) - - /** - * Calculate the null row size for a given schema in bits. If an unexpected type is enountered - * an exception is thrown - */ - def nullRowSizeBits(dts: Seq[DataType]): Int = - sumRowSizesBits(dts, nullValueCalc = true).get - - - /** - * Sum the row sizes for each data type passed in. If any one of the sizes is not available - * the entire result is considered to not be available. If nullValueCalc is true a result is - * guaranteed to be returned or an exception thrown. - */ - private def sumRowSizesBits(dts: Seq[DataType], nullValueCalc: Boolean): Option[Int] = { - val allOptions = dts.map(calcRowSizeBits(_, nullValueCalc)) - if (allOptions.exists(_.isEmpty)) { - None - } else { - Some(allOptions.map(_.get).sum + 1) - } - } - - /** - * Calculate the row bit size for the given data type. If nullValueCalc is false - * then variable width types and unexpected types will result in a None being returned. - * If it is true variable width types will have a value returned that corresponds to a - * null, and unknown types will throw an exception. - */ - private def calcRowSizeBits(dt: DataType, nullValueCalc: Boolean): Option[Int] = dt match { - case StructType(fields) => - sumRowSizesBits(fields.map(_.dataType), nullValueCalc) - case dt: DecimalType if dt.precision > DType.DECIMAL64_MAX_PRECISION => - if (nullValueCalc) { - throw new IllegalArgumentException(s"Found an unsupported type $dt") - } else { - None - } - case _: NumericType | DateType | TimestampType | BooleanType | NullType => - Some(GpuColumnVector.getNonNestedRapidsType(dt).getSizeInBytes * 8 + 1) - case StringType | BinaryType | ArrayType(_, _) if nullValueCalc => - // Single offset value and a validity value - Some((DType.INT32.getSizeInBytes * 8) + 1) - case x if nullValueCalc => - throw new IllegalArgumentException(s"Found an unsupported type $x") - case _ => None - } -} - -/** - * JoinGatherer for a single map/table - */ -class JoinGathererImpl( - private val gatherMap: LazySpillableGatherMap, - private val data: LazySpillableColumnarBatch, - private val closeData: Boolean) extends JoinGatherer { - - // How much of the gather map we have output so far - private var gatheredUpTo: Long = 0 - private val totalRows: Long = gatherMap.getRowCount - private val (fixedWidthRowSizeBits, nullRowSizeBits) = { - val dts = data.dataTypes - val fw = JoinGathererImpl.fixedWidthRowSizeBits(dts) - val nullVal = JoinGathererImpl.nullRowSizeBits(dts) - (fw, nullVal) - } - - override def realCheapPerRowSizeEstimate: Double = { - val totalInputRows: Int = data.numRows - val totalInputSize: Long = data.deviceMemorySize - // Avoid divide by 0 here and later on - if (totalInputRows > 0 && totalInputSize > 0) { - totalInputSize.toDouble / totalInputRows - } else { - 1.0 - } - } - - override def getFixedWidthBitSize: Option[Int] = fixedWidthRowSizeBits - - override def gatherNext(n: Int): ColumnarBatch = { - val start = gatheredUpTo - assert((start + n) <= totalRows) - val ret = withResource(gatherMap.toColumnView(start, n)) { gatherView => - val batch = data.getBatch - val gatheredTable = withResource(GpuColumnVector.from(batch)) { table => - table.gather(gatherView) - } - withResource(gatheredTable) { gt => - GpuColumnVector.from(gt, GpuColumnVector.extractTypes(batch)) - } - } - gatheredUpTo += n - ret - } - - override def isDone: Boolean = - gatheredUpTo >= totalRows - - override def numRowsLeft: Long = totalRows - gatheredUpTo - - override def allowSpilling(): Unit = { - data.allowSpilling() - gatherMap.allowSpilling() - } - - override def getBitSizeMap(n: Int): ColumnView = { - val cb = data.getBatch - val inputBitCounts = withResource(GpuColumnVector.from(cb)) { table => - withResource(table.rowBitCount()) { bits => - bits.castTo(DType.INT64) - } - } - // Gather the bit counts so we know what the output table will look like - val gatheredBitCount = withResource(inputBitCounts) { inputBitCounts => - withResource(gatherMap.toColumnView(gatheredUpTo, n)) { gatherView => - // Gather only works on a table so wrap the single column - val gatheredTab = withResource(new Table(inputBitCounts)) { table => - table.gather(gatherView) - } - withResource(gatheredTab) { gatheredTab => - gatheredTab.getColumn(0).incRefCount() - } - } - } - // The gather could have introduced nulls in the case of outer joins. Because of that - // we need to replace them with an appropriate size - if (gatheredBitCount.hasNulls) { - withResource(gatheredBitCount) { gatheredBitCount => - withResource(Scalar.fromLong(nullRowSizeBits.toLong)) { nullSize => - withResource(gatheredBitCount.isNull) { nullMask => - nullMask.ifElse(nullSize, gatheredBitCount) - } - } - } - } else { - gatheredBitCount - } - } - - override def close(): Unit = { - gatherMap.close() - if (closeData) { - data.close() - } else { - data.allowSpilling() - } - } -} - -/** - * Join Gatherer for a left table and a right table - */ -case class MultiJoinGather(left: JoinGatherer, right: JoinGatherer) extends JoinGatherer { - assert(left.numRowsLeft == right.numRowsLeft, - "all gatherers much have the same number of rows to gather") - - override def gatherNext(n: Int): ColumnarBatch = { - withResource(left.gatherNext(n)) { leftGathered => - withResource(right.gatherNext(n)) { rightGathered => - val vectors = Seq(leftGathered, rightGathered).flatMap { batch => - (0 until batch.numCols()).map { i => - val col = batch.column(i) - col.asInstanceOf[GpuColumnVector].incRefCount() - col - } - }.toArray - new ColumnarBatch(vectors, n) - } - } - } - - override def isDone: Boolean = left.isDone - - override def numRowsLeft: Long = left.numRowsLeft - - override def allowSpilling(): Unit = { - left.allowSpilling() - right.allowSpilling() - } - - override def realCheapPerRowSizeEstimate: Double = - left.realCheapPerRowSizeEstimate + right.realCheapPerRowSizeEstimate - - override def getBitSizeMap(n: Int): ColumnView = { - (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { - case (Some(l), Some(r)) => - // This should never happen because all fixed width should be covered by - // a faster code path. But just in case we provide it anyways. - withResource(GpuScalar.from(l.toLong + r.toLong, LongType)) { s => - ai.rapids.cudf.ColumnVector.fromScalar(s, n) - } - case (Some(l), None) => - withResource(GpuScalar.from(l.toLong, LongType)) { ls => - withResource(right.getBitSizeMap(n)) { rightBits => - ls.add(rightBits, DType.INT64) - } - } - case (None, Some(r)) => - withResource(GpuScalar.from(r.toLong, LongType)) { rs => - withResource(left.getBitSizeMap(n)) { leftBits => - rs.add(leftBits, DType.INT64) - } - } - case _ => - withResource(left.getBitSizeMap(n)) { leftBits => - withResource(right.getBitSizeMap(n)) { rightBits => - leftBits.add(rightBits, DType.INT64) - } - } - } - } - - override def getFixedWidthBitSize: Option[Int] = { - (left.getFixedWidthBitSize, right.getFixedWidthBitSize) match { - case (Some(l), Some(r)) => Some(l + r) - case _ => None - } - } - - override def close(): Unit = { - left.close() - right.close() - } -} - -object JoinGatherer extends Arm { - def apply(gatherMap: LazySpillableGatherMap, - inputData: LazySpillableColumnarBatch, - closeData: Boolean): JoinGatherer = - new JoinGathererImpl(gatherMap, inputData, closeData) - - def apply(leftMap: LazySpillableGatherMap, - leftData: LazySpillableColumnarBatch, - closeLeftData: Boolean, - rightMap: LazySpillableGatherMap, - rightData: LazySpillableColumnarBatch, - closeRightData: Boolean): JoinGatherer = { - val left = JoinGatherer(leftMap, leftData, closeLeftData) - val right = JoinGatherer(rightMap, rightData, closeRightData) - MultiJoinGather(left, right) - } - - def getRowsInNextBatch(gatherer: JoinGatherer, targetSize: Long): Int = { - withResource(new NvtxRange("calc gather size", NvtxColor.YELLOW)) { _ => - val rowsLeft = gatherer.numRowsLeft - val rowEstimate: Long = gatherer.getFixedWidthBitSize match { - case Some(fixedSize) => - // Odd corner cases for tests, make sure we do at least one row - Math.max(1, (targetSize / fixedSize) / 8) - case None => - // Heuristic to see if we need to do the expensive calculation - if (rowsLeft * gatherer.realCheapPerRowSizeEstimate <= targetSize * 0.75) { - rowsLeft - } else { - gatherer.gatherRowEstimate(targetSize) - } - } - Math.min(Math.min(rowEstimate, rowsLeft), Integer.MAX_VALUE).toInt - } - } -} - class HashJoinIterator( inputBuiltKeys: ColumnarBatch, inputBuiltData: ColumnarBatch, - val stream: Iterator[ColumnarBatch], + private val stream: Iterator[ColumnarBatch], val boundStreamKeys: Seq[Expression], val boundStreamData: Seq[Expression], val streamAttributes: Seq[Attribute], val targetSize: Long, val joinType: JoinType, val buildSide: GpuBuildSide, - val spillCallback: SpillCallback, - streamTime: GpuMetric, - joinTime: GpuMetric, - totalTime: GpuMetric) extends Iterator[ColumnarBatch] with Arm { + private val spillCallback: SpillCallback, + private val streamTime: GpuMetric, + private val joinTime: GpuMetric, + private val totalTime: GpuMetric) extends Iterator[ColumnarBatch] with Arm { import scala.collection.JavaConverters._ // For some join types even if there is no stream data we might output something @@ -613,11 +160,11 @@ class HashJoinIterator( private var gathererStore: Option[JoinGatherer] = None // Close the input keys, the lazy spillable batch now owns it. private val builtKeys = withResource(inputBuiltKeys) { inputBuiltKeys => - new LazySpillableColumnarBatch(inputBuiltKeys, spillCallback, "build_keys") + LazySpillableColumnarBatch(inputBuiltKeys, spillCallback, "build_keys") } // Close the input data, the lazy spillable batch now owns it. private val builtData = withResource(inputBuiltData) { inputBuiltData => - new LazySpillableColumnarBatch(inputBuiltData, spillCallback, "build_data") + LazySpillableColumnarBatch(inputBuiltData, spillCallback, "build_data") } private var closed = false @@ -659,37 +206,28 @@ class HashJoinIterator( maps: Array[GatherMap], leftData: LazySpillableColumnarBatch, rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { - // The joiner should own/close the data that is on the stream side - // the build side is owned by the iterator. - val (joinerOwnsLeftData, joinerOwnsRightData) = buildSide match { - case GpuBuildRight => (true, false) - case GpuBuildLeft => (false, true) - } try { val leftMap = maps.head - val rightMap = if (maps.length > 1) { + val rightMap = if (maps.length == 2) { if (rightData.numCols == 0) { // No data so don't both with it None } else { Some(maps(1)) } - } else { + } else if (maps.length == 1) { None + } else { + throw new IllegalStateException("Internal Error got more gather maps than expected.") } val gatherer = rightMap match { case None => - if (joinerOwnsRightData) { - rightData.close() - } - JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), - leftData, joinerOwnsLeftData) + rightData.close() + JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData) case Some(right) => - JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), - leftData, joinerOwnsLeftData, - new LazySpillableGatherMap(right, spillCallback, "right_map"), - rightData, joinerOwnsRightData) + JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData, + new LazySpillableGatherMap(right, spillCallback, "right_map"), rightData) } if (gatherer.isDone) { gatherer.close() @@ -757,8 +295,8 @@ class HashJoinIterator( streamCb: ColumnarBatch): Option[JoinGatherer] = { withResource(GpuProjectExec.project(streamCb, boundStreamKeys)) { streamKeys => withResource(GpuProjectExec.project(streamCb, boundStreamData)) { streamData => - joinGatherMap(buildKeys, buildData, - streamKeys, new LazySpillableColumnarBatch(streamData, spillCallback, "stream_data")) + joinGatherMap(buildKeys, LazySpillableColumnarBatch.spillOnly(buildData), + streamKeys, LazySpillableColumnarBatch(streamData, spillCallback, "stream_data")) } } } From 3872ed4447f001422bcb3f52c6f9d991b085564f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Sat, 1 May 2021 10:40:56 -0500 Subject: [PATCH 7/9] More code cleanup --- .../spark/rapids/GpuBoundAttribute.scala | 2 +- .../rapids/GpuShuffledHashJoinBase.scala | 4 +- .../nvidia/spark/rapids/JoinGatherer.scala | 11 ++- .../sql/rapids/execution/GpuHashJoin.scala | 73 ++++++++++--------- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala index c8101792d7e..3aed390246f 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBoundAttribute.scala @@ -99,7 +99,7 @@ case class GpuBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean batch.column(ordinal) match { case fb: GpuColumnVectorFromBuffer => // When doing a project we might re-order columns or do other things that make it - // so this no loner looks like the original contiguous buffer it came from + // so this no longer looks like the original contiguous buffer it came from // so to avoid it appearing to down stream processing as the same buffer we change // the type here. new GpuColumnVector(fb.dataType(), fb.getBase.incRefCount()) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala index 075f73dcf59..55cc69d47b8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala @@ -16,10 +16,9 @@ package com.nvidia.spark.rapids -import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.FullOuter import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution} import org.apache.spark.sql.execution.BinaryExecNode @@ -71,6 +70,7 @@ abstract class GpuShuffledHashJoinBase( val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val localBuildOutput: Seq[Attribute] = buildPlan.output streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala index 530b3d3cc78..3bf18c354d2 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/JoinGatherer.scala @@ -36,7 +36,10 @@ import org.apache.spark.sql.vectorized.ColumnarBatch trait LazySpillable extends AutoCloseable { /** - * Indicate that we are done messing with the data for now and it can be spilled. + * Indicate that we are done using the data for now and it can be spilled. + * + * This method should not have issues with being called multiple times without the data being + * accessed. */ def allowSpilling(): Unit } @@ -255,7 +258,7 @@ class LazySpillableColumnarBatchImpl( override def getBatch: ColumnarBatch = { if (cached.isEmpty) { withResource(new NvtxRange("get batch " + name, NvtxColor.RED)) { _ => - cached = Some(spill.get.getColumnarBatch()) + cached = spill.map(_.getColumnarBatch()) } } cached.get @@ -307,13 +310,13 @@ class LazySpillableGatherMap( private def getBuffer = { if (cached.isEmpty) { withResource(new NvtxRange("get map " + name, NvtxColor.RED)) { _ => - cached = Some(spill.get.getDeviceBuffer()) + cached = spill.map(_.getDeviceBuffer()) } } cached.get } - def allowSpilling(): Unit = { + override def allowSpilling(): Unit = { if (spill.isEmpty && cached.isDefined) { withResource(new NvtxRange("spill map " + name, NvtxColor.RED)) { _ => // First time we need to allow for spilling diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index 42c63a70141..ca0dd8f17c8 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -206,30 +206,31 @@ class HashJoinIterator( maps: Array[GatherMap], leftData: LazySpillableColumnarBatch, rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { + assert(maps.length > 0 && maps.length <= 2) try { val leftMap = maps.head - val rightMap = if (maps.length == 2) { + val rightMap = if (maps.length > 1) { if (rightData.numCols == 0) { - // No data so don't both with it + // No data so don't bother with it None } else { Some(maps(1)) } - } else if (maps.length == 1) { - None } else { - throw new IllegalStateException("Internal Error got more gather maps than expected.") + None } + val lazyLeftMap = new LazySpillableGatherMap(leftMap, spillCallback, "left_map") val gatherer = rightMap match { case None => rightData.close() - JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData) + JoinGatherer(lazyLeftMap, leftData) case Some(right) => - JoinGatherer(new LazySpillableGatherMap(leftMap, spillCallback, "left_map"), leftData, - new LazySpillableGatherMap(right, spillCallback, "right_map"), rightData) + val lazyRightMap = new LazySpillableGatherMap(right, spillCallback, "right_map") + JoinGatherer(lazyLeftMap, leftData, lazyRightMap, rightData) } if (gatherer.isDone) { + // Nothing matched... gatherer.close() None } else { @@ -240,7 +241,7 @@ class HashJoinIterator( } } - private def joinGatherMapLeftRight( + private def joinGathererLeftRight( leftKeys: Table, leftData: LazySpillableColumnarBatch, rightKeys: Table, @@ -264,38 +265,38 @@ class HashJoinIterator( } } - private def joinGatherMapLeftRight( + private def joinGathererLeftRight( leftKeys: ColumnarBatch, leftData: LazySpillableColumnarBatch, rightKeys: ColumnarBatch, rightData: LazySpillableColumnarBatch): Option[JoinGatherer] = { withResource(GpuColumnVector.from(leftKeys)) { leftKeysTab => withResource(GpuColumnVector.from(rightKeys)) { rightKeysTab => - joinGatherMapLeftRight(leftKeysTab, leftData, rightKeysTab, rightData) + joinGathererLeftRight(leftKeysTab, leftData, rightKeysTab, rightData) } } } - private def joinGatherMap( + private def joinGatherer( buildKeys: ColumnarBatch, buildData: LazySpillableColumnarBatch, streamKeys: ColumnarBatch, streamData: LazySpillableColumnarBatch): Option[JoinGatherer] = { buildSide match { case GpuBuildLeft => - joinGatherMapLeftRight(buildKeys, buildData, streamKeys, streamData) + joinGathererLeftRight(buildKeys, buildData, streamKeys, streamData) case GpuBuildRight => - joinGatherMapLeftRight(streamKeys, streamData, buildKeys, buildData) + joinGathererLeftRight(streamKeys, streamData, buildKeys, buildData) } } - private def joinGatherMap( + private def joinGatherer( buildKeys: ColumnarBatch, buildData: LazySpillableColumnarBatch, streamCb: ColumnarBatch): Option[JoinGatherer] = { withResource(GpuProjectExec.project(streamCb, boundStreamKeys)) { streamKeys => withResource(GpuProjectExec.project(streamCb, boundStreamData)) { streamData => - joinGatherMap(buildKeys, LazySpillableColumnarBatch.spillOnly(buildData), + joinGatherer(buildKeys, LazySpillableColumnarBatch.spillOnly(buildData), streamKeys, LazySpillableColumnarBatch(streamData, spillCallback, "stream_data")) } } @@ -313,12 +314,12 @@ class HashJoinIterator( gathererStore = None withResource(stream.next()) { cb => streamTime += (System.nanoTime() - startTime) - gathererStore = joinGatherMap(builtKeys.getBatch, builtData, cb) + gathererStore = joinGatherer(builtKeys.getBatch, builtData, cb) } nextCb = nextCbFromGatherer() } else if (initialJoin) { withResource(GpuColumnVector.emptyBatch(streamAttributes.asJava)) { cb => - gathererStore = joinGatherMap(builtKeys.getBatch, builtData, cb) + gathererStore = joinGatherer(builtKeys.getBatch, builtData, cb) } nextCb = nextCbFromGatherer() } else { @@ -449,7 +450,7 @@ trait GpuHashJoin extends GpuExec { * * 2. After this we will do the join. We can produce multiple batches from a single * pair of input batches. The output of this stage is called the intermediate output and is the - * data columns each side of the join smashed together. + * data columns from each side of the join smashed together. * * 3. In some cases there is a condition that filters out data from the join that should not be * included. In the CPU code the condition will operate on the intermediate output. In some cases @@ -463,8 +464,8 @@ trait GpuHashJoin extends GpuExec { "Join keys from two sides should have same types") val (leftData, remappedLeftOutput, rightData, remappedRightOutput) = joinType match { case FullOuter | RightOuter | LeftOuter => - // We cannot dedupe anything here because the we can get nulls in the key columns - // at least one side + // We cannot dedupe anything here because we can get nulls in the key columns on + // at least one side, so they do not match (left.output, left.output, right.output, right.output) case LeftSemi | LeftAnti => // These only need the keys from the right hand side, in fact there should only be keys on @@ -508,8 +509,6 @@ trait GpuHashJoin extends GpuExec { } } - val localBuildOutput: Seq[Attribute] = buildPlan.output - def doJoin( builtBatch: ColumnarBatch, stream: Iterator[ColumnarBatch], @@ -522,6 +521,7 @@ trait GpuHashJoin extends GpuExec { joinTime: GpuMetric, filterTime: GpuMetric, totalTime: GpuMetric): Iterator[ColumnarBatch] = { + // The 10k is mostly for tests, hopefully no one is setting anything that low in production. val realTarget = Math.max(targetSize, 10 * 1024) val (builtKeys, builtData) = { @@ -552,22 +552,25 @@ trait GpuHashJoin extends GpuExec { val condition = boundCondition.get joinIterator.flatMap { cb => joinOutputRows += cb.numRows() - val tmp = GpuFilter(cb, condition, numOutputRows, numOutputBatches, filterTime) - if (tmp.numRows == 0) { - // Not sure if there is a better way to work around this - numOutputBatches.set(numOutputBatches.value - 1) - tmp.close() - None - } else { - Some(GpuProjectExec.projectAndClose(tmp, boundFinal, NoopMetric)) + withResource( + GpuFilter(cb, condition, numOutputRows, numOutputBatches, filterTime)) { filtered => + if (filtered.numRows == 0) { + // Not sure if there is a better way to work around this + numOutputBatches.set(numOutputBatches.value - 1) + None + } else { + Some(GpuProjectExec.project(filtered, boundFinal)) + } } } } else { joinIterator.map { cb => - joinOutputRows += cb.numRows() - numOutputRows += cb.numRows() - numOutputBatches += 1 - GpuProjectExec.projectAndClose(cb, boundFinal, NoopMetric) + withResource(cb) { cb => + joinOutputRows += cb.numRows() + numOutputRows += cb.numRows() + numOutputBatches += 1 + GpuProjectExec.project(cb, boundFinal) + } } } } From 50f25807579a20809501ffb88f19ea7d76134823 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 3 May 2021 08:46:41 -0500 Subject: [PATCH 8/9] Addressed review comments --- .../rapids/shims/spark300/GpuBroadcastHashJoinExec.scala | 4 ++-- .../rapids/shims/spark301/GpuBroadcastHashJoinExec.scala | 4 ++-- .../rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala | 4 ++-- .../rapids/shims/spark311/GpuBroadcastHashJoinExec.scala | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala index c62d417e158..e9930f5f48a 100644 --- a/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala +++ b/shims/spark300/src/main/scala/com/nvidia/spark/rapids/shims/spark300/GpuBroadcastHashJoinExec.scala @@ -94,8 +94,6 @@ case class GpuBroadcastHashJoinExec( right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ - private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) - override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -145,6 +143,8 @@ case class GpuBroadcastHashJoinExec( val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() diff --git a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala index ffc51cc8ff2..c3ac7040458 100644 --- a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/GpuBroadcastHashJoinExec.scala @@ -92,8 +92,6 @@ case class GpuBroadcastHashJoinExec( right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ - private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) - override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -143,6 +141,8 @@ case class GpuBroadcastHashJoinExec( val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() diff --git a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala index b906663f95a..92f63db58c1 100644 --- a/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala +++ b/shims/spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuBroadcastHashJoinExec.scala @@ -91,8 +91,6 @@ case class GpuBroadcastHashJoinExec( right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ - private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) - override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -142,6 +140,8 @@ case class GpuBroadcastHashJoinExec( val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() diff --git a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala index 204e0cc06e1..0b81f0354e0 100644 --- a/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala +++ b/shims/spark311/src/main/scala/com/nvidia/spark/rapids/shims/spark311/GpuBroadcastHashJoinExec.scala @@ -96,8 +96,6 @@ case class GpuBroadcastHashJoinExec( right: SparkPlan) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ - private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) - override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -147,6 +145,8 @@ case class GpuBroadcastHashJoinExec( val spillCallback = GpuMetric.makeSpillCallback(allMetrics) + val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) + val broadcastRelation = broadcastExchange .executeColumnarBroadcast[SerializeConcatHostBuffersDeserializeBatch]() From 464a313b8568030327d83bc9dadf8f1ccc9681fc Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 3 May 2021 11:32:18 -0500 Subject: [PATCH 9/9] Cleanup and fixes --- .../rapids/GpuShuffledHashJoinBase.scala | 3 +- .../sql/rapids/execution/GpuHashJoin.scala | 52 +++++-------------- 2 files changed, 14 insertions(+), 41 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala index 55cc69d47b8..f9ee1c62d15 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinBase.scala @@ -33,8 +33,6 @@ abstract class GpuShuffledHashJoinBase( val isSkewJoin: Boolean) extends BinaryExecNode with GpuHashJoin { import GpuMetric._ - private [this] lazy val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) - override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( @@ -69,6 +67,7 @@ abstract class GpuShuffledHashJoinBase( val joinTime = gpuLongMetric(JOIN_TIME) val filterTime = gpuLongMetric(FILTER_TIME) val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS) + val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf) val spillCallback = GpuMetric.makeSpillCallback(allMetrics) val localBuildOutput: Seq[Attribute] = buildPlan.output diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala index 64c89552507..fa3255e0810 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuHashJoin.scala @@ -102,19 +102,16 @@ object GpuHashJoin extends Arm { } /** - * Filter rows from the batch where all of the keys are null. + * Filter rows from the batch where any of the keys are null. */ def filterNulls(cb: ColumnarBatch, boundKeys: Seq[Expression]): ColumnarBatch = { var mask: ai.rapids.cudf.ColumnVector = null try { withResource(GpuProjectExec.project(cb, boundKeys)) { keys => val keyColumns = GpuColumnVector.extractBases(keys) - // to remove a row all of the key columns must be null for that row - // If there is even one key column with no nulls in it, don't filter anything - // we do this by leaving mask as null - if (keyColumns.forall(_.hasNulls)) { - keyColumns.foreach { column => - withResource(column.isNull) { nn => + keyColumns.foreach { column => + if (column.hasNulls) { + withResource(column.isNotNull) { nn => if (mask == null) { mask = nn.incRefCount() } else { @@ -156,12 +153,8 @@ object GpuHashJoin extends Arm { * yet. */ def anyNullableStructChild(expressions: Seq[Expression]): Boolean = { - System.err.println(s"LOOKING FOR NULLABLE STRUCT CHILDREN IN " + - s"${expressions.map(_.dataType).toArray.toSeq}") def anyNullableChild(struct: StructType): Boolean = { - System.err.println(s"CHECK FOR NULLABLE CHILDREN $struct") - val ret = struct.fields.exists { field => - System.err.println(s"IS NULLABLE FIELD $field? ${field.nullable}") + struct.fields.exists { field => if (field.nullable) { true } else field.dataType match { @@ -170,18 +163,13 @@ object GpuHashJoin extends Arm { case _ => false } } - System.err.println(s"HAS NULLABLE CHILDREN $struct? $ret") - ret } - val ret = expressions.map(_.dataType).exists { + expressions.map(_.dataType).exists { case st: StructType => anyNullableChild(st) case _ => false } - System.err.println(s"NULLABLE STRUCT CHILDREN IN " + - s"${expressions.map(_.dataType).toArray.toSeq}? $ret") - ret } } @@ -363,16 +351,9 @@ class HashJoinIterator( // Need to refill the gatherer gathererStore.foreach(_.close()) gathererStore = None - val filtered = withResource(stream.next()) { cb => + withResource(stream.next()) { cb => streamTime += (System.nanoTime() - startTime) - if (compareNullsEqual) { // TODO need some checks for nullability... - GpuHashJoin.filterNulls(cb, boundStreamKeys) - } else { - GpuColumnVector.incRefCounts(cb) - } - } - withResource(filtered) { filtered => - gathererStore = joinGatherer(builtKeys.getBatch, builtData, filtered) + gathererStore = joinGatherer(builtKeys.getBatch, builtData, cb) } nextCb = nextCbFromGatherer() } else if (initialJoin) { @@ -497,11 +478,8 @@ trait GpuHashJoin extends GpuExec { // struct keys with nullable children. Non-nested keys can also be correctly processed with // compareNullsEqual = true, because we filter all null records from build table before join. // For some details, please refer the issue: https://github.com/NVIDIA/spark-rapids/issues/2126 - protected lazy val compareNullsEqual: Boolean = { - val ret = (joinType != FullOuter) && GpuHashJoin.anyNullableStructChild(buildKeys) - System.err.println(s"SHOULD NULLS BE EQUAL $joinType => $ret") - ret - } + protected lazy val compareNullsEqual: Boolean = (joinType != FullOuter) && + GpuHashJoin.anyNullableStructChild(buildKeys) /** * Spark does joins rather simply. They do it row by row, and as such don't really worry @@ -596,12 +574,10 @@ trait GpuHashJoin extends GpuExec { // Filtering nulls on the build side is a workaround. // 1) For a performance issue in LeftSemi and LeftAnti joins // https://github.com/rapidsai/cudf/issues/7300 - // 2) As a work around to Struct joins with nullable children no doing the right thing + // 2) As a work around to Struct joins with nullable children // see https://github.com/NVIDIA/spark-rapids/issues/2126 for more info - val builtAnyNullable = compareNullsEqual || joinType == LeftSemi || joinType == LeftAnti - - System.err.println(s"SHOULD FILTER NULL KEYS FOR BUILT TABLE? $builtAnyNullable " + - s"${builtBatch.numRows()}") + val builtAnyNullable = (compareNullsEqual || joinType == LeftSemi || joinType == LeftAnti) && + buildKeys.exists(_.nullable) val cb = if (builtAnyNullable) { GpuHashJoin.filterNulls(builtBatch, boundBuildKeys) @@ -609,8 +585,6 @@ trait GpuHashJoin extends GpuExec { GpuColumnVector.incRefCounts(builtBatch) } - System.err.println(s"AFTER NULL FILTER (IF NEEDED) ${cb.numRows()}") - withResource(cb) { cb => closeOnExcept(GpuProjectExec.project(cb, boundBuildKeys)) { builtKeys => (builtKeys, GpuProjectExec.project(cb, boundBuildData))