
Add support for ReplicateRows #4388

Merged · 6 commits · Dec 21, 2021
Changes from 3 commits
1 change: 1 addition & 0 deletions docs/configs.md
@@ -265,6 +265,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.RegExpExtract"></a>spark.rapids.sql.expression.RegExpExtract|`regexp_extract`|RegExpExtract|false|This is disabled by default because the implementation is not 100% compatible. See the compatibility guide for more information.|
<a name="sql.expression.RegExpReplace"></a>spark.rapids.sql.expression.RegExpReplace|`regexp_replace`|RegExpReplace support for string literal input patterns|false|This is disabled by default because the implementation is not 100% compatible. See the compatibility guide for more information.|
<a name="sql.expression.Remainder"></a>spark.rapids.sql.expression.Remainder|`%`, `mod`|Remainder or modulo|true|None|
<a name="sql.expression.ReplicateRows"></a>spark.rapids.sql.expression.ReplicateRows| |Given an input row, replicates the row N times|true|None|
<a name="sql.expression.Rint"></a>spark.rapids.sql.expression.Rint|`rint`|Rounds up a double value to the nearest double equal to an integer|true|None|
<a name="sql.expression.Round"></a>spark.rapids.sql.expression.Round|`round`|Round an expression to d decimal places using HALF_UP rounding mode|true|None|
<a name="sql.expression.RowNumber"></a>spark.rapids.sql.expression.RowNumber|`row_number`|Window function that returns the index for the row within the aggregation window|true|None|
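Like the other expression configs in this table, the new entry can be toggled at runtime. A minimal sketch (assuming a running SparkSession named `spark`; not part of this diff):

    // Disable the GPU implementation of ReplicateRows; it is enabled by default.
    spark.conf.set("spark.rapids.sql.expression.ReplicateRows", "false")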
47 changes: 47 additions & 0 deletions docs/supported_ops.md
@@ -10119,6 +10119,53 @@ are limited.
<th>UDT</th>
</tr>
<tr>
<td rowSpan="2">ReplicateRows</td>
<td rowSpan="2"> </td>
<td rowSpan="2">Given an input row, replicates the row N times</td>
<td rowSpan="2">None</td>
<td rowSpan="2">project</td>
<td>input</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S</td>
<td><em>PS<br/>UTC is only supported TZ for TIMESTAMP</em></td>
<td>S</td>
<td><em>PS<br/>max DECIMAL precision of 18</em></td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, MAP, UDT</em></td>
<td><b>NS</b></td>
<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, MAP, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
<td>result</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, MAP, UDT</em></td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td rowSpan="4">Rint</td>
<td rowSpan="4">`rint`</td>
<td rowSpan="4">Rounds up a double value to the nearest double equal to an integer</td>
17 changes: 17 additions & 0 deletions integration_tests/src/main/python/hash_aggregate_test.py
@@ -416,6 +416,23 @@ def test_hash_avg_nulls_partial_only(data_gen):
conf=_no_nans_float_conf_partial
)

@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimal, ids=idfn)
def test_intersectAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100)),
conf=allow_negative_scale_of_decimal_conf)

@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans_with_decimal, ids=idfn)
def test_exceptAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b')),
conf=allow_negative_scale_of_decimal_conf)

@approximate_float
@ignore_order(local=True)
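ReplicateRows has no SQL function of its own; Spark's optimizer rewrites `intersectAll` and `exceptAll` into an aggregation followed by a Generate of ReplicateRows, which is why the two new tests above exercise the GPU expression. A minimal spark-shell sketch (illustrative only, not part of the PR):

    val df1 = Seq(1, 1, 2).toDF("a")
    val df2 = Seq(1, 1, 3).toDF("a")
    // The physical plan shows a Generate over replicate_rows(...), which the
    // plugin can now replace with GpuGenerateExec and GpuReplicateRows.
    df1.intersectAll(df2).explain()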
@@ -24,11 +24,11 @@ import com.nvidia.spark.rapids.shims.v2.ShimUnaryExecNode
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, Generator}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, Generator, ReplicateRows}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.execution.{GenerateExec, SparkPlan}
import org.apache.spark.sql.rapids.GpuCreateArray
import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, MapType, StructType}
import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, MapType, StructField, StructType}
import org.apache.spark.sql.vectorized.ColumnarBatch

class GpuGenerateExecSparkPlanMeta(
@@ -68,6 +68,22 @@ abstract class GeneratorExprMeta[INPUT <: Generator](
val supportOuter: Boolean = false
}

/**
 * Base class for metadata around `ReplicateRows`. Concrete subclasses build the
 * GPU expression from the already-converted child expressions; see the
 * `expr[ReplicateRows]` rule in GpuOverrides later in this diff.
 */
abstract class ReplicateRowsExprMeta[INPUT <: ReplicateRows](
gen: INPUT,
conf: RapidsConf,
parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule)
extends GeneratorExprMeta[INPUT](gen, conf, parent, rule) {

override final def convertToGpu(): GpuExpression =
convertToGpu(childExprs.map(_.convertToGpu()))

def convertToGpu(childExprs: Seq[Expression]): GpuExpression
}

/**
 * GPU overrides of `Generator`, cooperating with `GpuGenerateExec`.
*/
@@ -164,6 +180,63 @@ trait GpuGenerator extends GpuUnevaluable {
}
}

case class GpuReplicateRows(children: Seq[Expression]) extends GpuGenerator {

override def elementSchema: StructType =
StructType(children.tail.zipWithIndex.map {
case (e, index) => StructField(s"col$index", e.dataType)
})
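  // Note: `children.tail` above skips the first child, which is the replication
  // count; the remaining children are the columns of the row being replicated.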

  override def generate(inputBatch: ColumnarBatch,
      generatorOffset: Int,
      outer: Boolean): ColumnarBatch = {

    val schema = GpuColumnVector.extractTypes(inputBatch)
    val vectors = GpuColumnVector.extractBases(inputBatch)
    // The generator column holds the per-row replication counts.
    val replicateVector = vectors(generatorOffset)

    withResource(GpuColumnVector.from(inputBatch)) { table =>
      withResource(table.repeat(replicateVector)) { replicatedTable =>
        GpuColumnVector.from(replicatedTable, schema)
      }
    }
  }
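  // Illustration of cudf's Table.repeat semantics: for a column [a, b, c] with
  // counts [2, 0, 3], the repeated column is [a, a, c, c, c] and the output has
  // sum(counts) rows in total.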

  override def inputSplitIndices(inputBatch: ColumnarBatch,
      generatorOffset: Int,
      outer: Boolean,
      targetSizeBytes: Long): Array[Int] = {
    val vectors = GpuColumnVector.extractBases(inputBatch)
    val inputRows = inputBatch.numRows()
    if (inputRows == 0) return Array()

    // Calculate the number of rows that need to be replicated. Here we find the mean of the
    // generator column. Multiplying the mean by the size of the projected columns gives us
    // the approximate memory required.
    val meanOutputRows = math.ceil(vectors(generatorOffset).mean().getDouble)
    val estimatedOutputRows = meanOutputRows * inputRows

    // input size of the columns to be repeated
    val repeatColsInputSize = vectors.slice(0, generatorOffset).map(_.getDeviceMemorySize).sum
    // estimated total output size
    val estimatedOutputSizeBytes = repeatColsInputSize * estimatedOutputRows / inputRows

    // how many splits we need to keep the output size under the target size
    val numSplitsForTargetSize = math.ceil(estimatedOutputSizeBytes / targetSizeBytes).toInt
    // how many splits we need to keep the output row count under Int.MaxValue
    val numSplitsForTargetRow = math.ceil(estimatedOutputRows / Int.MaxValue).toInt
    // how many splits we need for replicateRows to work safely
    val numSplits = numSplitsForTargetSize max numSplitsForTargetRow

    if (numSplits == 0) Array()
    else GpuBatchUtils.generateSplitIndices(inputRows, numSplits)
}
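  // Worked example for the estimate above, with illustrative numbers:
  // 1,000,000 input rows, a mean replication count of 4, and 80 MiB of input
  // data to repeat give an estimatedOutputSizeBytes of about 320 MiB; with a
  // 128 MiB target batch size that is ceil(320 / 128) = 3 splits.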
}

abstract class GpuExplodeBase extends GpuUnevaluableUnaryExpression with GpuGenerator {

/** The position of an element within the collection should also be returned. */
@@ -3104,6 +3104,22 @@ object GpuOverrides extends Logging {
override val supportOuter: Boolean = true
override def convertToGpu(): GpuExpression = GpuPosExplode(childExprs.head.convertToGpu())
}),
    expr[ReplicateRows](
      "Given an input row, replicates the row N times",
ExprChecks.projectOnly(
        // Spark optimizes the plan to run a HashAggregate on the rows to be replicated, and
        // HashAggregateExec doesn't support DECIMAL 128 yet, so this currently supports only
        // DECIMAL 64.
TypeSig.ARRAY.nested(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_64 +
TypeSig.ARRAY + TypeSig.STRUCT),
TypeSig.ARRAY.nested(TypeSig.all),
repeatingParamCheck = Some(RepeatingParamCheck("input",
(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_64
+ TypeSig.ARRAY + TypeSig.STRUCT).nested(),
TypeSig.all))),
(a, conf, p, r) => new ReplicateRowsExprMeta[ReplicateRows](a, conf, p, r) {
override def convertToGpu(childExpr: Seq[Expression]): GpuExpression =
GpuReplicateRows(childExpr)
}),
expr[CollectList](
"Collect a list of non-unique elements, not supported in reduction",
// GpuCollectList is not yet supported in Reduction context.