Add in support for Decimal divide #1592

Merged: 4 commits, merged Feb 5, 2021
Changes from 1 commit
8 changes: 4 additions & 4 deletions docs/supported_ops.md
@@ -4437,7 +4437,7 @@ Accelerator support is described below.
<td> </td>
<td> </td>
<td> </td>
<td><b>NS</b></td>
<td>S*</td>
<td> </td>
<td> </td>
<td> </td>
@@ -4458,7 +4458,7 @@ Accelerator support is described below.
<td> </td>
<td> </td>
<td> </td>
<td><b>NS</b></td>
<td>S*</td>
<td> </td>
<td> </td>
<td> </td>
@@ -4479,7 +4479,7 @@ Accelerator support is described below.
<td> </td>
<td> </td>
<td> </td>
<td><b>NS</b></td>
<td><em>PS* (Because of Spark's inner workings the full range of decimal precision (even for 64-bit values) is not supported.)</em></td>
<td> </td>
<td> </td>
<td> </td>
@@ -9168,7 +9168,7 @@ Accelerator support is described below.
<td> </td>
<td> </td>
<td> </td>
<td><em>PS* (Because of Spark's inner workings the full range of decimal precision (even for 64-bit value) is not supported.)</em></td>
<td><em>PS* (Because of Spark's inner workings the full range of decimal precision (even for 64-bit values) is not supported.)</em></td>
<td> </td>
<td> </td>
<td> </td>
13 changes: 11 additions & 2 deletions integration_tests/src/main/python/arithmetic_ops_test.py
@@ -69,7 +69,7 @@ def test_multiplication_mixed(lhs, rhs):
f.col('a') * f.col('b')),
conf=allow_negative_scale_of_decimal_conf)

@pytest.mark.parametrize('data_gen', numeric_gens, ids=idfn)
@pytest.mark.parametrize('data_gen', [double_gen, decimal_gen_neg_scale, DecimalGen(6, 3), DecimalGen(5, 5), DecimalGen(6, 0)], ids=idfn)
def test_division(data_gen):
data_type = data_gen.data_type
assert_gpu_and_cpu_are_equal_collect(
@@ -78,7 +78,16 @@ def test_division(data_gen):
f.lit(-12).cast(data_type) / f.col('b'),
f.lit(None).cast(data_type) / f.col('a'),
f.col('b') / f.lit(None).cast(data_type),
f.col('a') / f.col('b')))
f.col('a') / f.col('b')),
conf=allow_negative_scale_of_decimal_conf)

@pytest.mark.parametrize('lhs', [DecimalGen(5, 3), DecimalGen(4, 2), DecimalGen(1, -2)], ids=idfn)
@pytest.mark.parametrize('rhs', [DecimalGen(4, 1)], ids=idfn)
def test_division_mixed(lhs, rhs):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : two_col_df(spark, lhs, rhs).select(
f.col('a') / f.col('b')),
conf=allow_negative_scale_of_decimal_conf)

@pytest.mark.parametrize('data_gen', integral_gens + [decimal_gen_default, decimal_gen_scale_precision,
decimal_gen_same_scale_precision, decimal_gen_64bit], ids=idfn)
@@ -1410,7 +1410,7 @@ object GpuOverrides {
"Multiplication",
ExprChecks.binaryProjectNotLambda(TypeSig.numeric +
TypeSig.psNote(TypeEnum.DECIMAL, "Because of Spark's inner workings the full range " +
"of decimal precision (even for 64-bit value) is not supported."),
"of decimal precision (even for 64-bit values) is not supported."),
TypeSig.numeric,
("lhs", TypeSig.numeric, TypeSig.numeric),
("rhs", TypeSig.numeric, TypeSig.numeric)),
@@ -1614,10 +1614,42 @@ object GpuOverrides {
expr[Divide](
"Division",
ExprChecks.binaryProjectNotLambda(
TypeSig.DOUBLE, TypeSig.DOUBLE + TypeSig.DECIMAL,
("lhs", TypeSig.DOUBLE, TypeSig.DOUBLE + TypeSig.DECIMAL),
("rhs", TypeSig.DOUBLE, TypeSig.DOUBLE + TypeSig.DECIMAL)),
TypeSig.DOUBLE +
TypeSig.psNote(TypeEnum.DECIMAL, "Because of Spark's inner workings the full range " +
"of decimal precision (even for 64-bit values) is not supported."),
TypeSig.DOUBLE + TypeSig.DECIMAL,
("lhs", TypeSig.DOUBLE + TypeSig.DECIMAL, TypeSig.DOUBLE + TypeSig.DECIMAL),
("rhs", TypeSig.DOUBLE + TypeSig.DECIMAL, TypeSig.DOUBLE + TypeSig.DECIMAL)),
(a, conf, p, r) => new BinaryExprMeta[Divide](a, conf, p, r) {
override def tagExprForGpu(): Unit = {
// Division of Decimal types is a little odd. Spark will cast the inputs
// to a common wider value where scale is max of the two input scales, and precision is
// max of the two input non-scale portions + the new scale. Then it will do the divide,
// for which the rules are a little complex, but lie about the result
// in the return type of the Divide operator. Then in CheckOverflow it will reset the
// scale and check the precision to make sure the value fits in the final desired result.
// We would like to avoid all of this if possible because having a temporary intermediate
// value that can have a scale quite a bit larger than the final result reduces the
// maximum precision that we could support, as we don't have unlimited precision. But
// sadly because of how the logical plan is compiled down to the physical plan we have
// lost what the original types were and cannot recover it. As such for now we are going
// to do what Spark does, but we have to recompute/recheck the temporary precision to be
// sure it will fit on the GPU. In addition to this, things are a little harder because
// the decimal divide itself will do rounding on the result before it is returned,
// effectively calculating an extra digit of precision. Because cudf does not support this
// right now we actually increase the scale (and corresponding precision) to get an extra
// decimal place so we can round it in GpuCheckOverflow.
(childExprs.head.dataType, childExprs(1).dataType) match {
case (l: DecimalType, r: DecimalType) =>
val intermediateResult = GpuDivideUtil.decimalDataType(l, r)
if (intermediateResult.precision > DType.DECIMAL64_MAX_PRECISION) {
willNotWorkOnGpu("The actual output precision of the divide is too large" +
s" to fit on the GPU $intermediateResult")
}
case _ => // NOOP
}
}

override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression =
GpuDivide(lhs, rhs)
}),
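To make the fallback check above concrete, here is a minimal standalone sketch (illustrative only, not plugin code) of the same math. It assumes cudf's DECIMAL64 limit of 18 digits of precision and mirrors the formula used by GpuDivideUtil.decimalDataType later in this PR.

object DecimalDivideCheckSketch {
  // A 64-bit decimal can hold at most 18 full decimal digits.
  val Decimal64MaxPrecision = 18

  // Spark's divide result rules, plus one extra digit of scale for the final rounding.
  def intermediate(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) = {
    val scale = math.max(6, s1 + p2 + 1) + 1
    val precision = p1 - s1 + s2 + scale
    (math.min(precision, 38), math.min(scale, 38))
  }

  def main(args: Array[String]): Unit = {
    // Decimal(6, 3) / Decimal(4, 1) -> intermediate (13, 9): fits in 18 digits, stays on the GPU.
    println(intermediate(6, 3, 4, 1))
    // Decimal(12, 6) / Decimal(12, 6) -> intermediate (32, 20): precision > 18, so willNotWorkOnGpu.
    println(intermediate(12, 6, 12, 6))
  }
}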
@@ -33,7 +33,7 @@ case class GpuCheckOverflow(child: Expression,
val expectedCudfScale = -dataType.scale
if (foundCudfScale == expectedCudfScale) {
base.incRefCount()
} else if (foundCudfScale < expectedCudfScale) {
} else if (-foundCudfScale < -expectedCudfScale) {
base.castTo(DType.create(DType.DTypeEnum.DECIMAL64, expectedCudfScale))
} else {
// need to round off
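A note on the comparison fix above: cudf stores a DECIMAL64 scale as the negation of the Spark scale, so comparing the raw (negative) cudf scales inverts the intended test. A minimal sketch of the corrected decision, with names chosen only for illustration:

// cudf scale = -(Spark scale); e.g. Spark Decimal(9, 4) maps to DECIMAL64 with cudf scale -4.
def needsRoundOff(foundCudfScale: Int, expectedCudfScale: Int): Boolean =
  foundCudfScale != expectedCudfScale && !(-foundCudfScale < -expectedCudfScale)

// Found Spark scale 2 (cudf -2), expected 4 (cudf -4): a widening cast is enough.
assert(!needsRoundOff(foundCudfScale = -2, expectedCudfScale = -4))
// Found Spark scale 6 (cudf -6), expected 4 (cudf -4): digits must be dropped, so round off.
assert(needsRoundOff(foundCudfScale = -6, expectedCudfScale = -4))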
@@ -197,7 +197,7 @@ trait GpuDivModLike extends CudfBinaryArithmetic {

override def doColumnar(lhs: GpuColumnVector, rhs: Scalar): ColumnVector = {
if (isScalarZero(rhs)) {
withResource(Scalar.fromNull(lhs.getBase.getType)) { nullScalar =>
withResource(Scalar.fromNull(outputType(lhs.getBase, rhs))) { nullScalar =>
ColumnVector.fromScalar(nullScalar, lhs.getRowCount.toInt)
}
} else {
@@ -206,13 +206,35 @@ trait GpuDivModLike extends CudfBinaryArithmetic {
}
}

object GpuDivideUtil {
def decimalDataType(l: DecimalType, r: DecimalType): DecimalType = {
// Spark does
// Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1)
// Scale: max(6, s1 + p2 + 1)
// But ... We need to do rounding at the end so we need one more than that.
val s = math.max(6, l.scale + r.precision + 1) + 1
val p = l.precision - l.scale + r.scale + s
// TODO once we have 128-bit decimal support we should match the config for precision loss.
DecimalType(math.min(p, 38), math.min(s, 38))
}
}

// This is for doubles and floats...
case class GpuDivide(left: Expression, right: Expression) extends GpuDivModLike {
override def inputType: AbstractDataType = TypeCollection(DoubleType, DecimalType)

override def symbol: String = "/"

override def binaryOp: BinaryOp = BinaryOp.TRUE_DIV

override def outputTypeOverride: DType =
GpuColumnVector.getNonNestedRapidsType(dataType)

// Override the output type as a special case for decimal
override def dataType: DataType = (left.dataType, right.dataType) match {
case (l: DecimalType, r: DecimalType) => GpuDivideUtil.decimalDataType(l, r)
case _ => super.dataType
}
}

case class GpuIntegralDivide(left: Expression, right: Expression) extends GpuDivModLike {
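A worked instance of how the wider intermediate type from GpuDivideUtil relates to the type Spark declares for the same divide, and to the rounding GpuCheckOverflow does afterwards. This sketch only assumes Spark's DecimalType is on the classpath; the values are chosen for illustration.

import org.apache.spark.sql.types.DecimalType

val l = DecimalType(6, 3)
val r = DecimalType(4, 1)

// Spark's declared result type for l / r:
val sparkScale = math.max(6, l.scale + r.precision + 1)                               // 8
val sparkType = DecimalType(l.precision - l.scale + r.scale + sparkScale, sparkScale) // Decimal(12, 8)

// Below the 38-digit cap, GpuDivideUtil.decimalDataType yields the same type with one extra
// digit of scale (and precision) so the GPU result can be rounded rather than truncated.
val gpuType = DecimalType(sparkType.precision + 1, sparkType.scale + 1)               // Decimal(13, 9)

// GpuCheckOverflow then rounds the scale-9 GPU result back down to the Decimal(12, 8) Spark expects.
println(s"Spark declares $sparkType, the GPU computes at $gpuType")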