From c14b199d13fab316841900459dac7353eaabcce9 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 15 Jul 2024 20:05:54 -0700
Subject: [PATCH] Use new conversion code

---
 src/main/cpp/src/decimal_utils.cu | 121 ++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 8 deletions(-)
diff --git a/src/main/cpp/src/decimal_utils.cu b/src/main/cpp/src/decimal_utils.cu
index 504d58233b..a95cdac480 100644
--- a/src/main/cpp/src/decimal_utils.cu
+++ b/src/main/cpp/src/decimal_utils.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -28,6 +29,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/cmath>
+
 #include <cmath>
 #include <cstddef>
 
@@ -1202,11 +1205,110 @@ FloatingType __device__ fix_before_round(FloatingType floating, int const exp10)
   return std::nextafter(floating, direction);
 }
 
-template <typename FloatingType>
+using namespace numeric;
+using namespace numeric::detail;
+
+/**
+ * @brief Perform floating-point -> integer decimal conversion, matching Spark
+ *
+ * @tparam Rep The type of integer we are converting to, to store the decimal value
+ * @tparam FloatingType The type of floating-point object we are converting from
+ * @param floating The floating point value to convert
+ * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale
+ * @return Integer representation of the floating-point value, given the desired scale
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE Rep convert_floating_to_integral_SPARK_RAPIDS(FloatingType floating,
+                                                               scale_type const& scale)
+{
+  // The rounding and precision decisions made here are chosen to match Apache Spark.
+  // Spark wants to perform the conversion as double to have the most precision.
+  // However, the behavior is still slightly different if the original type was float.
+
+  // Extract components of the (double-ized) floating point number
+  using converter        = floating_converter<double>;
+  auto const integer_rep = converter::bit_cast_to_integer(double(floating));
+  if (converter::is_zero(integer_rep)) { return 0; }
+
+  // Note that the significand here is an unsigned integer with sizeof(double)
+  auto const is_negative                  = converter::get_is_negative(integer_rep);
+  auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep);
+
+  // Spark often wants to round the last decimal place, so we'll perform the conversion
+  // with one lower power of 10 so that we can (optionally) round at the end.
+  auto const pow10          = static_cast<int>(scale);
+  auto const shifting_pow10 = pow10 - 1;
+
+  // Sometimes add half a bit to correct for compiler rounding text to nearest floating-point value.
+  // See comments in add_half_if_truncates(), with differences detailed below.
+  // Even if we don't add the bit, shift bits to line up with what the shifting algorithm is
+  // expecting.
+  bool const is_whole_number     = (cuda::std::floor(floating) == floating);
+  auto const [base2_value, pow2] = [is_whole_number](auto significand, auto floating_pow2) {
+    if constexpr (cuda::std::is_same_v<FloatingType, double>) {
+      // Add the 1/2 bit regardless of truncation, but still not for whole numbers
+      auto const base2_value =
+        (significand << 1) + static_cast<decltype(significand)>(!is_whole_number);
+      return std::make_pair(base2_value, floating_pow2 - 1);
+    } else {
+      // Input was float: never add 1/2 bit.
+      // Why? Because we converted to double, and the 1/2 bit beyond float is WAY too large compared
+      // to double's precision. And the 1/2 bit beyond double is not due to user input.
+      return std::make_pair(significand << 1, floating_pow2 - 1);
+    }
+  }(significand, floating_pow2);
+
+  // Main algorithm: Apply the powers of 2 and 10 (except for the last power-of-10)
+  auto magnitude =
+    convert_floating_to_integral_shifting<Rep, double>(base2_value, shifting_pow10, pow2);
+
+  // Spark wants to floor the last digits of the output, clearing data that was beyond the
+  // precision that was available in double.
+  // How many digits do we need to floor?
+  // From the decimal digit corresponding to pow2 (just past double precision) to the end (pow10).
+  // The conversion from pow2 to pow10 is log10(2), which is ~ 90/299 (close enough for ints)
+  int const floor_pow10 = (90 * pow2) / 299 - pow10;
+  if (floor_pow10 < 0) {
+    // Truncated: The scale factor cut off the extra, imprecise bits.
+    // To round to the final decimal place, add 5 to one past the last decimal place
+    magnitude += 5U;
+    magnitude /= 10U;  // Apply the last power of 10
+  } else {
+    // We are keeping decimal digits with data beyond the precision of double
+    // We want to truncate these digits, but sometimes we want to round first
+    // We will round if and only if we didn't already add a half-bit earlier
+    if constexpr (cuda::std::is_same_v<FloatingType, double>) {
+      // For doubles, only round the extra digits of whole numbers
+      // If it was not a whole number, we already added 1/2 a bit at higher precision than this
+      // earlier.
+      if (is_whole_number) {
+        magnitude += multiply_power10<Rep>(decltype(magnitude)(5), floor_pow10);
+      }
+    } else {
+      // Input was float: we didn't add a half-bit earlier, so round at the edge of precision here.
+      magnitude += multiply_power10<Rep>(decltype(magnitude)(5), floor_pow10);
+    }
+
+    // +1: Divide the last power-of-10 that we postponed earlier to do rounding.
+    auto const truncated = divide_power10<Rep>(magnitude, floor_pow10 + 1);
+    magnitude            = multiply_power10<Rep>(truncated, floor_pow10);
+  }
+
+  // Reapply the sign and return
+  // NOTE: Cast can overflow!
+  auto const signed_magnitude = static_cast<Rep>(magnitude);
+  return is_negative ? -signed_magnitude : signed_magnitude;
+}
+
+template <typename FloatingType, typename IntType>
 FloatingType __device__ scaled_round(FloatingType floating, int const exp10)
 {
-  auto const scale_factor = std::pow(10, exp10);
-  return std::round(scale_factor * fix_before_round(floating, exp10));
+  //  auto const scale_factor = std::pow(10, exp10);
+  //  return std::round(scale_factor * fix_before_round(floating, exp10));
+  return static_cast<FloatingType>(convert_floating_to_integral_SPARK_RAPIDS<IntType, FloatingType>(
+    fix_before_round(floating, exp10), numeric::scale_type{-exp10}));
 }
 
 struct float_to_decimal_fn {
@@ -1219,10 +1321,10 @@ struct float_to_decimal_fn {
                   int32_t precision,
                   rmm::cuda_stream_view stream) const
   {
-    if constexpr ((std::is_same_v<FloatType, float> || std::is_same_v<FloatType, double>)&&(
-                    std::is_same_v<DecimalType, numeric::decimal32> ||
-                    std::is_same_v<DecimalType, numeric::decimal64> ||
-                    std::is_same_v<DecimalType, numeric::decimal128>)) {
+    if constexpr ((std::is_same_v<FloatType, float> || std::is_same_v<FloatType, double>) &&
+                  (std::is_same_v<DecimalType, numeric::decimal32> ||
+                   std::is_same_v<DecimalType, numeric::decimal64> ||
+                   std::is_same_v<DecimalType, numeric::decimal128>)) {
       using DecimalRepType = cudf::device_storage_type_t<DecimalType>;
 
       // Exclusive bound
@@ -1261,7 +1363,10 @@ struct float_to_decimal_fn {
                           //        scaled_rounded,
                           //        scale * std::nextafter(static_cast<double>(x), direction));
 #endif
-          auto const scaled_rounded = scaled_round<double>(static_cast<double>(x), decimal_places);
+          //          auto const scaled_rounded =
+          //          scaled_round<double>(static_cast<double>(x), decimal_places);
+          auto const scaled_rounded =
+            scaled_round<double, DecimalType::rep>(static_cast<double>(x), decimal_places);
 
           auto const is_out_of_bound =
             (min_ex_bound >= scaled_rounded) || (scaled_rounded >= max_ex_bound);