From c8a171c74ae215dfc12079750eb98f795126c0f3 Mon Sep 17 00:00:00 2001
From: choekstra <choekstra@nvidia.com>
Date: Tue, 20 Jul 2021 02:07:34 +0000
Subject: [PATCH 001/112] Initial changes: add decimal128 alias and trait support

---
 cpp/include/cudf/fixed_point/fixed_point.hpp   | 9 ++++++---
 cpp/include/cudf/utilities/traits.hpp          | 4 +++-
 cpp/include/cudf/utilities/type_dispatcher.hpp | 3 ++-
 cpp/tests/fixed_point/fixed_point_tests.cu     | 2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index d7bc9e02eff..9c3e69c3ea8 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -48,7 +48,9 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 };
 template <typename T>
 constexpr inline auto is_supported_representation_type()
 {
-  return cuda::std::is_same<T, int32_t>::value || cuda::std::is_same<T, int64_t>::value;
+  return cuda::std::is_same<T, int32_t>::value ||
+         cuda::std::is_same<T, int64_t>::value ||
+         cuda::std::is_same<T, __int128_t>::value;
 }
 
 template <typename T>
@@ -750,8 +752,9 @@ CUDA_HOST_DEVICE_CALLABLE bool operator>(fixed_point<Rep1, Rad1> const& lhs,
   return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value;
 }
 
-using decimal32 = fixed_point<int32_t, Radix::BASE_10>;
-using decimal64 = fixed_point<int64_t, Radix::BASE_10>;
+using decimal32  = fixed_point<int32_t, Radix::BASE_10>;
+using decimal64  = fixed_point<int64_t, Radix::BASE_10>;
+using decimal128 = fixed_point<__int128_t, Radix::BASE_10>;
 
 /** @} */  // end of group
 }  // namespace numeric
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 2cdc455e05c..808fc45d07f 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -379,7 +379,9 @@ constexpr inline bool is_timestamp(data_type type)
 template <typename T>
 constexpr inline bool is_fixed_point()
 {
-  return std::is_same<numeric::decimal32, T>::value || std::is_same<numeric::decimal64, T>::value;
+  return std::is_same<numeric::decimal32, T>::value ||
+         std::is_same<numeric::decimal64, T>::value ||
+         std::is_same<numeric::decimal128, T>::value;
 }
 
 struct is_fixed_point_impl {
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index bd9ea015a32..48259d3ee0d 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -99,7 +99,8 @@ using id_to_type = typename id_to_type_impl<Id>::type;
 template <typename T>
 using device_storage_type_t =
   std::conditional_t<std::is_same<numeric::decimal32, T>::value, int32_t,
-  std::conditional_t<std::is_same<numeric::decimal64, T>::value, int64_t, T>>;
+  std::conditional_t<std::is_same<numeric::decimal64, T>::value, int64_t,
+  std::conditional_t<std::is_same<numeric::decimal128, T>::value, __int128_t, T>>>;
 // clang-format on
 
 /**
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 7244b913a6a..2627ab6d48d 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -43,7 +43,7 @@ template <typename T>
 struct FixedPointTestBothReps : public cudf::test::BaseFixture {
 };
 
-using RepresentationTypes = ::testing::Types<int32_t, int64_t>;
+using RepresentationTypes = ::testing::Types<int32_t, int64_t, __int128_t>;
 
 TYPED_TEST_CASE(FixedPointTestBothReps, RepresentationTypes);
 

From afe6ec6a3f3fdaa221a3fbfe902c3977d366a84b Mon Sep 17 00:00:00 2001
From: choekstra <choekstra@nvidia.com>
Date: Tue, 20 Jul 2021 06:38:45 +0000
Subject: [PATCH 002/112] More changes: add DECIMAL128 type_id and __int128_t plumbing

---
 .../cudf/column/column_device_view.cuh        | 18 ++++++++
 .../cudf/detail/aggregation/aggregation.cuh   | 42 ++++++++++++-------
 cpp/include/cudf/detail/copy_if.cuh           |  5 +++
 .../cudf/detail/utilities/device_atomics.cuh  |  2 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  | 29 +++++++------
 cpp/include/cudf/types.hpp                    |  1 +
 cpp/include/cudf/utilities/traits.hpp         |  4 +-
 .../cudf/utilities/type_dispatcher.hpp        | 15 ++++++-
 cpp/src/io/json/json_gpu.cu                   |  8 ++++
 cpp/src/jit/type.cpp                          |  1 +
 cpp/src/quantiles/quantile.cu                 |  4 +-
 cpp/src/round/round.cu                        |  3 +-
 cpp/src/scalar/scalar.cpp                     |  3 ++
 .../strings/convert/convert_fixed_point.cu    | 41 ++++++++++--------
 cpp/src/unary/math_ops.cu                     |  5 ++-
 15 files changed, 128 insertions(+), 53 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 02e3eee6b43..4639cb4f357 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -455,6 +455,24 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
     return decimal64{scaled_integer<int64_t>{data<int64_t>()[element_index], scale}};
   }
 
+  /**
+   * @brief Returns a `numeric::decimal128` element at the specified index for a `fixed_point`
+   * column.
+   *
+   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
+   * then any attempt to use the result will lead to undefined behavior.
+   *
+   * @param element_index Position of the desired element
+   * @return numeric::decimal128 representing the element at this index
+   */
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal128>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    using namespace numeric;
+    auto const scale = scale_type{_type.scale()};
+    return decimal128{scaled_integer<__int128_t>{data<__int128_t>()[element_index], scale}};
+  }
+
   /**
    * @brief For a given `T`, indicates if `column_device_view::element<T>()` has a valid overload.
    *
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 53c1f47c201..c64fba286d4 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -139,9 +139,11 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    using Target = target_type_t<Source, aggregation::MIN>;
-    atomicMin(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    if constexpr (not std::is_same<Source, __int128_t>::value) {
+      using Target = target_type_t<Source, aggregation::MIN>;
+      atomicMin(&target.element<Target>(target_index),
+                static_cast<Target>(source.element<Source>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -164,8 +166,10 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMin(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+      atomicMin(&target.element<DeviceTarget>(target_index),
+                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -185,9 +189,11 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    using Target = target_type_t<Source, aggregation::MAX>;
-    atomicMax(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    if constexpr (not std::is_same<Source, __int128_t>::value) {
+      using Target = target_type_t<Source, aggregation::MAX>;
+      atomicMax(&target.element<Target>(target_index),
+                static_cast<Target>(source.element<Source>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -210,8 +216,10 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicMax(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+      atomicMax(&target.element<DeviceTarget>(target_index),
+                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -231,9 +239,11 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    using Target = target_type_t<Source, aggregation::SUM>;
-    atomicAdd(&target.element<Target>(target_index),
-              static_cast<Target>(source.element<Source>(source_index)));
+    if constexpr (not std::is_same<Source, __int128_t>::value) {
+      using Target = target_type_t<Source, aggregation::SUM>;
+      atomicAdd(&target.element<Target>(target_index),
+                static_cast<Target>(source.element<Source>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -256,8 +266,10 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    atomicAdd(&target.element<DeviceTarget>(target_index),
-              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+      atomicAdd(&target.element<DeviceTarget>(target_index),
+                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 2051daec00b..372a2ece6de 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -223,6 +223,11 @@ struct DeviceType<T, std::enable_if_t<std::is_same<numeric::decimal64, T>::value
   using type = typename cudf::device_storage_type_t<T>;
 };
 
+template <typename T>
+struct DeviceType<T, std::enable_if_t<std::is_same<numeric::decimal128, T>::value>> {
+  using type = typename cudf::device_storage_type_t<T>;
+};
+
 // Dispatch functor which performs the scatter for fixed column types and gather for other
 template <typename Filter, int block_size>
 struct scatter_gather_functor {
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 6380e76fdfa..8e340408449 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -426,7 +426,7 @@ struct typesAtomicCASImpl<T, 8> {
  * @returns The old value at `address`
  */
 template <typename T, typename BinaryOp>
-typename std::enable_if_t<cudf::is_numeric<T>(), T> __forceinline__ __device__
+typename std::enable_if_t<cudf::is_numeric<T>() && not std::is_same<T, __int128_t>::value, T> __forceinline__ __device__
 genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
 {
   auto fun = cudf::detail::genericAtomicOperationImpl<T, BinaryOp>{};
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 9c3e69c3ea8..ccfd4a7aab4 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -550,19 +550,24 @@ class fixed_point {
    */
   explicit operator std::string() const
   {
-    if (_scale < 0) {
-      auto const av   = std::abs(_value);
-      int64_t const n = std::pow(10, -_scale);
-      int64_t const f = av % n;
-      auto const num_zeros =
-        std::max(0, (-_scale - static_cast<int32_t>(std::to_string(f).size())));
-      auto const zeros = std::string(num_zeros, '0');
-      auto const sign  = _value < 0 ? std::string("-") : std::string();
-      return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n);
-    } else {
-      auto const zeros = std::string(_scale, '0');
-      return std::to_string(_value) + zeros;
+    if constexpr (not std::is_same<Rep, __int128_t>::value) {
+      if (_scale < 0) {
+        auto const av   = std::abs(_value); 
+        int64_t const n = std::pow(10, -_scale);
+        int64_t const f = av % n;
+        auto const num_zeros =
+          std::max(0, (-_scale - static_cast<int32_t>(std::to_string(f).size())));
+        auto const zeros = std::string(num_zeros, '0');
+        auto const sign  = _value < 0 ? std::string("-") : std::string();
+        return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n);
+      } else {
+        auto const zeros = std::string(_scale, '0');
+        return std::to_string(_value) + zeros;
+      }
     }
+
+    // std::abs and std::to_string don't work on __int128_t
+    return "TODO";
   }
 };
 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index e1037efb5c8..f639c2dae6b 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -228,6 +228,7 @@ enum class type_id : int32_t {
   LIST,                    ///< List elements
   DECIMAL32,               ///< Fixed-point type with int32_t
   DECIMAL64,               ///< Fixed-point type with int64_t
+  DECIMAL128,              ///< Fixed-point type with __int128_t
   STRUCT,                  ///< Struct elements
   // `NUM_TYPE_IDS` must be last!
   NUM_TYPE_IDS  ///< Total number of type ids
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 808fc45d07f..8bae0d5d150 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -152,7 +152,7 @@ constexpr inline bool is_equality_comparable()
 template <typename T>
 constexpr inline bool is_numeric()
 {
-  return std::is_integral<T>::value or std::is_floating_point<T>::value;
+  return std::is_integral<T>::value or std::is_floating_point<T>::value || std::is_same<T, __int128_t>::value;
 }
 
 struct is_numeric_impl {
@@ -489,7 +489,7 @@ constexpr inline bool is_chrono(data_type type)
 template <typename T>
 constexpr bool is_rep_layout_compatible()
 {
-  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>();
+  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>() || std::is_same<T, __int128_t>::value;
 }
 
 /**
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 48259d3ee0d..1fe2692834a 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -114,6 +114,7 @@ inline type_id device_storage_type_id(type_id id)
   switch (id) {
     case type_id::DECIMAL32: return type_id::INT32;
     case type_id::DECIMAL64: return type_id::INT64;
+    // case type_id::DECIMAL128: return type_id::INT128; // TODO: avoid this (need for type_id::INT128)
     default: return id;
   }
 }
@@ -131,7 +132,8 @@ template <typename T>
 bool type_id_matches_device_storage_type(type_id id)
 {
   return (id == type_id::DECIMAL32 && std::is_same<T, int32_t>::value) ||
-         (id == type_id::DECIMAL64 && std::is_same<T, int64_t>::value) || id == type_to_id<T>();
+         (id == type_id::DECIMAL64 && std::is_same<T, int64_t>::value) ||
+         (id == type_id::DECIMAL128 && std::is_same<T, __int128_t>::value) || id == type_to_id<T>();
 }
 
 /**
@@ -189,6 +191,7 @@ CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32);
 CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST);
 CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32);
 CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64);
+CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128);
 CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT);
 
 /**
@@ -222,6 +225,7 @@ MAP_NUMERIC_SCALAR(int8_t)
 MAP_NUMERIC_SCALAR(int16_t)
 MAP_NUMERIC_SCALAR(int32_t)
 MAP_NUMERIC_SCALAR(int64_t)
+MAP_NUMERIC_SCALAR(__int128_t)
 MAP_NUMERIC_SCALAR(uint8_t)
 MAP_NUMERIC_SCALAR(uint16_t)
 MAP_NUMERIC_SCALAR(uint32_t)
@@ -254,6 +258,12 @@ struct type_to_scalar_type_impl<numeric::decimal64> {
   using ScalarDeviceType = cudf::fixed_point_scalar_device_view<numeric::decimal64>;
 };
 
+template <>
+struct type_to_scalar_type_impl<numeric::decimal128> {
+  using ScalarType       = cudf::fixed_point_scalar<numeric::decimal128>;
+  using ScalarDeviceType = cudf::fixed_point_scalar_device_view<numeric::decimal128>;
+};
+
 template <>  // TODO: this is a temporary solution for make_pair_iterator
 struct type_to_scalar_type_impl<cudf::dictionary32> {
   using ScalarType       = cudf::numeric_scalar<int32_t>;
@@ -495,6 +505,9 @@ CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher(cudf::data_ty
     case type_id::DECIMAL64:
       return f.template operator()<typename IdTypeMap<type_id::DECIMAL64>::type>(
         std::forward<Ts>(args)...);
+    case type_id::DECIMAL128:
+      return f.template operator()<typename IdTypeMap<type_id::DECIMAL128>::type>(
+        std::forward<Ts>(args)...);
     case type_id::STRUCT:
       return f.template operator()<typename IdTypeMap<type_id::STRUCT>::type>(
         std::forward<Ts>(args)...);
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index ba6bc30e0d4..9e8922211b2 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -285,6 +285,14 @@ __inline__ __device__ numeric::decimal64 decode_value(const char*,
   return numeric::decimal64{};
 }
 
+template <>
+__inline__ __device__ numeric::decimal128 decode_value(const char*,
+                                                      const char*,
+                                                      parse_options_view const&)
+{
+  return numeric::decimal128{};
+}
+
 /**
  * @brief Functor for converting plain text data to cuDF data type value.
  */
diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp
index 16894168b31..cf91932ca19 100644
--- a/cpp/src/jit/type.cpp
+++ b/cpp/src/jit/type.cpp
@@ -76,6 +76,7 @@ std::string get_type_name(data_type type)
     case type_id::STRUCT: return CUDF_STRINGIFY(Struct);
     case type_id::DECIMAL32: return CUDF_STRINGIFY(int32_t);
     case type_id::DECIMAL64: return CUDF_STRINGIFY(int64_t);
+    case type_id::DECIMAL128: return CUDF_STRINGIFY(__int128_t);
 
     default: break;
   }
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 25bf4a436ad..831e309961a 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -47,7 +47,7 @@ struct quantile_functor {
   rmm::mr::device_memory_resource* mr;
 
   template <typename T>
-  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>(),
+  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>() or std::is_same<T, numeric::decimal128>::value, // TODO 
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {
@@ -55,7 +55,7 @@ struct quantile_functor {
   }
 
   template <typename T>
-  std::enable_if_t<std::is_arithmetic<T>::value or cudf::is_fixed_point<T>(),
+  std::enable_if_t<std::is_arithmetic<T>::value or cudf::is_fixed_point<T>() and not std::is_same<T, numeric::decimal128>::value, // TODO
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index b8c48434f5c..88a1b93e088 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -74,7 +74,8 @@ int16_t __device__ generic_sign(T)
 template <typename T>
 constexpr inline auto is_supported_round_type()
 {
-  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) || cudf::is_fixed_point<T>();
+  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) ||
+         (cudf::is_fixed_point<T>() && not std::is_same<T, numeric::decimal128>::value);
 }
 
 template <typename T>
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 045bfbe0327..606cdab8368 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -209,6 +209,7 @@ typename fixed_point_scalar<T>::rep_type const* fixed_point_scalar<T>::data() co
  */
 template class fixed_point_scalar<numeric::decimal32>;
 template class fixed_point_scalar<numeric::decimal64>;
+template class fixed_point_scalar<numeric::decimal128>;
 
 namespace detail {
 
@@ -282,6 +283,7 @@ template class fixed_width_scalar<int8_t>;
 template class fixed_width_scalar<int16_t>;
 template class fixed_width_scalar<int32_t>;
 template class fixed_width_scalar<int64_t>;
+template class fixed_width_scalar<__int128_t>;
 template class fixed_width_scalar<uint8_t>;
 template class fixed_width_scalar<uint16_t>;
 template class fixed_width_scalar<uint32_t>;
@@ -340,6 +342,7 @@ template class numeric_scalar<int8_t>;
 template class numeric_scalar<int16_t>;
 template class numeric_scalar<int32_t>;
 template class numeric_scalar<int64_t>;
+template class numeric_scalar<__int128_t>;
 template class numeric_scalar<uint8_t>;
 template class numeric_scalar<uint16_t>;
 template class numeric_scalar<uint32_t>;
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 2f57b38249f..aee05419b67 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -206,15 +206,18 @@ struct decimal_to_string_size_fn {
 
     if (scale >= 0) return count_digits(value) + scale;
 
-    auto const abs_value = std::abs(value);
-    auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
-    auto const fraction  = count_digits(abs_value % exp_ten);
-    auto const num_zeros = std::max(0, (-scale - fraction));
-    return static_cast<int32_t>(value < 0) +    // sign if negative
-           count_digits(abs_value / exp_ten) +  // integer
-           1 +                                  // decimal point
-           num_zeros +                          // zeros padding
-           fraction;                            // size of fraction
+    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {
+      auto const abs_value = std::abs(value);
+      auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
+      auto const fraction  = count_digits(abs_value % exp_ten);
+      auto const num_zeros = std::max(0, (-scale - fraction));
+      return static_cast<int32_t>(value < 0) +    // sign if negative
+            count_digits(abs_value / exp_ten) +  // integer
+            1 +                                  // decimal point
+            num_zeros +                          // zeros padding
+            fraction;                            // size of fraction
+      } 
+    return 0;
   }
 };
 
@@ -247,18 +250,20 @@ struct decimal_to_string_fn {
     // write format:   [-]integer.fraction
     // where integer  = abs(value) / (10^abs(scale))
     //       fraction = abs(value) % (10^abs(scale))
-    auto const abs_value = std::abs(value);
-    if (value < 0) *d_buffer++ = '-';  // add sign
-    auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
-    auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
+    if constexpr (not std::is_same<DecimalType, __int128_t>::value) { // TODO fix
+      auto const abs_value = std::abs(value);
+      if (value < 0) *d_buffer++ = '-';  // add sign
+      auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
+      auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
 
-    d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
-    *d_buffer++ = '.';                                             // add decimal point
+      d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
+      *d_buffer++ = '.';                                             // add decimal point
 
-    thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; });  // add zeros
-    d_buffer += num_zeros;
+      thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; });  // add zeros
+      d_buffer += num_zeros;
 
-    integer_to_string(abs_value % exp_ten, d_buffer);  // add the fraction part
+      integer_to_string(abs_value % exp_ten, d_buffer);  // add the fraction part
+    }
   }
 };
 
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 0f8d9228310..3849b56d4eb 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -271,7 +271,10 @@ struct fixed_point_floor {
 template <typename T>
 struct fixed_point_abs {
   T n;
-  __device__ T operator()(T data) { return std::abs(data); }
+  __device__ T operator()(T data)
+  {
+    // std::abs has no __int128_t overload, so compute |data| by manual negation
+    return data < 0 ? -data : data;
+  }
 };
 
 template <typename T, template <typename> typename FixedPointFunctor>

From 43b615a6d0b4779343bce185d31203f147f3efe6 Mon Sep 17 00:00:00 2001
From: choekstra <choekstra@nvidia.com>
Date: Tue, 20 Jul 2021 06:52:58 +0000
Subject: [PATCH 003/112] Small cleanup: simplify is_rep_layout_compatible

---
 cpp/include/cudf/utilities/traits.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 8bae0d5d150..d7a297d63e7 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -489,7 +489,7 @@ constexpr inline bool is_chrono(data_type type)
 template <typename T>
 constexpr bool is_rep_layout_compatible()
 {
-  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>() || std::is_same<T, __int128_t>::value;
+  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>();
 }
 
 /**

From ebedcadb97eb8633e615fde9d087797cec2cf3dd Mon Sep 17 00:00:00 2001
From: choekstra <choekstra@nvidia.com>
Date: Tue, 20 Jul 2021 07:45:48 +0000
Subject: [PATCH 004/112] Small cleanup: revert atomic guard, use `or` in is_numeric

---
 cpp/include/cudf/detail/utilities/device_atomics.cuh | 2 +-
 cpp/include/cudf/utilities/traits.hpp                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 8e340408449..6380e76fdfa 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -426,7 +426,7 @@ struct typesAtomicCASImpl<T, 8> {
  * @returns The old value at `address`
  */
 template <typename T, typename BinaryOp>
-typename std::enable_if_t<cudf::is_numeric<T>() && not std::is_same<T, __int128_t>::value, T> __forceinline__ __device__
+typename std::enable_if_t<cudf::is_numeric<T>(), T> __forceinline__ __device__
 genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
 {
   auto fun = cudf::detail::genericAtomicOperationImpl<T, BinaryOp>{};
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index d7a297d63e7..b8255d4eb45 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -152,7 +152,7 @@ constexpr inline bool is_equality_comparable()
 template <typename T>
 constexpr inline bool is_numeric()
 {
-  return std::is_integral<T>::value or std::is_floating_point<T>::value || std::is_same<T, __int128_t>::value;
+  return std::is_integral<T>::value or std::is_floating_point<T>::value or std::is_same<T, __int128_t>::value;
 }
 
 struct is_numeric_impl {

From 1d2e0b4a3de4f02932325fd08fbf37868bce08fd Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 21 Jul 2021 00:02:21 +0000
Subject: [PATCH 005/112] Removal of device_storage_type_id, formatting and
 more

---
 cpp/include/cudf/detail/iterator.cuh          |  3 +--
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  5 ++---
 cpp/include/cudf/utilities/traits.hpp         |  6 +++---
 .../cudf/utilities/type_dispatcher.hpp        | 20 ++-----------------
 cpp/src/aggregation/aggregation.cu            |  3 +--
 5 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index deb161fd9c2..3e789299716 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -102,9 +102,8 @@ struct null_replaced_value_accessor {
                                bool has_nulls = true)
     : col{col}, null_replacement{null_val}, has_nulls{has_nulls}
   {
-    CUDF_EXPECTS(type_to_id<Element>() == device_storage_type_id(col.type().id()),
+    CUDF_EXPECTS(type_id_matches_device_storage_type<Element>(col.type().id()),
                  "the data type mismatch");
-    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
     if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
   }
 
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index ccfd4a7aab4..d195f976419 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -48,8 +48,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 };
 template <typename T>
 constexpr inline auto is_supported_representation_type()
 {
-  return cuda::std::is_same<T, int32_t>::value ||
-         cuda::std::is_same<T, int64_t>::value ||
+  return cuda::std::is_same<T, int32_t>::value || cuda::std::is_same<T, int64_t>::value ||
          cuda::std::is_same<T, __int128_t>::value;
 }
 
@@ -552,7 +551,7 @@ class fixed_point {
   {
     if constexpr (not std::is_same<Rep, __int128_t>::value) {
       if (_scale < 0) {
-        auto const av   = std::abs(_value); 
+        auto const av   = std::abs(_value);
         int64_t const n = std::pow(10, -_scale);
         int64_t const f = av % n;
         auto const num_zeros =
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index b8255d4eb45..402e2461da7 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -152,7 +152,8 @@ constexpr inline bool is_equality_comparable()
 template <typename T>
 constexpr inline bool is_numeric()
 {
-  return std::is_integral<T>::value or std::is_floating_point<T>::value or std::is_same<T, __int128_t>::value;
+  return std::is_integral<T>::value or std::is_floating_point<T>::value or
+         std::is_same<T, __int128_t>::value;
 }
 
 struct is_numeric_impl {
@@ -379,8 +380,7 @@ constexpr inline bool is_timestamp(data_type type)
 template <typename T>
 constexpr inline bool is_fixed_point()
 {
-  return std::is_same<numeric::decimal32, T>::value ||
-         std::is_same<numeric::decimal64, T>::value ||
+  return std::is_same<numeric::decimal32, T>::value || std::is_same<numeric::decimal64, T>::value ||
          std::is_same<numeric::decimal128, T>::value;
 }
 
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 1fe2692834a..f6c07eb25ca 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -98,27 +98,11 @@ using id_to_type = typename id_to_type_impl<Id>::type;
 // clang-format off
 template <typename T>
 using device_storage_type_t =
-  std::conditional_t<std::is_same<numeric::decimal32, T>::value, int32_t,
-  std::conditional_t<std::is_same<numeric::decimal64, T>::value, int64_t,
+  std::conditional_t<std::is_same<numeric::decimal32,  T>::value, int32_t,
+  std::conditional_t<std::is_same<numeric::decimal64,  T>::value, int64_t,
   std::conditional_t<std::is_same<numeric::decimal128, T>::value, __int128_t, T>>>;
 // clang-format on
 
-/**
- * @brief Returns the corresponding `type_id` of type stored on device for a given `type_id`
- *
- * @param id   The given `type_id`
- * @return     Corresponding `type_id` of type stored on device
- */
-inline type_id device_storage_type_id(type_id id)
-{
-  switch (id) {
-    case type_id::DECIMAL32: return type_id::INT32;
-    case type_id::DECIMAL64: return type_id::INT64;
-    // case type_id::DECIMAL128: return type_id::INT128; // TODO: avoid this (need for type_id::INT128)
-    default: return id;
-  }
-}
-
 /**
  * @brief Checks if `fixed_point`-like types have template type `T` matching the column's
  * stored type id
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index b9193345c94..5b48282cb5b 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -28,8 +28,7 @@ void initialize_with_identity(mutable_table_view& table,
   // kernel per column
   for (size_type i = 0; i < table.num_columns(); ++i) {
     auto col        = table.column(i);
-    auto const type = data_type{device_storage_type_id(col.type().id())};
-    dispatch_type_and_aggregation(type, aggs[i], identity_initializer{}, col, stream);
+    dispatch_type_and_aggregation(col.type(), aggs[i], identity_initializer{}, col, stream); // TODO SFINAE for decimal
   }
 }
 

From 2ea39fec28906be12be389b5bae29306efc3efa5 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 21 Jul 2021 01:22:55 +0000
Subject: [PATCH 006/112] Formatting

---
 .../cudf/column/column_device_view.cuh        | 14 ++++----
 cpp/src/aggregation/aggregation.cu            |  5 +--
 cpp/src/io/json/json_gpu.cu                   | 12 +++----
 cpp/src/quantiles/quantile.cu                 |  7 ++--
 cpp/src/round/round.cu                        |  4 +--
 .../strings/convert/convert_fixed_point.cu    | 34 +++++++++----------
 cpp/src/unary/math_ops.cu                     |  5 +--
 7 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 4639cb4f357..c1babff9df3 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -465,13 +465,13 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @param element_index Position of the desired element
    * @return numeric::decimal128 representing the element at this index
    */
-   template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal128>::value)>
-   __device__ T element(size_type element_index) const noexcept
-   {
-     using namespace numeric;
-     auto const scale = scale_type{_type.scale()};
-     return decimal128{scaled_integer<__int128_t>{data<__int128_t>()[element_index], scale}};
-   }
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal128>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    using namespace numeric;
+    auto const scale = scale_type{_type.scale()};
+    return decimal128{scaled_integer<__int128_t>{data<__int128_t>()[element_index], scale}};
+  }
 
   /**
    * @brief For a given `T`, indicates if `column_device_view::element<T>()` has a valid overload.
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 5b48282cb5b..b4d4b99b87a 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -27,8 +27,9 @@ void initialize_with_identity(mutable_table_view& table,
   // TODO: Initialize all the columns in a single kernel instead of invoking one
   // kernel per column
   for (size_type i = 0; i < table.num_columns(); ++i) {
-    auto col        = table.column(i);
-    dispatch_type_and_aggregation(col.type(), aggs[i], identity_initializer{}, col, stream); // TODO SFINAE for decimal
+    auto col = table.column(i);
+    dispatch_type_and_aggregation(
+      col.type(), aggs[i], identity_initializer{}, col, stream);  // TODO SFINAE for decimal
   }
 }
 
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index 9e8922211b2..244081ed286 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -63,9 +63,9 @@ __device__ std::pair<char const*, char const*> limit_range_to_brackets(char cons
   auto const data_begin = thrust::next(thrust::find_if(
     thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; }));
   auto const data_end   = thrust::next(thrust::find_if(thrust::seq,
-                                                     thrust::make_reverse_iterator(end),
-                                                     thrust::make_reverse_iterator(data_begin),
-                                                     [](auto c) { return c == ']' || c == '}'; }))
+                                                       thrust::make_reverse_iterator(end),
+                                                       thrust::make_reverse_iterator(data_begin),
+                                                       [](auto c) { return c == ']' || c == '}'; }))
                           .base();
   return {data_begin, data_end};
 }
@@ -287,8 +287,8 @@ __inline__ __device__ numeric::decimal64 decode_value(const char*,
 
 template <>
 __inline__ __device__ numeric::decimal128 decode_value(const char*,
-                                                      const char*,
-                                                      parse_options_view const&)
+                                                       const char*,
+                                                       parse_options_view const&)
 {
   return numeric::decimal128{};
 }
@@ -655,7 +655,7 @@ __global__ void detect_data_types_kernel(
       bool is_negative       = (*desc.value_begin == '-');
       char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+'));
       cudf::size_type* ptr   = cudf::io::gpu::infer_integral_field_counter(
-        data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]);
+          data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]);
       atomicAdd(ptr, 1);
     } else if (is_like_float(
                  value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) {
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 831e309961a..89ec7ee7a47 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -47,7 +47,8 @@ struct quantile_functor {
   rmm::mr::device_memory_resource* mr;
 
   template <typename T>
-  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>() or std::is_same<T, numeric::decimal128>::value, // TODO 
+  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>() or
+                     std::is_same<T, numeric::decimal128>::value,  // TODO
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {
@@ -55,7 +56,9 @@ struct quantile_functor {
   }
 
   template <typename T>
-  std::enable_if_t<std::is_arithmetic<T>::value or cudf::is_fixed_point<T>() and not std::is_same<T, numeric::decimal128>::value, // TODO
+  std::enable_if_t<std::is_arithmetic<T>::value or
+                     cudf::is_fixed_point<T>() and
+                       not std::is_same<T, numeric::decimal128>::value,  // TODO
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 88a1b93e088..16167a82024 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -74,8 +74,8 @@ int16_t __device__ generic_sign(T)
 template <typename T>
 constexpr inline auto is_supported_round_type()
 {
-  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) || cudf::is_fixed_point<T>() 
-         && not std::is_same<T, numeric::decimal128>::value;
+  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) ||
+         cudf::is_fixed_point<T>() && not std::is_same<T, numeric::decimal128>::value;
 }
 
 template <typename T>
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index aee05419b67..be8993cd7ef 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -136,11 +136,11 @@ struct dispatch_to_fixed_point_fn {
 
     // create output column
     auto results   = make_fixed_point_column(output_type,
-                                           input.size(),
-                                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                           input.null_count(),
-                                           stream,
-                                           mr);
+                                             input.size(),
+                                             cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                             input.null_count(),
+                                             stream,
+                                             mr);
     auto d_results = results->mutable_view().data<DecimalType>();
 
     // convert strings into decimal values
@@ -206,17 +206,17 @@ struct decimal_to_string_size_fn {
 
     if (scale >= 0) return count_digits(value) + scale;
 
-    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {
+    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {  // TODO
       auto const abs_value = std::abs(value);
       auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
       auto const fraction  = count_digits(abs_value % exp_ten);
       auto const num_zeros = std::max(0, (-scale - fraction));
       return static_cast<int32_t>(value < 0) +    // sign if negative
-            count_digits(abs_value / exp_ten) +  // integer
-            1 +                                  // decimal point
-            num_zeros +                          // zeros padding
-            fraction;                            // size of fraction
-      } 
+             count_digits(abs_value / exp_ten) +  // integer
+             1 +                                  // decimal point
+             num_zeros +                          // zeros padding
+             fraction;                            // size of fraction
+    }
     return 0;
   }
 };
@@ -250,7 +250,7 @@ struct decimal_to_string_fn {
     // write format:   [-]integer.fraction
     // where integer  = abs(value) / (10^abs(scale))
     //       fraction = abs(value) % (10^abs(scale))
-    if constexpr (not std::is_same<DecimalType, __int128_t>::value) { // TODO fix
+    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {  // TODO fix
       auto const abs_value = std::abs(value);
       if (value < 0) *d_buffer++ = '-';  // add sign
       auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
@@ -352,11 +352,11 @@ struct dispatch_is_fixed_point_fn {
 
     // create output column
     auto results   = make_numeric_column(data_type{type_id::BOOL8},
-                                       input.size(),
-                                       cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                       input.null_count(),
-                                       stream,
-                                       mr);
+                                         input.size(),
+                                         cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                         input.null_count(),
+                                         stream,
+                                         mr);
     auto d_results = results->mutable_view().data<bool>();
 
     // check strings for valid fixed-point chars
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 3849b56d4eb..5c44ca3e445 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -271,9 +271,10 @@ struct fixed_point_floor {
 template <typename T>
 struct fixed_point_abs {
   T n;
-  __device__ T operator()(T data) { 
+  __device__ T operator()(T data)
+  {
     // std::abs does not work for __int128_t
-    return data > 0 ? data : data * -1; 
+    return data > 0 ? data : data * -1;
   }
 };
 

From 606d6e3ec6b7091bf726ea9a45b81cbdbf774b1a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 21 Jul 2021 02:21:36 +0000
Subject: [PATCH 007/112] `cudf::round` support for `__int128_t`

---
 cpp/src/round/round.cu | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 16167a82024..715ee64b103 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -74,8 +74,7 @@ int16_t __device__ generic_sign(T)
 template <typename T>
 constexpr inline auto is_supported_round_type()
 {
-  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) ||
-         cudf::is_fixed_point<T>() && not std::is_same<T, numeric::decimal128>::value;
+  return (cudf::is_numeric<T>() && not std::is_same<T, bool>::value) || cudf::is_fixed_point<T>();
 }
 
 template <typename T>
@@ -87,7 +86,9 @@ struct half_up_zero {
     return generic_round(e);
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -106,7 +107,9 @@ struct half_up_positive {
     return integer_part + generic_round(fractional_part * n) / n;
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -123,7 +126,9 @@ struct half_up_negative {
     return generic_round(e / n) * n;
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down = (e / n) * n;  // result from rounding down
@@ -140,7 +145,9 @@ struct half_even_zero {
     return generic_round_half_even(e);
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -159,7 +166,9 @@ struct half_even_positive {
     return integer_part + generic_round_half_even(fractional_part * n) / n;
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -176,7 +185,9 @@ struct half_even_negative {
     return generic_round_half_even(e / n) * n;
   }
 
-  template <typename U = T, typename std::enable_if_t<std::is_integral<U>::value>* = nullptr>
+  template <typename U                                                     = T,
+            typename std::enable_if_t<std::is_integral<U>::value or
+                                      std::is_same<U, __int128_t>::value>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down_over_n = e / n;            // use this to determine HALF_EVEN case

From ee70203bc37f9308a9dff45627393124ad98b4a7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 21 Jul 2021 07:13:04 +0000
Subject: [PATCH 008/112] Enable tests & fixes

---
 cpp/include/cudf/fixed_point/fixed_point.hpp | 7 +++++--
 cpp/include/cudf_test/type_lists.hpp         | 3 ++-
 cpp/tests/reductions/scan_tests.cpp          | 7 ++++++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index d195f976419..abaecf00925 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -52,10 +52,12 @@ constexpr inline auto is_supported_representation_type()
          cuda::std::is_same<T, __int128_t>::value;
 }
 
+// TODO make a temporary::is_integral function
 template <typename T>
 constexpr inline auto is_supported_construction_value_type()
 {
-  return cuda::std::is_integral<T>::value || cuda::std::is_floating_point<T>::value;
+  return cuda::std::is_integral<T>::value || cuda::std::is_floating_point<T>::value ||
+         cuda::std::is_same<T, __int128_t>::value;
 }
 
 // Helper functions for `fixed_point` type
@@ -277,7 +279,8 @@ class fixed_point {
    * @return The `fixed_point` number in base 10 (aka human readable format)
    */
   template <typename U,
-            typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
+            typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value or
+                                            std::is_same<U, __int128_t>::value>* = nullptr>
   explicit constexpr operator U() const
   {
     // Don't cast to U until converting to Rep because in certain cases casting to U before shifting
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 5c1b0c6c458..e3158d1937b 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -264,7 +264,8 @@ using ListTypes = cudf::test::Types<list_view>;
  * TYPED_TEST_CASE(MyTypedFixture, cudf::test::FixedPointTypes);
  * ```
  */
-using FixedPointTypes = cudf::test::Types<numeric::decimal32, numeric::decimal64>;
+using FixedPointTypes =
+  cudf::test::Types<numeric::decimal32, numeric::decimal64, numeric::decimal128>;
 
 /**
  * @brief Provides a list of all fixed-width element types for use in GTest
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index ef5a66a2019..23931a8d5af 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -63,6 +63,11 @@ struct TypeParam_to_host_type<numeric::decimal64> {
   using type = numeric::decimal64::rep;
 };
 
+template <>
+struct TypeParam_to_host_type<numeric::decimal128> {
+  using type = numeric::decimal128::rep;
+};
+
 template <typename TypeParam, typename T>
 typename std::enable_if<std::is_same_v<TypeParam, cudf::string_view>,
                         thrust::host_vector<std::string>>::type
@@ -408,7 +413,7 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
     return std::string(s);
   });
   auto validity   = cudf::detail::make_counting_transform_iterator(
-    0, [](auto idx) -> bool { return (idx % 23) != 22; });
+      0, [](auto idx) -> bool { return (idx % 23) != 22; });
   cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity);
 
   thrust::host_vector<std::string> v(data_begin, data_begin + row_count);

From fd6157b1abf4ec02ab3bf98b7f95ab521bd34a2f Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 23 Jul 2021 17:35:35 +0000
Subject: [PATCH 009/112] Missing changes

---
 cpp/include/cudf/types.hpp               |  2 +-
 cpp/include/cudf_test/column_wrapper.hpp | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index f639c2dae6b..4cba88e08bb 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -264,7 +264,7 @@ class data_type {
    */
   explicit data_type(type_id id, int32_t scale) : _id{id}, _fixed_point_scale{scale}
   {
-    assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64);
+    assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128);
   }
 
   /**
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index a4857552831..341500d3c92 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -509,11 +509,12 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
   {
     CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
 
-    auto const size         = cudf::distance(begin, end);
-    auto const elements     = thrust::host_vector<Rep>(begin, end);
-    auto const is_decimal32 = std::is_same<Rep, int32_t>::value;
-    auto const id           = is_decimal32 ? type_id::DECIMAL32 : type_id::DECIMAL64;
-    auto const data_type    = cudf::data_type{id, static_cast<int32_t>(scale)};
+    auto const size      = cudf::distance(begin, end);
+    auto const elements  = thrust::host_vector<Rep>(begin, end);
+    auto const id        = std::is_same<Rep, int32_t>::value   ? type_id::DECIMAL32
+                           : std::is_same<Rep, int64_t>::value ? type_id::DECIMAL64
+                                                               : type_id::DECIMAL128;
+    auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{
       data_type,

From d4506af5efbc23a0e60f92bc28359ee0779c76c5 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 23 Jul 2021 18:27:51 +0000
Subject: [PATCH 010/112] Scan, column_wrapper, orc, etc

---
 cpp/include/cudf_test/column_wrapper.hpp  | 11 ++++++-----
 cpp/src/io/orc/stripe_enc.cu              | 17 +++++++++++++----
 cpp/src/io/orc/writer_impl.cu             |  4 +++-
 cpp/src/io/parquet/reader_impl.cu         |  6 ++++--
 cpp/src/reductions/scan/scan_exclusive.cu |  8 ++++++--
 cpp/src/reductions/scan/scan_inclusive.cu |  7 +++++--
 6 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 341500d3c92..2de72321e0d 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -575,11 +575,12 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
   {
     CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
 
-    auto const size         = cudf::distance(begin, end);
-    auto const elements     = thrust::host_vector<Rep>(begin, end);
-    auto const is_decimal32 = std::is_same<Rep, int32_t>::value;
-    auto const id           = is_decimal32 ? type_id::DECIMAL32 : type_id::DECIMAL64;
-    auto const data_type    = cudf::data_type{id, static_cast<int32_t>(scale)};
+    auto const size      = cudf::distance(begin, end);
+    auto const elements  = thrust::host_vector<Rep>(begin, end);
+    auto const id        = std::is_same<Rep, int32_t>::value   ? type_id::DECIMAL32
+                           : std::is_same<Rep, int64_t>::value ? type_id::DECIMAL64
+                                                               : type_id::DECIMAL128;
+    auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{
       data_type,
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index e007c49e61c..e351416314b 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -109,6 +109,13 @@ static inline __device__ uint64_t zigzag(int64_t v)
   return ((v ^ -s) * 2) + s;
 }
 
+static inline __device__ uint64_t zigzag(__int128_t v)
+{
+  // TODO: the 128-bit zigzag result is narrowed to uint64_t on return; wider values are truncated
+  int64_t s = (v < 0) ? 1 : 0;
+  return ((v ^ -s) * 2) + s;
+}
+
 static inline __device__ uint32_t CountLeadingBytes32(uint32_t v) { return __clz(v) >> 3; }
 static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clzll(v) >> 3; }
 
@@ -664,7 +671,7 @@ __global__ void __launch_bounds__(block_size)
     if (s->present_rows < s->chunk.num_rows) {
       uint32_t present_rows = s->present_rows;
       uint32_t nrows        = min(s->chunk.num_rows - present_rows,
-                           512 * 8 - (present_rows - (min(s->cur_row, s->present_out) & ~7)));
+                                  512 * 8 - (present_rows - (min(s->cur_row, s->present_out) & ~7)));
       uint32_t nrows_out;
       if (t * 8 < nrows) {
         uint32_t row  = s->chunk.start_row + present_rows + t * 8;
@@ -870,9 +877,11 @@ __global__ void __launch_bounds__(block_size)
             break;
           case DECIMAL: {
             if (valid) {
-              uint64_t const zz_val = (s->chunk.leaf_column->type().id() == type_id::DECIMAL32)
-                                        ? zigzag(s->chunk.leaf_column->element<int32_t>(row))
-                                        : zigzag(s->chunk.leaf_column->element<int64_t>(row));
+              auto const id = s->chunk.leaf_column->type().id();
+              uint64_t const zz_val =
+                id == type_id::DECIMAL32   ? zigzag(s->chunk.leaf_column->element<int32_t>(row))
+                : id == type_id::DECIMAL64 ? zigzag(s->chunk.leaf_column->element<int64_t>(row))
+                                           : zigzag(s->chunk.leaf_column->element<__int128_t>(row));
               auto const offset =
                 (row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1];
               StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val);
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 0cd3f333ba3..3875f69dbef 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -94,7 +94,8 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id)
     case cudf::type_id::TIMESTAMP_NANOSECONDS: return TypeKind::TIMESTAMP;
     case cudf::type_id::STRING: return TypeKind::STRING;
     case cudf::type_id::DECIMAL32:
-    case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL;
+    case cudf::type_id::DECIMAL64:
+    case cudf::type_id::DECIMAL128: return TypeKind::DECIMAL;
     default: return TypeKind::INVALID_TYPE_KIND;
   }
 }
@@ -121,6 +122,7 @@ constexpr auto orc_precision(cudf::type_id decimal_id)
   switch (decimal_id) {
     case cudf::type_id::DECIMAL32: return 9;
     case cudf::type_id::DECIMAL64: return 18;
+    case cudf::type_id::DECIMAL128: return 38;
     default: return 0;
   }
 }
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 3bf11063035..92f1a19fe2d 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -224,7 +224,8 @@ std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
 
   int8_t converted_type = converted;
   if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 &&
-      column_type_id != type_id::DECIMAL32 && column_type_id != type_id::DECIMAL64) {
+      column_type_id != type_id::DECIMAL32 && column_type_id != type_id::DECIMAL64 &&
+      column_type_id != type_id::DECIMAL128) {
     converted_type = parquet::UNKNOWN;  // Not converting to float64 or decimal
   }
   return std::make_tuple(type_width, clock_rate, converted_type);
@@ -593,7 +594,8 @@ class aggregate_metadata {
     nesting.push_back(static_cast<int>(output_columns.size()));
     auto const col_type =
       to_type_id(schema, strings_to_categorical, timestamp_type_id, strict_decimal_types);
-    auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64
+    auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 ||
+                           col_type == type_id::DECIMAL128
                          ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}}
                          : data_type{col_type};
     output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false);
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 383b64d45a1..c55cb789f7f 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -50,7 +50,9 @@ struct scan_dispatcher {
    * @param mr Device memory resource used to allocate the returned column's device memory
    * @return Output column with scan results
    */
-  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  template <typename T,
+            typename std::enable_if_t<std::is_arithmetic<T>::value ||
+                                      std::is_same<T, __int128_t>::value>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      null_policy,
                                      rmm::cuda_stream_view stream,
@@ -72,7 +74,9 @@ struct scan_dispatcher {
   }
 
   template <typename T, typename... Args>
-  std::enable_if_t<!std::is_arithmetic<T>::value, std::unique_ptr<column>> operator()(Args&&...)
+  std::enable_if_t<not std::is_arithmetic<T>::value and not std::is_same<T, __int128_t>::value,
+                   std::unique_ptr<column>>
+  operator()(Args&&...)
   {
     CUDF_FAIL("Non-arithmetic types not supported for exclusive scan");
   }
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 1beb9ecb282..5cd935759f4 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -124,11 +124,14 @@ struct scan_dispatcher {
   template <typename T>
   static constexpr bool is_supported()
   {
-    return std::is_arithmetic<T>::value || is_string_supported<T>();
+    return std::is_arithmetic<T>::value || is_string_supported<T>() ||
+           std::is_same<T, __int128_t>::value;
   }
 
   // for arithmetic types
-  template <typename T, std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  template <
+    typename T,
+    std::enable_if_t<std::is_arithmetic<T>::value || std::is_same<T, __int128_t>::value>* = nullptr>
   auto inclusive_scan(const column_view& input_view,
                       null_policy,
                       rmm::cuda_stream_view stream,

From 791e91cc5d327b98f04b88b3c20f143fcbae817e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 23 Jul 2021 22:36:10 +0000
Subject: [PATCH 011/112] Binop changes

---
 cpp/src/binaryop/binaryop.cpp | 54 ++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index aaf193ff5cf..d6a605307a0 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -412,13 +412,19 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
         binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-      } else {
-        CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+      } else if (lhs.type().id() == type_id::DECIMAL64) {
         auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
         auto const val    = static_cast<fixed_point_scalar<decimal64> const&>(lhs).value();
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
         binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
+      } else {
+        CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+        auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+        auto const val    = static_cast<fixed_point_scalar<decimal128> const&>(lhs).value();
+        auto const scale  = scale_type{rhs.type().scale()};
+        auto const scalar = make_fixed_point_scalar<decimal128>(val * factor, scale);
+        binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
       }
     } else {
       auto const diff   = rhs.type().scale() - lhs.type().scale();
@@ -427,11 +433,15 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+        } else if (lhs.type().id() == type_id::DECIMAL64) {
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+        } else {
+          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
+          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
@@ -482,13 +492,19 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
         auto const scale  = scale_type{lhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
         binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-      } else {
-        CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+      } else if (rhs.type().id() == type_id::DECIMAL64) {
         auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
         auto const val    = static_cast<fixed_point_scalar<decimal64> const&>(rhs).value();
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
         binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
+      } else {
+        CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+        auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+        auto const val    = static_cast<fixed_point_scalar<decimal128> const&>(rhs).value();
+        auto const scale  = scale_type{rhs.type().scale()};
+        auto const scalar = make_fixed_point_scalar<decimal128>(val * factor, scale);
+        binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
       }
     } else {
       auto const diff   = lhs.type().scale() - rhs.type().scale();
@@ -497,11 +513,15 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+        } else if (rhs.type().id() == type_id::DECIMAL64) {
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+        } else {
+          CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
+          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
@@ -550,11 +570,15 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+        } else if (lhs.type().id() == type_id::DECIMAL64) {
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+        } else {
+          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
+          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
@@ -565,11 +589,15 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
+        } else if (lhs.type().id() == type_id::DECIMAL64) {
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+        } else {
+          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
+          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
+          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
+          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
@@ -684,7 +712,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 
   auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr);
   auto out      = make_fixed_width_column(
-    output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+         output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
 
   // Check for 0 sized data
   if (lhs.is_empty() or rhs.is_empty()) return out;

From ad5fe3537d4aaf580bf3f51e4eb3d505584098d5 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Sat, 24 Jul 2021 21:43:00 +0000
Subject: [PATCH 012/112] detail::to_string

---
 cpp/include/cudf/fixed_point/fixed_point.hpp | 54 ++++++++++++++------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index abaecf00925..cb18bb8ef8d 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -154,6 +154,30 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T shift(T const& val, scale_type const& scal
     return left_shift<Rep, Rad>(val, scale);
 }
 
+template <typename T>
+auto to_string(T value) -> std::string
+{
+  if constexpr (std::is_same<T, __int128>::value) {
+    auto s          = std::string{};
+    auto const sign = value < 0;
+    while (value) {
+      s.push_back("0123456789"[value % 10]);
+      value /= 10;
+    }
+    if (sign) s.push_back('-');
+    std::reverse(s.begin(), s.end());
+    return s;
+  } else {
+    return std::to_string(value);
+  }
+}
+
+template <typename T>
+auto abs(T value)
+{
+  return value >= 0 ? value : -value;
+}
+
 }  // namespace detail
 
 /**
@@ -552,24 +576,20 @@ class fixed_point {
    */
   explicit operator std::string() const
   {
-    if constexpr (not std::is_same<Rep, __int128_t>::value) {
-      if (_scale < 0) {
-        auto const av   = std::abs(_value);
-        int64_t const n = std::pow(10, -_scale);
-        int64_t const f = av % n;
-        auto const num_zeros =
-          std::max(0, (-_scale - static_cast<int32_t>(std::to_string(f).size())));
-        auto const zeros = std::string(num_zeros, '0');
-        auto const sign  = _value < 0 ? std::string("-") : std::string();
-        return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n);
-      } else {
-        auto const zeros = std::string(_scale, '0');
-        return std::to_string(_value) + zeros;
-      }
+    if (_scale < 0) {
+      auto const av = detail::abs(_value);
+      Rep const n   = std::pow(10, -_scale);  // does this work for all values of __int128
+      Rep const f   = av % n;
+      auto const num_zeros =
+        std::max(0, (-_scale - static_cast<int32_t>(detail::to_string(f).size())));
+      auto const zeros = std::string(num_zeros, '0');
+      auto const sign  = _value < 0 ? std::string("-") : std::string();
+      return sign + detail::to_string(av / n) + std::string(".") + zeros +
+             detail::to_string(av % n);
+    } else {
+      auto const zeros = std::string(_scale, '0');
+      return detail::to_string(_value) + zeros;
     }
-
-    // std::abs and std::to_string don't work on __int128_t
-    return "TODO";
   }
 };
 

From 7cc9db1ca5a2752be0405bdaedcf7480a49f5027 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Sat, 24 Jul 2021 21:43:13 +0000
Subject: [PATCH 013/112] Aggregation changes

---
 .../cudf/detail/aggregation/aggregation.cuh       | 13 ++++++++-----
 .../cudf/detail/aggregation/aggregation.hpp       | 15 ++++++++++++---
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index c64fba286d4..848d52f3a44 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -593,9 +593,7 @@ struct identity_initializer {
   template <typename T, aggregation::Kind k>
   static constexpr bool is_supported()
   {
-    // Note: !is_fixed_point<T>() means that aggregations for fixed_point should happen on the
-    //       underlying type (see device_storage_type_t), not that fixed_point is not supported
-    return cudf::is_fixed_width<T>() && !is_fixed_point<T>() and
+    return cudf::is_fixed_width<T>() and
            (k == aggregation::SUM or k == aggregation::MIN or k == aggregation::MAX or
             k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or
             k == aggregation::ARGMAX or k == aggregation::ARGMIN or
@@ -608,7 +606,8 @@ struct identity_initializer {
   std::enable_if_t<not std::is_same<corresponding_operator_t<k>, void>::value, T>
   identity_from_operator()
   {
-    return corresponding_operator_t<k>::template identity<T>();
+    using DeviceType = device_storage_type_t<T>;
+    return corresponding_operator_t<k>::template identity<DeviceType>();
   }
 
   template <typename T, aggregation::Kind k>
@@ -637,7 +636,11 @@ struct identity_initializer {
   std::enable_if_t<is_supported<T, k>(), void> operator()(mutable_column_view const& col,
                                                           rmm::cuda_stream_view stream)
   {
-    thrust::fill(rmm::exec_policy(stream), col.begin<T>(), col.end<T>(), get_identity<T, k>());
+    using DeviceType = device_storage_type_t<T>;
+    thrust::fill(rmm::exec_policy(stream),
+                 col.begin<DeviceType>(),
+                 col.end<DeviceType>(),
+                 get_identity<DeviceType, k>());
   }
 
   template <typename T, aggregation::Kind k>
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 10d9d8c1b92..3830a6c96a2 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -923,13 +923,22 @@ struct target_type_impl<
   using type = int64_t;
 };
 
-// Summing fixed_point numbers, always use the decimal64 accumulator
+// Summing fixed_point numbers
+template <typename Source, aggregation::Kind k>
+struct target_type_impl<Source,
+                        k,
+                        std::enable_if_t<cudf::is_fixed_point<Source>() &&
+                                         not std::is_same<Source, numeric::decimal128>::value &&
+                                         (k == aggregation::SUM)>> {
+  using type = numeric::decimal64;
+};
+
 template <typename Source, aggregation::Kind k>
 struct target_type_impl<
   Source,
   k,
-  std::enable_if_t<cudf::is_fixed_point<Source>() && (k == aggregation::SUM)>> {
-  using type = numeric::decimal64;
+  std::enable_if_t<std::is_same<Source, numeric::decimal128>::value && (k == aggregation::SUM)>> {
+  using type = numeric::decimal128;
 };
 
 // Summing/Multiplying float/doubles, use same type accumulator

From 5dd6874bfd66348a0848a2d810f50fd584ccf5cf Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Sun, 25 Jul 2021 01:33:23 +0000
Subject: [PATCH 014/112] Small fix in fixed_point.hpp

---
 cpp/include/cudf/fixed_point/fixed_point.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index cb18bb8ef8d..0d0d3938588 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -157,7 +157,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T shift(T const& val, scale_type const& scal
 template <typename T>
 auto to_string(T value) -> std::string
 {
-  if constexpr (std::is_same<T, __int128>::value) {
+  if constexpr (cuda::std::is_same<T, __int128>::value) {
     auto s          = std::string{};
     auto const sign = value < 0;
     while (value) {
@@ -304,7 +304,7 @@ class fixed_point {
    */
   template <typename U,
             typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value or
-                                            std::is_same<U, __int128_t>::value>* = nullptr>
+                                            cuda::std::is_same<U, __int128_t>::value>* = nullptr>
   explicit constexpr operator U() const
   {
     // Don't cast to U until converting to Rep because in certain cases casting to U before shifting

From a89f958773aab316302c609288d9a1878220e3ba Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Sun, 25 Jul 2021 01:33:49 +0000
Subject: [PATCH 015/112] Enable quantile

---
 cpp/src/quantiles/quantile.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 89ec7ee7a47..25bf4a436ad 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -47,8 +47,7 @@ struct quantile_functor {
   rmm::mr::device_memory_resource* mr;
 
   template <typename T>
-  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>() or
-                     std::is_same<T, numeric::decimal128>::value,  // TODO
+  std::enable_if_t<not std::is_arithmetic<T>::value and not cudf::is_fixed_point<T>(),
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {
@@ -56,9 +55,7 @@ struct quantile_functor {
   }
 
   template <typename T>
-  std::enable_if_t<std::is_arithmetic<T>::value or
-                     cudf::is_fixed_point<T>() and
-                       not std::is_same<T, numeric::decimal128>::value,  // TODO
+  std::enable_if_t<std::is_arithmetic<T>::value or cudf::is_fixed_point<T>(),
                    std::unique_ptr<column>>
   operator()(column_view const& input)
   {

From a16a2b8e282917967b806391d04faf2473ca6ac6 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Mon, 26 Jul 2021 15:53:58 +0000
Subject: [PATCH 016/112] Comment update

---
 cpp/src/reductions/simple.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index 61002481ddc..58443fc10f9 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -396,7 +396,7 @@ struct element_type_dispatcher {
   }
 
   /**
-   * @brief Specialization for reducing integer column types to any output type.
+   * @brief Specialization for reducing fixed_point column types to fixed_point number
    */
   template <typename ElementType,
             typename std::enable_if_t<cudf::is_fixed_point<ElementType>()>* = nullptr>

From e89a9ba43f2d08e37fb7222a185e3e6722dfb51e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Mon, 26 Jul 2021 15:54:52 +0000
Subject: [PATCH 017/112] REDUCTION_TEST working changes

---
 .../detail/utilities/device_operators.cuh     | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 3a1bb91b56c..0eed84880ea 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -89,7 +89,7 @@ struct DeviceMin {
   template <typename T>
   CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs)
   {
-    return std::min(lhs, rhs);
+    return lhs < rhs ? lhs : rhs;
   }
 
   template <
@@ -98,6 +98,14 @@ struct DeviceMin {
                               !cudf::is_dictionary<T>() && !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
+    if constexpr (std::is_same<T, __int128_t>::value) {
+      __int128_t max = 1;
+      for (int i = 0; i < 126; ++i) {
+        max *= 2;
+      }
+      return max + (max - 1);
+    }
+
     return std::numeric_limits<T>::max();
   }
 
@@ -128,7 +136,7 @@ struct DeviceMax {
   template <typename T>
   CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs)
   {
-    return std::max(lhs, rhs);
+    return lhs > rhs ? lhs : rhs;
   }
 
   template <
@@ -137,6 +145,14 @@ struct DeviceMax {
                               !cudf::is_dictionary<T>() && !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
+    if constexpr (std::is_same<T, __int128_t>::value) {
+      __int128_t lowest = -1;
+      for (int i = 0; i < 127; ++i) {
+        lowest *= 2;
+      }
+      return lowest;
+    }
+
     return std::numeric_limits<T>::lowest();
   }
 

From 7ef28bf706a022a16696a388458c6976ca17b368 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Mon, 26 Jul 2021 18:39:53 +0000
Subject: [PATCH 018/112] ROLLING_TEST changes

---
 cpp/src/aggregation/aggregation.cpp |  5 +++--
 cpp/tests/rolling/rolling_test.cpp  | 14 ++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 53a55351f8e..8eff18bf966 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -594,8 +594,9 @@ struct target_type_functor {
   constexpr data_type operator()() const noexcept
   {
     auto const id = type_to_id<target_type_t<Source, k>>();
-    return id == type_id::DECIMAL32 || id == type_id::DECIMAL64 ? data_type{id, type.scale()}
-                                                                : data_type{id};
+    return id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128
+             ? data_type{id, type.scale()}
+             : data_type{id};
   }
 };
 
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index a67e670acb7..19778e22f10 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -32,6 +32,7 @@
 
 #include <thrust/iterator/constant_iterator.h>
 
+#include <type_traits>
 #include <vector>
 
 using cudf::bitmask_type;
@@ -1087,15 +1088,16 @@ TYPED_TEST(FixedPointTests, MinMaxCountLagLeadNulls)
 {
   using namespace numeric;
   using namespace cudf;
-  using decimalXX    = TypeParam;
-  using RepType      = cudf::device_storage_type_t<decimalXX>;
-  using fp_wrapper   = cudf::test::fixed_point_column_wrapper<RepType>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
-  using fw_wrapper   = cudf::test::fixed_width_column_wrapper<size_type>;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using sum_type   = std::conditional_t<std::is_same_v<RepType, __int128_t>, __int128_t, int64_t>;
+  using fpsum_wrapper = cudf::test::fixed_point_column_wrapper<sum_type>;
+  using fw_wrapper    = cudf::test::fixed_width_column_wrapper<size_type>;
 
   auto const scale              = scale_type{-1};
   auto const input              = fp_wrapper{{42, 1729, 55, 343, 1, 2}, {1, 0, 1, 0, 1, 1}, scale};
-  auto const expected_sum       = fp64_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
+  auto const expected_sum       = fpsum_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_min       = fp_wrapper{{42, 42, 55, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_max       = fp_wrapper{{42, 55, 55, 55, 2, 2}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_lag       = fp_wrapper{{0, 42, 1729, 55, 343, 1}, {0, 1, 0, 1, 0, 1}, scale};

From 7fd4ac41debff8cfbf5551d6fa959dcc8c98ddf6 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Mon, 26 Jul 2021 19:47:05 +0000
Subject: [PATCH 019/112] Initial changes for STRINGS_TEST

---
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  2 +-
 .../strings/convert/convert_fixed_point.cu    | 44 +++++++++----------
 cpp/src/strings/convert/utilities.cuh         |  1 +
 3 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 0d0d3938588..05ca724c358 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -173,7 +173,7 @@ auto to_string(T value) -> std::string
 }
 
 template <typename T>
-auto abs(T value)
+CUDA_HOST_DEVICE_CALLABLE auto abs(T value)
 {
   return value >= 0 ? value : -value;
 }
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index be8993cd7ef..20dba8689d8 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -115,7 +115,7 @@ struct string_to_decimal_check_fn {
     return (exp_ten < scale)
              ? true
              : value <= static_cast<uint64_t>(
-                          std::numeric_limits<DecimalType>::max() /
+                          std::numeric_limits<DecimalType>::max() /  // TODO probably broken
                           static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
   }
 };
@@ -206,18 +206,16 @@ struct decimal_to_string_size_fn {
 
     if (scale >= 0) return count_digits(value) + scale;
 
-    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {  // TODO
-      auto const abs_value = std::abs(value);
-      auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
-      auto const fraction  = count_digits(abs_value % exp_ten);
-      auto const num_zeros = std::max(0, (-scale - fraction));
-      return static_cast<int32_t>(value < 0) +    // sign if negative
-             count_digits(abs_value / exp_ten) +  // integer
-             1 +                                  // decimal point
-             num_zeros +                          // zeros padding
-             fraction;                            // size of fraction
-    }
-    return 0;
+    auto const abs_value = numeric::detail::abs(value);
+    auto const exp_ten   = static_cast<int64_t>(exp10(
+        static_cast<double>(-scale)));  // TODO probably broken (might need numeric::detail::exp10)
+    auto const fraction  = count_digits(abs_value % exp_ten);
+    auto const num_zeros = std::max(0, (-scale - fraction));
+    return static_cast<int32_t>(value < 0) +    // sign if negative
+           count_digits(abs_value / exp_ten) +  // integer
+           1 +                                  // decimal point
+           num_zeros +                          // zeros padding
+           fraction;                            // size of fraction
   }
 };
 
@@ -250,20 +248,18 @@ struct decimal_to_string_fn {
     // write format:   [-]integer.fraction
     // where integer  = abs(value) / (10^abs(scale))
     //       fraction = abs(value) % (10^abs(scale))
-    if constexpr (not std::is_same<DecimalType, __int128_t>::value) {  // TODO fix
-      auto const abs_value = std::abs(value);
-      if (value < 0) *d_buffer++ = '-';  // add sign
-      auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
-      auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
+    auto const abs_value = numeric::detail::abs(value);
+    if (value < 0) *d_buffer++ = '-';  // add sign
+    auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
+    auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
 
-      d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
-      *d_buffer++ = '.';                                             // add decimal point
+    d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
+    *d_buffer++ = '.';                                             // add decimal point
 
-      thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; });  // add zeros
-      d_buffer += num_zeros;
+    thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; });  // add zeros
+    d_buffer += num_zeros;
 
-      integer_to_string(abs_value % exp_ten, d_buffer);  // add the fraction part
-    }
+    integer_to_string(abs_value % exp_ten, d_buffer);  // add the fraction part
   }
 };
 
diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh
index 746923526a1..66606314261 100644
--- a/cpp/src/strings/convert/utilities.cuh
+++ b/cpp/src/strings/convert/utilities.cuh
@@ -96,6 +96,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer)
 template <typename IntegerType>
 constexpr size_type count_digits(IntegerType value)
 {
+  // TODO definitely broken
   if (value == 0) return 1;
   bool is_negative = std::is_signed<IntegerType>::value ? (value < 0) : false;
   // abs(std::numeric_limits<IntegerType>::min()) is negative;

From 016c35aed3d40277efb1023a2a8d7883df04ceee Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 06:26:09 +0000
Subject: [PATCH 020/112] STRINGS changes

---
 .../cudf/detail/utilities/integer_utils.hpp   |  12 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  25 +---
 cpp/include/cudf/fixed_point/temporary.hpp    | 109 ++++++++++++++++++
 .../strings/convert/convert_fixed_point.cu    |   2 +-
 cpp/src/strings/convert/utilities.cuh         |   6 +-
 5 files changed, 118 insertions(+), 36 deletions(-)
 create mode 100644 cpp/include/cudf/fixed_point/temporary.hpp

diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index dc919433da7..365ee1e91f4 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -22,6 +22,8 @@
  * @file Utility code involving integer arithmetic
  */
 
+#include <cudf/fixed_point/temporary.hpp>
+
 #include <cmath>
 #include <cstdlib>
 #include <stdexcept>
@@ -151,17 +153,11 @@ constexpr inline bool is_a_power_of_two(I val) noexcept
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
-{
-  return std::abs(value);
-}
-// Unsigned type just returns itself.
-template <typename T>
-std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
+constexpr inline auto absolute_value(T value) -> T
 {
+  if constexpr (numeric::detail::is_signed<T>()) return numeric::detail::abs(value);
   return value;
 }
 
 }  // namespace util
-
 }  // namespace cudf
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 05ca724c358..229f39f9389 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/assert.cuh>
+#include <cudf/fixed_point/temporary.hpp>
 #include <cudf/types.hpp>
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
@@ -154,30 +155,6 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T shift(T const& val, scale_type const& scal
     return left_shift<Rep, Rad>(val, scale);
 }
 
-template <typename T>
-auto to_string(T value) -> std::string
-{
-  if constexpr (cuda::std::is_same<T, __int128>::value) {
-    auto s          = std::string{};
-    auto const sign = value < 0;
-    while (value) {
-      s.push_back("0123456789"[value % 10]);
-      value /= 10;
-    }
-    if (sign) s.push_back('-');
-    std::reverse(s.begin(), s.end());
-    return s;
-  } else {
-    return std::to_string(value);
-  }
-}
-
-template <typename T>
-CUDA_HOST_DEVICE_CALLABLE auto abs(T value)
-{
-  return value >= 0 ? value : -value;
-}
-
 }  // namespace detail
 
 /**
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
new file mode 100644
index 00000000000..3c487a28d74
--- /dev/null
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/types.hpp>
+
+// Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
+//       Jitify is needed for several algorithms (binaryop, rolling, etc)
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>  // add cuda namespace
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <string>
+
+//! `fixed_point` and supporting types
+namespace numeric {
+namespace detail {
+namespace numeric_limits {
+
+template <typename T>
+auto max() -> T
+{
+  if constexpr (std::is_same_v<T, __int128_t>) {
+    // −170,141,183,460,469,231,731,687,303,715,884,105,728
+    __int128_t max = 1;
+    for (int i = 0; i < 126; ++i) {
+      max *= 2;
+    }
+    return max + (max - 1);
+  }
+
+  return std::numeric_limits<T>::max();
+}
+
+template <typename T>
+auto lowest() -> T
+{
+  if constexpr (std::is_same_v<T, __int128_t>) {
+    // 170,141,183,460,469,231,731,687,303,715,884,105,728
+    __int128_t lowest = -1;
+    for (int i = 0; i < 127; ++i) {
+      lowest *= 2;
+    }
+    return lowest;
+  }
+
+  return std::numeric_limits<T>::lowest();
+}
+
+}  // namespace numeric_limits
+
+template <typename T>
+auto to_string(T value) -> std::string
+{
+  if constexpr (cuda::std::is_same<T, __int128_t>::value) {
+    auto s          = std::string{};
+    auto const sign = value < 0;
+    if (sign) {
+      value += 1;  // avoid overflowing if value == _int128_t lowest
+      value *= -1;
+      if (value == detail::numeric_limits::max<__int128_t>())
+        return "-170141183460469231731687303715884105728";
+      value += 1;  // can add back the one, not need to avoid overflow anymore
+    }
+    while (value) {
+      s.push_back("0123456789"[value % 10]);
+      value /= 10;
+    }
+    if (sign) s.push_back('-');
+    std::reverse(s.begin(), s.end());
+    return s;
+  } else {
+    return std::to_string(value);
+  }
+}
+
+template <typename T>
+CUDA_HOST_DEVICE_CALLABLE constexpr auto abs(T value)
+{
+  return value >= 0 ? value : -value;
+}
+
+template <typename T>
+CUDA_HOST_DEVICE_CALLABLE constexpr auto is_signed()
+{
+  return std::is_signed<T>::value || std::is_same_v<T, __int128_t>;
+}
+
+}  // namespace detail
+
+/** @} */  // end of group
+}  // namespace numeric
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 20dba8689d8..9d0a6a3fdd2 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -248,8 +248,8 @@ struct decimal_to_string_fn {
     // write format:   [-]integer.fraction
     // where integer  = abs(value) / (10^abs(scale))
     //       fraction = abs(value) % (10^abs(scale))
-    auto const abs_value = numeric::detail::abs(value);
     if (value < 0) *d_buffer++ = '-';  // add sign
+    auto const abs_value = numeric::detail::abs(value);
     auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
     auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
 
diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh
index 66606314261..6a6c92ba7c7 100644
--- a/cpp/src/strings/convert/utilities.cuh
+++ b/cpp/src/strings/convert/utilities.cuh
@@ -64,8 +64,8 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer)
     *d_buffer = '0';
     return 1;
   }
-  bool const is_negative = std::is_signed<IntegerType>::value ? (value < 0) : false;
-  //
+  bool const is_negative = numeric::detail::is_signed<IntegerType>() ? (value < 0) : false;
+
   constexpr IntegerType base = 10;
   constexpr int MAX_DIGITS   = 20;  // largest 64-bit integer is 20 digits
   char digits[MAX_DIGITS];          // place-holder for digit chars
@@ -98,7 +98,7 @@ constexpr size_type count_digits(IntegerType value)
 {
   // TODO definitely broken
   if (value == 0) return 1;
-  bool is_negative = std::is_signed<IntegerType>::value ? (value < 0) : false;
+  bool is_negative = numeric::detail::is_signed<IntegerType>() ? (value < 0) : false;
   // abs(std::numeric_limits<IntegerType>::min()) is negative;
   // for all integer types, the max() and min() values have the same number of digits
   value = (value == std::numeric_limits<IntegerType>::min())

From dbd050429adff62d945fb71852c9ca42f543ba0e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 07:23:25 +0000
Subject: [PATCH 021/112] Clean up

---
 .../detail/utilities/device_operators.cuh     |  5 ++--
 cpp/include/cudf/fixed_point/temporary.hpp    | 24 +++++++++++++++----
 cpp/src/unary/math_ops.cu                     |  6 +----
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 0eed84880ea..57065989df9 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -22,6 +22,7 @@
  */
 
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/fixed_point/temporary.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/types.hpp>
@@ -89,7 +90,7 @@ struct DeviceMin {
   template <typename T>
   CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs)
   {
-    return lhs < rhs ? lhs : rhs;
+    return numeric::detail::min(lhs, rhs);
   }
 
   template <
@@ -136,7 +137,7 @@ struct DeviceMax {
   template <typename T>
   CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs)
   {
-    return lhs > rhs ? lhs : rhs;
+    return numeric::detail::max(lhs, rhs);
   }
 
   template <
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 3c487a28d74..e3e598daa55 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -38,7 +38,7 @@ template <typename T>
 auto max() -> T
 {
   if constexpr (std::is_same_v<T, __int128_t>) {
-    // −170,141,183,460,469,231,731,687,303,715,884,105,728
+    // 170,141,183,460,469,231,731,687,303,715,884,105,727
     __int128_t max = 1;
     for (int i = 0; i < 126; ++i) {
       max *= 2;
@@ -53,7 +53,7 @@ template <typename T>
 auto lowest() -> T
 {
   if constexpr (std::is_same_v<T, __int128_t>) {
-    // 170,141,183,460,469,231,731,687,303,715,884,105,728
+    // -170,141,183,460,469,231,731,687,303,715,884,105,728
     __int128_t lowest = -1;
     for (int i = 0; i < 127; ++i) {
       lowest *= 2;
@@ -92,17 +92,33 @@ auto to_string(T value) -> std::string
 }
 
 template <typename T>
-CUDA_HOST_DEVICE_CALLABLE constexpr auto abs(T value)
+constexpr auto abs(T value)
 {
   return value >= 0 ? value : -value;
 }
 
 template <typename T>
-CUDA_HOST_DEVICE_CALLABLE constexpr auto is_signed()
+CUDA_HOST_DEVICE_CALLABLE auto min(T lhs, T rhs)
+{
+  return lhs < rhs ? lhs : rhs;
+}
+
+template <typename T>
+CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs)
+{
+  return lhs > rhs ? lhs : rhs;
+}
+
+template <typename T>
+constexpr auto is_signed()
 {
   return std::is_signed<T>::value || std::is_same_v<T, __int128_t>;
 }
 
+// TODO add is_integral
+// TODO add is_arithmetic
+// TODO pull down upstream, then regex replace is_same<>::value
+
 }  // namespace detail
 
 /** @} */  // end of group
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 5c44ca3e445..6359c50c21a 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -271,11 +271,7 @@ struct fixed_point_floor {
 template <typename T>
 struct fixed_point_abs {
   T n;
-  __device__ T operator()(T data)
-  {
-    // std::abs does not work for __int128_t
-    return data > 0 ? data : data * -1;
-  }
+  __device__ T operator()(T data) { return numeric::detail::abs(data); }
 };
 
 template <typename T, template <typename> typename FixedPointFunctor>

From bf34d20b545c239dc12e6cf2544e488dfab6ac12 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 14:57:02 +0000
Subject: [PATCH 022/112] std::is_same_v

---
 .../cudf/column/column_device_view.cuh        |  2 +-
 .../cudf/detail/aggregation/aggregation.cuh   | 12 +++++-----
 cpp/include/cudf/detail/copy_if.cuh           |  2 +-
 .../detail/utilities/device_operators.cuh     |  4 ++--
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  6 ++---
 cpp/include/cudf/fixed_point/temporary.hpp    |  6 ++---
 cpp/include/cudf/utilities/traits.hpp         |  2 +-
 cpp/src/reductions/scan/scan_exclusive.cu     |  4 ++--
 cpp/src/reductions/scan/scan_inclusive.cu     |  4 ++--
 cpp/src/round/round.cu                        | 24 +++++++++----------
 10 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 6aa6eaf7de8..07e05083734 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -465,7 +465,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @param element_index Position of the desired element
    * @return numeric::decimal128 representing the element at this index
    */
-  template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal128>::value)>
+  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, numeric::decimal128>)>
   __device__ T element(size_type element_index) const noexcept
   {
     using namespace numeric;
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 848d52f3a44..af76f07af16 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -139,7 +139,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same<Source, __int128_t>::value) {
+    if constexpr (not std::is_same_v<Source, __int128_t>) {
       using Target = target_type_t<Source, aggregation::MIN>;
       atomicMin(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -166,7 +166,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
       atomicMin(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }
@@ -189,7 +189,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same<Source, __int128_t>::value) {
+    if constexpr (not std::is_same_v<Source, __int128_t>) {
       using Target = target_type_t<Source, aggregation::MAX>;
       atomicMax(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -216,7 +216,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
       atomicMax(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }
@@ -239,7 +239,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same<Source, __int128_t>::value) {
+    if constexpr (not std::is_same_v<Source, __int128_t>) {
       using Target = target_type_t<Source, aggregation::SUM>;
       atomicAdd(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -266,7 +266,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same<DeviceSource, __int128_t>::value) {
+    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
       atomicAdd(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index ca2592eab3d..06efcaedf6c 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -224,7 +224,7 @@ struct DeviceType<T, std::enable_if_t<std::is_same_v<numeric::decimal64, T>>> {
 };
 
 template <typename T>
-struct DeviceType<T, std::enable_if_t<std::is_same<numeric::decimal128, T>::value>> {
+struct DeviceType<T, std::enable_if_t<std::is_same_v<numeric::decimal128, T>>> {
   using type = typename cudf::device_storage_type_t<T>;
 };
 
diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 5d6ec6e94bf..90449982cc2 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -99,7 +99,7 @@ struct DeviceMin {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (std::is_same<T, __int128_t>::value) {
+    if constexpr (std::is_same_v<T, __int128_t>) {
       __int128_t max = 1;
       for (int i = 0; i < 126; ++i) {
         max *= 2;
@@ -145,7 +145,7 @@ struct DeviceMax {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (std::is_same<T, __int128_t>::value) {
+    if constexpr (std::is_same_v<T, __int128_t>) {
       __int128_t lowest = -1;
       for (int i = 0; i < 127; ++i) {
         lowest *= 2;
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 08dbb20a794..8dcc2952bbe 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -49,7 +49,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 };
 template <typename T>
 constexpr inline auto is_supported_representation_type()
 {
-  return cuda::std::is_same_v<T, int32_t> || cuda::std::is_same<T, int64_t> ||
+  return cuda::std::is_same_v<T, int32_t> || cuda::std::is_same_v<T, int64_t> ||
          cuda::std::is_same_v<T, __int128_t>;
 }
 
@@ -58,7 +58,7 @@ template <typename T>
 constexpr inline auto is_supported_construction_value_type()
 {
   return cuda::std::is_integral<T>::value || cuda::std::is_floating_point<T>::value ||
-         cuda::std::is_same<T, __int128_t>::value;
+         cuda::std::is_same_v<T, __int128_t>;
 }
 
 // Helper functions for `fixed_point` type
@@ -281,7 +281,7 @@ class fixed_point {
    */
   template <typename U,
             typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value or
-                                            cuda::std::is_same<U, __int128_t>::value>* = nullptr>
+                                            cuda::std::is_same_v<U, __int128_t>>* = nullptr>
   explicit constexpr operator U() const
   {
     // Don't cast to U until converting to Rep because in certain cases casting to U before shifting
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index e3e598daa55..82a12540001 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -32,7 +32,7 @@
 //! `fixed_point` and supporting types
 namespace numeric {
 namespace detail {
-namespace numeric_limits {
+namespace numeric_limits {  // TODO switch this to struct
 
 template <typename T>
 auto max() -> T
@@ -69,7 +69,7 @@ auto lowest() -> T
 template <typename T>
 auto to_string(T value) -> std::string
 {
-  if constexpr (cuda::std::is_same<T, __int128_t>::value) {
+  if constexpr (cuda::std::is_same_v<T, __int128_t>) {
     auto s          = std::string{};
     auto const sign = value < 0;
     if (sign) {
@@ -117,7 +117,7 @@ constexpr auto is_signed()
 
 // TODO add is_integral
 // TODO add is_arithmetic
-// TODO pull down upstream, then regex replace is_same<>::value
+// TODO pull down upstream, then regex replace is_same_v<>
 
 }  // namespace detail
 
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index e6d50442cc6..a0d654b4307 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -153,7 +153,7 @@ template <typename T>
 constexpr inline bool is_numeric()
 {
   return std::is_integral<T>::value or std::is_floating_point<T>::value or
-         std::is_same<T, __int128_t>::value;
+         std::is_same_v<T, __int128_t>;
 }
 
 struct is_numeric_impl {
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 4e40e47538f..664d3bcebad 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -52,7 +52,7 @@ struct scan_dispatcher {
    */
   template <typename T,
             typename std::enable_if_t<std::is_arithmetic<T>::value ||
-                                      std::is_same<T, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<T, __int128_t>>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      null_policy,
                                      rmm::cuda_stream_view stream,
@@ -74,7 +74,7 @@ struct scan_dispatcher {
   }
 
   template <typename T, typename... Args>
-  std::enable_if_t<not std::is_arithmetic<T>::value and not std::is_same<T, __int128_t>::value,
+  std::enable_if_t<not std::is_arithmetic<T>::value and not std::is_same_v<T, __int128_t>,
                    std::unique_ptr<column>>
   operator()(Args&&...)
   {
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index cc5107f591a..aa075cd5543 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -123,13 +123,13 @@ struct scan_dispatcher {
   static constexpr bool is_supported()
   {
     return std::is_arithmetic<T>::value || is_string_supported<T>() ||
-           std::is_same<T, __int128_t>::value;
+           std::is_same_v<T, __int128_t>;
   }
 
   // for arithmetic types
   template <
     typename T,
-    std::enable_if_t<std::is_arithmetic<T>::value || std::is_same<T, __int128_t>::value>* = nullptr>
+    std::enable_if_t<std::is_arithmetic<T>::value || std::is_same_v<T, __int128_t>>* = nullptr>
   auto inclusive_scan(column_view const& input_view,
                       null_policy,
                       rmm::cuda_stream_view stream,
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 559735cc8d3..d974da7353a 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -86,9 +86,9 @@ struct half_up_zero {
     return generic_round(e);
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -107,9 +107,9 @@ struct half_up_positive {
     return integer_part + generic_round(fractional_part * n) / n;
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -126,9 +126,9 @@ struct half_up_negative {
     return generic_round(e / n) * n;
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down = (e / n) * n;  // result from rounding down
@@ -145,9 +145,9 @@ struct half_even_zero {
     return generic_round_half_even(e);
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -166,9 +166,9 @@ struct half_even_positive {
     return integer_part + generic_round_half_even(fractional_part * n) / n;
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -185,9 +185,9 @@ struct half_even_negative {
     return generic_round_half_even(e / n) * n;
   }
 
-  template <typename U                                                     = T,
+  template <typename U                                                = T,
             typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same<U, __int128_t>::value>* = nullptr>
+                                      std::is_same_v<U, __int128_t>>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down_over_n = e / n;            // use this to determine HALF_EVEN case

From 103a4db3e2c4ad66b92d2031e2a666e2aa455a34 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 15:51:57 +0000
Subject: [PATCH 023/112] is_integral & is_arithmetic

---
 cpp/include/cudf/fixed_point/fixed_point.hpp |  6 ++----
 cpp/include/cudf/fixed_point/temporary.hpp   | 19 +++++++++++++------
 cpp/include/cudf/utilities/traits.hpp        |  3 +--
 cpp/src/reductions/scan/scan_exclusive.cu    |  9 +++------
 cpp/src/reductions/scan/scan_inclusive.cu    |  7 ++-----
 cpp/src/round/round.cu                       | 16 ++++------------
 6 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 8dcc2952bbe..f4254ffe4ba 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -57,8 +57,7 @@ constexpr inline auto is_supported_representation_type()
 template <typename T>
 constexpr inline auto is_supported_construction_value_type()
 {
-  return cuda::std::is_integral<T>::value || cuda::std::is_floating_point<T>::value ||
-         cuda::std::is_same_v<T, __int128_t>;
+  return numeric::detail::is_integral<T>() || cuda::std::is_floating_point<T>::value;
 }
 
 // Helper functions for `fixed_point` type
@@ -280,8 +279,7 @@ class fixed_point {
    * @return The `fixed_point` number in base 10 (aka human readable format)
    */
   template <typename U,
-            typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value or
-                                            cuda::std::is_same_v<U, __int128_t>>* = nullptr>
+            typename cuda::std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   explicit constexpr operator U() const
   {
     // Don't cast to U until converting to Rep because in certain cases casting to U before shifting
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 82a12540001..8a33ec498ee 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -27,9 +27,10 @@
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <limits>
 #include <string>
+#include <type_traits>
 
-//! `fixed_point` and supporting types
 namespace numeric {
 namespace detail {
 namespace numeric_limits {  // TODO switch this to struct
@@ -115,11 +116,17 @@ constexpr auto is_signed()
   return std::is_signed<T>::value || std::is_same_v<T, __int128_t>;
 }
 
-// TODO add is_integral
-// TODO add is_arithmetic
-// TODO pull down upstream, then regex replace is_same_v<>
+template <typename T>
+constexpr auto is_integral()
+{
+  return cuda::std::is_integral<T>::value || cuda::std::is_same_v<T, __int128_t>;
+}
 
-}  // namespace detail
+template <typename T>
+constexpr auto is_arithmetic()
+{
+  return numeric::detail::is_integral<T>() || cuda::std::is_floating_point_v<T>;
+}
 
-/** @} */  // end of group
+}  // namespace detail
 }  // namespace numeric
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index a0d654b4307..dbb06865f20 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -152,8 +152,7 @@ constexpr inline bool is_equality_comparable()
 template <typename T>
 constexpr inline bool is_numeric()
 {
-  return std::is_integral<T>::value or std::is_floating_point<T>::value or
-         std::is_same_v<T, __int128_t>;
+  return numeric::detail::is_integral<T>() or std::is_floating_point<T>::value;
 }
 
 struct is_numeric_impl {
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 664d3bcebad..5c3810743a9 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -50,9 +50,7 @@ struct scan_dispatcher {
    * @param mr Device memory resource used to allocate the returned column's device memory
    * @return Output column with scan results
    */
-  template <typename T,
-            typename std::enable_if_t<std::is_arithmetic<T>::value ||
-                                      std::is_same_v<T, __int128_t>>* = nullptr>
+  template <typename T, typename std::enable_if_t<numeric::detail::is_arithmetic<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      null_policy,
                                      rmm::cuda_stream_view stream,
@@ -74,9 +72,8 @@ struct scan_dispatcher {
   }
 
   template <typename T, typename... Args>
-  std::enable_if_t<not std::is_arithmetic<T>::value and not std::is_same_v<T, __int128_t>,
-                   std::unique_ptr<column>>
-  operator()(Args&&...)
+  std::enable_if_t<not numeric::detail::is_arithmetic<T>(), std::unique_ptr<column>> operator()(
+    Args&&...)
   {
     CUDF_FAIL("Non-arithmetic types not supported for exclusive scan");
   }
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index aa075cd5543..8ffcf85a492 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -122,14 +122,11 @@ struct scan_dispatcher {
   template <typename T>
   static constexpr bool is_supported()
   {
-    return std::is_arithmetic<T>::value || is_string_supported<T>() ||
-           std::is_same_v<T, __int128_t>;
+    return numeric::detail::is_arithmetic<T>::value || is_string_supported<T>();
   }
 
   // for arithmetic types
-  template <
-    typename T,
-    std::enable_if_t<std::is_arithmetic<T>::value || std::is_same_v<T, __int128_t>>* = nullptr>
+  template <typename T, std::enable_if_t<numeric::detail::is_arithmetic<T>()>* = nullptr>
   auto inclusive_scan(column_view const& input_view,
                       null_policy,
                       rmm::cuda_stream_view stream,
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index d974da7353a..d79e60bfb53 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -126,9 +126,7 @@ struct half_up_negative {
     return generic_round(e / n) * n;
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down = (e / n) * n;  // result from rounding down
@@ -145,9 +143,7 @@ struct half_even_zero {
     return generic_round_half_even(e);
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -166,9 +162,7 @@ struct half_even_positive {
     return integer_part + generic_round_half_even(fractional_part * n) / n;
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -185,9 +179,7 @@ struct half_even_negative {
     return generic_round_half_even(e / n) * n;
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down_over_n = e / n;            // use this to determine HALF_EVEN case

From 575fca771d18afd9ca8d41ffbed26706795674ca Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 15:58:17 +0000
Subject: [PATCH 024/112] Clean up

---
 cpp/src/round/round.cu | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index d79e60bfb53..e5a4961b3c1 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -86,9 +86,7 @@ struct half_up_zero {
     return generic_round(e);
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -107,10 +105,8 @@ struct half_up_positive {
     return integer_part + generic_round(fractional_part * n) / n;
   }
 
-  template <typename U                                                = T,
-            typename std::enable_if_t<std::is_integral<U>::value or
-                                      std::is_same_v<U, __int128_t>>* = nullptr>
-  __device__ U operator()(U)
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>>
+  * = nullptr > __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
     return U{};

From 85497531140b31ff148cc8f8c203e8295059deba Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 17:28:16 +0000
Subject: [PATCH 025/112] Fixes / cleanup

---
 cpp/src/quantiles/quantiles_util.hpp      | 6 +++++-
 cpp/src/reductions/scan/scan_inclusive.cu | 2 +-
 cpp/src/round/round.cu                    | 4 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 1df0a4ab41a..0ab047bf97c 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -18,6 +18,7 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/traits.hpp>
 
 namespace cudf {
 namespace detail {
@@ -152,7 +153,10 @@ select_quantile(ValueAccessor get_value, size_type size, double q, interpolation
   }
 }
 
-template <typename Result, typename Iterator>
+template <typename Result,
+          typename Iterator,
+          typename std::enable_if_t<not cudf::is_fixed_point<Result>()>* =
+            nullptr>  // TODO revisit if this is needed
 CUDA_HOST_DEVICE_CALLABLE Result
 select_quantile_data(Iterator begin, size_type size, double q, interpolation interp)
 {
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 8ffcf85a492..5ba500b10a7 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -122,7 +122,7 @@ struct scan_dispatcher {
   template <typename T>
   static constexpr bool is_supported()
   {
-    return numeric::detail::is_arithmetic<T>::value || is_string_supported<T>();
+    return numeric::detail::is_arithmetic<T>() || is_string_supported<T>();
   }
 
   // for arithmetic types
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index e5a4961b3c1..ab2acc91c9d 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -105,8 +105,8 @@ struct half_up_positive {
     return integer_part + generic_round(fractional_part * n) / n;
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>>
-  * = nullptr > __device__ U operator()(U)
+  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
     return U{};

From 22de55a7eed5517f8b6aee99ba29170e51c79a5b Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 19:52:48 +0000
Subject: [PATCH 026/112] DECIMAL128 custom reduction tests

---
 cpp/tests/reductions/reduction_tests.cpp | 45 ++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index da9032737f2..69718259d00 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1374,7 +1374,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionQuantile)
     for (auto const i : {0, 1, 2, 3, 4}) {
       auto const expected = decimalXX{scaled_integer<RepType>{i + 1, scale}};
       auto const result   = cudf::reduce(
-        column, cudf::make_quantile_aggregation({i / 4.0}, cudf::interpolation::LINEAR), out_type);
+          column, cudf::make_quantile_aggregation({i / 4.0}, cudf::interpolation::LINEAR), out_type);
       auto const result_scalar = static_cast<cudf::scalar_type_t<decimalXX>*>(result.get());
       EXPECT_EQ(result_scalar->fixed_point_value(), expected);
     }
@@ -1397,13 +1397,54 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNthElement)
     for (auto const i : {0, 1, 2, 3}) {
       auto const expected = decimalXX{scaled_integer<RepType>{values[i], scale}};
       auto const result   = cudf::reduce(
-        column, cudf::make_nth_element_aggregation(i, cudf::null_policy::INCLUDE), out_type);
+          column, cudf::make_nth_element_aggregation(i, cudf::null_policy::INCLUDE), out_type);
       auto const result_scalar = static_cast<cudf::scalar_type_t<decimalXX>*>(result.get());
       EXPECT_EQ(result_scalar->fixed_point_value(), expected);
     }
   }
 }
 
+struct Decimal128Only : public cudf::test::BaseFixture {
+};
+
+TEST_F(Decimal128Only, Decimal128ProductReduction)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3}) {
+    auto const scale    = scale_type{i};
+    auto const column   = fp_wrapper{{2, 2, 2, 2, 2, 2, 2, 2, 2}, scale};
+    auto const out_type = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+    auto const expected = decimal128{scaled_integer<RepType>{512, scale_type{i * 9}}};
+
+    auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+    auto const result_scalar = static_cast<cudf::scalar_type_t<decimal128>*>(result.get());
+
+    EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+  }
+}
+
+TEST_F(Decimal128Only, Decimal128ProductReduction2)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3, -4, -5, -6}) {
+    auto const scale    = scale_type{i};
+    auto const column   = fp_wrapper{{1, 2, 3, 4, 5, 6}, scale};
+    auto const out_type = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+    auto const expected = decimal128{scaled_integer<RepType>{720, scale_type{i * 6}}};
+
+    auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+    auto const result_scalar = static_cast<cudf::scalar_type_t<decimal128>*>(result.get());
+
+    EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+  }
+}
+
 TYPED_TEST(ReductionTest, NthElement)
 {
   using T = TypeParam;

From 5b69c0c82a41046ece38f25a7c41e5d7c6b2aa00 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 27 Jul 2021 22:03:05 +0000
Subject: [PATCH 027/112] Another REDUCTION test

---
 cpp/tests/reductions/reduction_tests.cpp | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 69718259d00..37dbb913781 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1416,9 +1416,9 @@ TEST_F(Decimal128Only, Decimal128ProductReduction)
   for (auto const i : {0, -1, -2, -3}) {
     auto const scale    = scale_type{i};
     auto const column   = fp_wrapper{{2, 2, 2, 2, 2, 2, 2, 2, 2}, scale};
-    auto const out_type = cudf::data_type{cudf::type_id::DECIMAL128, scale};
     auto const expected = decimal128{scaled_integer<RepType>{512, scale_type{i * 9}}};
 
+    auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
     auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
     auto const result_scalar = static_cast<cudf::scalar_type_t<decimal128>*>(result.get());
 
@@ -1435,9 +1435,9 @@ TEST_F(Decimal128Only, Decimal128ProductReduction2)
   for (auto const i : {0, -1, -2, -3, -4, -5, -6}) {
     auto const scale    = scale_type{i};
     auto const column   = fp_wrapper{{1, 2, 3, 4, 5, 6}, scale};
-    auto const out_type = cudf::data_type{cudf::type_id::DECIMAL128, scale};
     auto const expected = decimal128{scaled_integer<RepType>{720, scale_type{i * 6}}};
 
+    auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
     auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
     auto const result_scalar = static_cast<cudf::scalar_type_t<decimal128>*>(result.get());
 
@@ -1445,6 +1445,25 @@ TEST_F(Decimal128Only, Decimal128ProductReduction2)
   }
 }
 
+TEST_F(Decimal128Only, Decimal128ProductReduction3)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const values   = std::vector(127, -2);
+  auto const scale    = scale_type{0};
+  auto const column   = fp_wrapper{values.cbegin(), values.cend(), scale};
+  auto const lowest   = numeric::detail::numeric_limits::lowest<RepType>();
+  auto const expected = decimal128{scaled_integer<RepType>{lowest, scale}};
+
+  auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+  auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+  auto const result_scalar = static_cast<cudf::scalar_type_t<decimal128>*>(result.get());
+
+  EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+}
+
 TYPED_TEST(ReductionTest, NthElement)
 {
   using T = TypeParam;

From 95667c85da9ba7bf37ed93021cb804bf98a4edb7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 28 Jul 2021 21:00:00 +0000
Subject: [PATCH 028/112] numeric_limits / temporary cleanup

---
 .../detail/utilities/device_operators.cuh     | 20 ++-----------------
 cpp/include/cudf/fixed_point/temporary.hpp    | 19 ++++++------------
 2 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 90449982cc2..535f8d52ab4 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -99,15 +99,7 @@ struct DeviceMin {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (std::is_same_v<T, __int128_t>) {
-      __int128_t max = 1;
-      for (int i = 0; i < 126; ++i) {
-        max *= 2;
-      }
-      return max + (max - 1);
-    }
-
-    return std::numeric_limits<T>::max();
+    return numeric::detail::numeric_limits::max<T>();
   }
 
   template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
@@ -145,15 +137,7 @@ struct DeviceMax {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (std::is_same_v<T, __int128_t>) {
-      __int128_t lowest = -1;
-      for (int i = 0; i < 127; ++i) {
-        lowest *= 2;
-      }
-      return lowest;
-    }
-
-    return std::numeric_limits<T>::lowest();
+    return numeric::detail::numeric_limits::lowest<T>();
   }
 
   template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 8a33ec498ee..9af205d8bb4 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -16,34 +16,28 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
 //       Jitify is needed for several algorithms (binaryop, rolling, etc)
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>  // add cuda namespace
+#include <cuda/std/type_traits>
 
 #include <algorithm>
-#include <cassert>
-#include <cmath>
 #include <limits>
 #include <string>
-#include <type_traits>
 
 namespace numeric {
 namespace detail {
-namespace numeric_limits {  // TODO switch this to struct
+namespace numeric_limits {
 
 template <typename T>
-auto max() -> T
+static constexpr auto max() -> T
 {
   if constexpr (std::is_same_v<T, __int128_t>) {
     // 170,141,183,460,469,231,731,687,303,715,884,105,727
     __int128_t max = 1;
-    for (int i = 0; i < 126; ++i) {
+    for (int i = 0; i < 126; ++i)
       max *= 2;
-    }
     return max + (max - 1);
   }
 
@@ -51,14 +45,13 @@ auto max() -> T
 }
 
 template <typename T>
-auto lowest() -> T
+static constexpr auto lowest() -> T
 {
   if constexpr (std::is_same_v<T, __int128_t>) {
     // -170,141,183,460,469,231,731,687,303,715,884,105,728
     __int128_t lowest = -1;
-    for (int i = 0; i < 127; ++i) {
+    for (int i = 0; i < 127; ++i)
       lowest *= 2;
-    }
     return lowest;
   }
 

From 825ab86c183ccd4eb89858242262ffcd29921a54 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 29 Jul 2021 03:43:18 +0000
Subject: [PATCH 029/112] More changes, 10+ files

---
 .../cudf/detail/aggregation/aggregation.cuh       |  8 +++++---
 .../cudf/detail/utilities/hash_functions.cuh      | 14 ++++++++++++++
 cpp/include/cudf/fixed_point/temporary.hpp        |  3 ++-
 cpp/include/cudf/scalar/scalar.hpp                |  2 +-
 cpp/include/cudf/utilities/type_dispatcher.hpp    | 15 ++++++++++++---
 cpp/src/aggregation/aggregation.cpp               |  7 +++----
 cpp/src/io/orc/writer_impl.cu                     |  7 ++++---
 cpp/src/io/parquet/reader_impl.cu                 |  6 ++----
 cpp/src/io/parquet/writer_impl.cu                 |  4 ++++
 cpp/src/reductions/scan/scan.cuh                  |  2 +-
 cpp/src/reductions/simple.cuh                     |  2 +-
 cpp/src/transform/row_bit_count.cu                |  2 +-
 cpp/src/unary/cast_ops.cu                         |  8 ++++----
 13 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index af76f07af16..e05e83991cd 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -624,9 +624,11 @@ struct identity_initializer {
       if constexpr (cudf::is_timestamp<T>())
         return k == aggregation::ARGMAX ? T{typename T::duration(ARGMAX_SENTINEL)}
                                         : T{typename T::duration(ARGMIN_SENTINEL)};
-      else
-        return k == aggregation::ARGMAX ? static_cast<T>(ARGMAX_SENTINEL)
-                                        : static_cast<T>(ARGMIN_SENTINEL);
+      else {
+        using DeviceType = device_storage_type_t<T>;
+        return k == aggregation::ARGMAX ? static_cast<DeviceType>(ARGMAX_SENTINEL)
+                                        : static_cast<DeviceType>(ARGMIN_SENTINEL);
+      }
     }
     return identity_from_operator<T, k>();
   }
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index 6eab13ae9af..a28827b05d2 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -562,6 +562,13 @@ MurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) co
   return this->compute(key.value());
 }
 
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
 template <>
 hash_value_type CUDA_DEVICE_CALLABLE
 MurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
@@ -707,6 +714,13 @@ SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& ke
   return this->compute<uint64_t>(key.value());
 }
 
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute<__int128_t>(key.value());
+}
+
 template <>
 hash_value_type CUDA_DEVICE_CALLABLE
 SparkMurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 9af205d8bb4..12b10fee91d 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -71,7 +71,7 @@ auto to_string(T value) -> std::string
       value *= -1;
       if (value == detail::numeric_limits::max<__int128_t>())
         return "-170141183460469231731687303715884105728";
-      value += 1;  // can add back the one, not need to avoid overflow anymore
+      value += 1;  // can add back the one, no need to avoid overflow anymore
     }
     while (value) {
       s.push_back("0123456789"[value % 10]);
@@ -83,6 +83,7 @@ auto to_string(T value) -> std::string
   } else {
     return std::to_string(value);
   }
+  return std::string{};  // won't ever hit here, need to suppress warning though
 }
 
 template <typename T>
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 0e14b0c6bf5..7bf92fd6520 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -359,7 +359,7 @@ class fixed_point_scalar : public scalar {
   rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const;
 
   /**
-   * @brief Get the decimal32 or decimal64.
+   * @brief Get the decimal32, decimal64 or decimal128.
    *
    * @param stream CUDA stream used for device memory operations.
    */
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 07daf8d0417..e0e7254588d 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -85,8 +85,9 @@ using id_to_type = typename id_to_type_impl<Id>::type;
 /**
  * @brief "Returns" the corresponding type that is stored on the device when using `cudf::column`
  *
- * For `decimal32`, the storage type is an `int32_t`.
- * For `decimal64`, the storage type is an `int64_t`.
+ * For `decimal32`,  the storage type is an `int32_t`.
+ * For `decimal64`,  the storage type is an `int64_t`.
+ * For `decimal128`, the storage type is an `__int128_t`.
  *
  * Use this "type function" with the `using` type alias:
  * @code
@@ -113,13 +114,21 @@ using device_storage_type_t =
  * @return     `false` If T does not match the stored column `type_id`
  */
 template <typename T>
-bool type_id_matches_device_storage_type(type_id id)
+constexpr bool type_id_matches_device_storage_type(type_id id)
 {
   return (id == type_id::DECIMAL32 && std::is_same_v<T, int32_t>) ||
          (id == type_id::DECIMAL64 && std::is_same_v<T, int64_t>) ||
          (id == type_id::DECIMAL128 && std::is_same_v<T, __int128_t>) || id == type_to_id<T>();
 }
 
+// TODO docs
+constexpr bool is_fixed_point(cudf::type_id id)
+{
+  return id == type_id::DECIMAL32 or  //
+         id == type_id::DECIMAL64 or  //
+         id == type_id::DECIMAL128;
+}
+
 /**
  * @brief Macro used to define a mapping between a concrete C++ type and a
  *`cudf::type_id` enum.
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 687477658f5..f4628d500bb 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -631,10 +631,9 @@ struct target_type_functor {
   template <typename Source, aggregation::Kind k>
   constexpr data_type operator()() const noexcept
   {
-    auto const id = type_to_id<target_type_t<Source, k>>();
-    return id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128
-             ? data_type{id, type.scale()}
-             : data_type{id};
+    using Type    = target_type_t<Source, k>;
+    auto const id = type_to_id<Type>();
+    return cudf::is_fixed_point<Type>() ? data_type{id, type.scale()} : data_type{id};
   }
 };
 
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 7d7548c8858..9181a4dcc4c 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1270,9 +1270,10 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
                         col_idx = orc_col.index()] __device__(auto idx) {
                          auto const& col = d_cols[col_idx].cudf_column;
                          if (col.is_null(idx)) return 0u;
-                         int64_t const element   = (col.type().id() == type_id::DECIMAL32)
-                                                     ? col.element<int32_t>(idx)
-                                                     : col.element<int64_t>(idx);
+                         int64_t const element =
+                           col.type().id() == type_id::DECIMAL32   ? col.element<int32_t>(idx)
+                           : col.type().id() == type_id::DECIMAL64 ? col.element<int64_t>(idx)
+                                                                   : col.element<__int128_t>(idx);
                          int64_t const sign      = (element < 0) ? 1 : 0;
                          uint64_t zigzaged_value = ((element ^ -sign) * 2) + sign;
 
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index ec8041c933e..16dbcda958d 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -224,8 +224,7 @@ std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
 
   int8_t converted_type = converted;
   if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 &&
-      column_type_id != type_id::DECIMAL32 && column_type_id != type_id::DECIMAL64 &&
-      column_type_id != type_id::DECIMAL128) {
+      not cudf::is_fixed_point(column_type_id)) {
     converted_type = parquet::UNKNOWN;  // Not converting to float64 or decimal
   }
   return std::make_tuple(type_width, clock_rate, converted_type);
@@ -594,8 +593,7 @@ class aggregate_metadata {
     nesting.push_back(static_cast<int>(output_columns.size()));
     auto const col_type =
       to_type_id(schema, strings_to_categorical, timestamp_type_id, strict_decimal_types);
-    auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 ||
-                           col_type == type_id::DECIMAL128
+    auto const dtype = cudf::is_fixed_point(col_type)
                          ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}}
                          : data_type{col_type};
     output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false);
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 7c0ce03886d..b0ac3ccf4c7 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -339,6 +339,10 @@ struct leaf_schema_fn {
     } else if (std::is_same_v<T, numeric::decimal64>) {
       col_schema.type        = Type::INT64;
       col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
+    } else if (std::is_same_v<T, numeric::decimal128>) {
+      // TODO
+      // col_schema.type        = Type::INT64;
+      // col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
     } else {
       CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index 3853e34e97b..cba27c0cd54 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -67,7 +67,7 @@ std::unique_ptr<column> scan_agg_dispatch(const column_view& input,
     case aggregation::PRODUCT:
       // a product scan on a decimal type with non-zero scale would result in each element having
       // a different scale, and because scale is stored once per column, this is not possible
-      if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64 cannot support product scan");
+      if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64/128 cannot support product scan");
       return type_dispatcher<dispatch_storage_type>(
         input.type(), DispatchFn<DeviceProduct>(), input, null_handling, stream, mr);
     case aggregation::RANK: return inclusive_rank_scan(input, stream, mr);
diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index 5aa42dbda74..6cbfb220a9d 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -74,7 +74,7 @@ std::unique_ptr<scalar> simple_reduction(column_view const& col,
 /**
  * @brief Reduction for `sum`, `product`, `min` and `max` for decimal types
  *
- * @tparam DecimalXX  The `decimal32` or `decimal64` type
+ * @tparam DecimalXX  The `decimal32`, `decimal64` or `decimal128` type
  * @tparam Op         The operator of cudf::reduction::op::
  * @param col         Input column of data to reduce
 
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 620504f5c93..a129fc56846 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -205,7 +205,7 @@ struct flatten_functor {
                   thrust::optional<int> parent_index)
   {
     // track branch depth as we reach this list and after we pass it
-    size_type const branch_depth_start = cur_branch_depth;
+    auto const branch_depth_start = cur_branch_depth;
     auto const is_list_inside_struct =
       parent_index && out[parent_index.value()].type().id() == type_id::STRUCT;
     if (is_list_inside_struct) {
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index d01d0a8cbbc..f201667cfd0 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -160,7 +160,7 @@ struct device_cast {
  * @brief Takes a `fixed_point` column_view as @p input and returns a `fixed_point` column with new
  * @p scale
  *
- * @tparam T     Type of the `fixed_point` column_view (`decimal32` or `decimal64`)
+ * @tparam T     Type of the `fixed_point` column_view (`decimal32`, `decimal64` or `decimal128`)
  * @param input  Input `column_view`
  * @param scale  `scale` of the returned `column`
  * @param mr     Device memory resource used to allocate the returned column's device memory
@@ -338,9 +338,9 @@ struct dispatch_unary_cast_to {
 
   {
     if (!cudf::is_fixed_width<TargetT>())
-      CUDF_FAIL("Column type must be numeric or chrono or decimal32/64");
+      CUDF_FAIL("Column type must be numeric or chrono or decimal32/64/128");
     else if (cudf::is_fixed_point<SourceT>())
-      CUDF_FAIL("Currently only decimal32/64 to floating point/integral is supported");
+      CUDF_FAIL("Currently only decimal32/64/128 to floating point/integral is supported");
     else if (cudf::is_timestamp<SourceT>() && is_numeric<TargetT>())
       CUDF_FAIL("Timestamps can be created only from duration");
     else
@@ -364,7 +364,7 @@ struct dispatch_unary_cast_from {
   template <typename T, typename... Args>
   std::enable_if_t<!cudf::is_fixed_width<T>(), std::unique_ptr<column>> operator()(Args&&...)
   {
-    CUDF_FAIL("Column type must be numeric or chrono or decimal32/64");
+    CUDF_FAIL("Column type must be numeric or chrono or decimal32/64/128");
   }
 };
 }  // anonymous namespace

From 321761c6e5e6ded79edffc93a28754d544ff8e84 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 29 Jul 2021 15:54:17 +0000
Subject: [PATCH 030/112] Fix for TRANSFORM_TEST

---
 cpp/tests/transform/row_bit_count_test.cu | 58 ++++++++++-------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 0081cf0d467..ccae898cd2e 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -81,16 +81,13 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_list_column()
 {
   using LCW                     = cudf::test::lists_column_wrapper<T, int>;
   constexpr size_type type_size = sizeof(device_storage_type_t<T>) * CHAR_BIT;
-
-  // clang-format off
-  cudf::test::lists_column_wrapper<T, int> col{ {{1, 2}, {3, 4, 5}}, 
-                                                LCW{LCW{}}, 
-                                                {LCW{10}},
-                                                {{6, 7, 8}, {9}},
-                                                {{-1, -2}, {-3, -4}},
-                                                {{-5, -6, -7}, {-8, -9}} };
-  // clang-format on
-
+  cudf::test::fixed_width_column_wrapper<T> values{
+    1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9};
+  cudf::test::fixed_width_column_wrapper<offset_type> inner_offsets{
+    0, 2, 5, 6, 9, 10, 12, 14, 17, 19};
+  auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {});
+  cudf::test::fixed_width_column_wrapper<offset_type> outer_offsets{0, 2, 2, 3, 5, 7, 9};
+  auto col = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {});
   // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf
   cudf::test::fixed_width_column_wrapper<size_type> expected{
     ((4 + 8) * CHAR_BIT) + (type_size * 5),
@@ -99,8 +96,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_list_column()
     ((4 + 8) * CHAR_BIT) + (type_size * 4),
     ((4 + 8) * CHAR_BIT) + (type_size * 4),
     ((4 + 8) * CHAR_BIT) + (type_size * 5)};
-
-  return {col.release(), expected.release()};
+  return {std::move(col), expected.release()};
 }
 
 TYPED_TEST(RowBitCountTyped, Lists)
@@ -119,22 +115,21 @@ TYPED_TEST(RowBitCountTyped, Lists)
 
 TYPED_TEST(RowBitCountTyped, ListsWithNulls)
 {
-  using T                       = TypeParam;
-  using LCW                     = cudf::test::lists_column_wrapper<T, int>;
-  constexpr size_type type_size = sizeof(device_storage_type_t<T>) * CHAR_BIT;
-
-  std::vector<bool> valids{true, false, true};
-  std::vector<bool> valids2{false, true, false};
-  std::vector<bool> valids3{true, false};
+  using T   = TypeParam;
+  using LCW = cudf::test::lists_column_wrapper<T, int>;
 
-  // clang-format off
-  cudf::test::lists_column_wrapper<T, int> col{ {{1, 2}, {{3, 4, 5}, valids.begin()}}, 
-                                                LCW{LCW{}}, 
-                                                {LCW{10}}, 
-                                                {{{{6, 7, 8}, valids2.begin()}, {9}}, valids3.begin()} };
-  // clang-format on
-
-  table_view t({col});
+  constexpr size_type type_size = sizeof(device_storage_type_t<T>) * CHAR_BIT;
+  cudf::test::fixed_width_column_wrapper<T> values{{1, 2, 3, 4, 5, 10, 6, 7, 8},
+                                                   {1, 1, 1, 0, 1, 1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<offset_type> inner_offsets{0, 2, 5, 6, 9, 9};
+  std::vector<bool> inner_validity{1, 1, 1, 1, 0};
+  auto inner_null_mask =
+    cudf::test::detail::make_null_mask(inner_validity.begin(), inner_validity.end());
+  auto inner_list = cudf::make_lists_column(
+    5, inner_offsets.release(), values.release(), 1, std::move(inner_null_mask));
+  cudf::test::fixed_width_column_wrapper<offset_type> outer_offsets{0, 2, 2, 3, 5};
+  auto col = cudf::make_lists_column(4, outer_offsets.release(), std::move(inner_list), 0, {});
+  table_view t({*col});
   auto result = cudf::row_bit_count(t);
 
   // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf + validity
@@ -144,7 +139,6 @@ TYPED_TEST(RowBitCountTyped, ListsWithNulls)
     ((4 + 0) * CHAR_BIT) + (type_size * 0),
     ((4 + 4) * CHAR_BIT) + (type_size * 1) + 2,
     ((4 + 8) * CHAR_BIT) + (type_size * 3) + 5};
-
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
@@ -430,10 +424,10 @@ TEST_F(RowBitCount, NestedTypes)
                                                                l4_offsets.end());
     auto const l4_size = l4_offsets.size() - 1;
     auto l4            = cudf::make_lists_column(static_cast<cudf::size_type>(l4_size),
-                                      l4_offsets_col.release(),
-                                      innermost_struct.release(),
-                                      cudf::UNKNOWN_NULL_COUNT,
-                                      rmm::device_buffer{});
+                                                 l4_offsets_col.release(),
+                                                 innermost_struct.release(),
+                                                 cudf::UNKNOWN_NULL_COUNT,
+                                                 rmm::device_buffer{});
 
     // inner struct
     std::vector<std::unique_ptr<column>> inner_struct_children;

From 02b00444b878fe1f7c3a80dd85323d105d6e6d90 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 29 Jul 2021 20:57:27 +0000
Subject: [PATCH 031/112] Rename FixedPointTestBothReps to FixedPointTestAllReps

---
 cpp/tests/binaryop/binop-integration-test.cpp | 76 +++++++++----------
 cpp/tests/copying/concatenate_tests.cu        | 20 ++---
 cpp/tests/copying/scatter_tests.cpp           |  6 +-
 cpp/tests/fixed_point/fixed_point_tests.cpp   | 38 +++++-----
 cpp/tests/fixed_point/fixed_point_tests.cu    |  6 +-
 cpp/tests/groupby/count_scan_tests.cpp        |  6 +-
 cpp/tests/groupby/count_tests.cpp             |  6 +-
 cpp/tests/groupby/max_scan_tests.cpp          |  6 +-
 cpp/tests/groupby/max_tests.cpp               |  8 +-
 cpp/tests/groupby/min_scan_tests.cpp          |  6 +-
 cpp/tests/groupby/min_tests.cpp               |  8 +-
 cpp/tests/groupby/sum_scan_tests.cpp          |  6 +-
 cpp/tests/groupby/sum_tests.cpp               |  8 +-
 cpp/tests/merge/merge_test.cpp                | 10 +--
 cpp/tests/reductions/reduction_tests.cpp      | 38 +++++-----
 cpp/tests/replace/replace_tests.cpp           |  6 +-
 .../reshape/interleave_columns_tests.cpp      |  6 +-
 cpp/tests/search/search_test.cpp              |  8 +-
 cpp/tests/sort/sort_test.cpp                  |  6 +-
 19 files changed, 137 insertions(+), 137 deletions(-)

diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 68a8845132b..f31e3ebd50e 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2018,14 +2018,14 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2034,7 +2034,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd)
 
   auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
     return decimalXX{i, scale_type{0}};
-  });
+       });
   auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
   auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
   auto expected   = std::vector<decimalXX>(sz);
@@ -2058,7 +2058,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2067,7 +2067,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply)
 
   auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
     return decimalXX{i, scale_type{0}};
-  });
+       });
   auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
   auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
   auto expected   = std::vector<decimalXX>(sz);
@@ -2094,7 +2094,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply)
 template <typename T>
 using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply2)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2113,7 +2113,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2132,7 +2132,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv2)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2151,7 +2151,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv3)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2168,7 +2168,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv4)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2188,7 +2188,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd2)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2207,7 +2207,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd3)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2226,7 +2226,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd4)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2243,7 +2243,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd5)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2260,7 +2260,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd6)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2279,7 +2279,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointCast)
+TYPED_TEST(FixedPointTestAllReps, FixedPointCast)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2293,7 +2293,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointCast)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiplyScalar)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2310,7 +2310,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpSimplePlus)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2329,7 +2329,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimple)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2346,7 +2346,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2362,7 +2362,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0Null)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2378,7 +2378,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale2Null)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2394,7 +2394,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2438,7 +2438,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMaxSimple)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2458,7 +2458,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMinSimple)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2478,7 +2478,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullEqualsSimple)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2495,7 +2495,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2511,7 +2511,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div2)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2527,7 +2527,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div3)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2543,7 +2543,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div4)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2559,7 +2559,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div6)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2576,7 +2576,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div7)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2593,7 +2593,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div8)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2609,7 +2609,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div9)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2625,7 +2625,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div10)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2641,7 +2641,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div11)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2657,7 +2657,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows)
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu
index 7d3b7beb2cb..74f0688a38d 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -363,7 +363,7 @@ TEST_F(OverflowTest, OverflowTest)
     auto offsets    = cudf::test::fixed_width_column_wrapper<offset_type>{0, size};
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size);
     auto col        = cudf::make_strings_column(
-      1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
+             1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
 
     table_view tbl({*col});
     EXPECT_THROW(cudf::concatenate(std::vector<table_view>({tbl, tbl, tbl, tbl, tbl, tbl})),
@@ -378,7 +378,7 @@ TEST_F(OverflowTest, OverflowTest)
     auto many_offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, size + 1);
     auto chars        = cudf::test::fixed_width_column_wrapper<int8_t>{0, 1, 2};
     auto col          = cudf::make_strings_column(
-      size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{});
+               size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{});
 
     table_view tbl({*col});
     EXPECT_THROW(cudf::concatenate(std::vector<table_view>({tbl, tbl, tbl, tbl, tbl, tbl})),
@@ -486,7 +486,7 @@ TEST_F(OverflowTest, Presliced)
     cudf::test::fixed_width_column_wrapper<int> offsets(offset_gen, offset_gen + num_rows + 1);
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows);
     auto col        = cudf::make_strings_column(
-      num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
+             num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::split(*col, {(num_rows / 2) - 1});
 
@@ -517,7 +517,7 @@ TEST_F(OverflowTest, Presliced)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows);
     auto col        = cudf::make_strings_column(
-      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     // should pass (with 2 rows to spare)
     // leaving this disabled as it typically runs out of memory on a T4
@@ -686,7 +686,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto col        = cudf::make_strings_column(
-      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::slice(*col, {16, 32});
 
@@ -714,7 +714,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto col        = cudf::make_lists_column(
-      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::slice(*col, {16, 32});
 
@@ -742,7 +742,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto list_col   = cudf::make_lists_column(
-      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+        num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     // struct
     std::vector<std::unique_ptr<column>> children;
@@ -1535,15 +1535,15 @@ TEST_F(ListsColumnTest, ListOfStructs)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 struct FixedPointTest : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointConcatentate)
+TYPED_TEST(FixedPointTestAllReps, FixedPointConcatentate)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp
index be4a689f213..b0b942b57b8 100644
--- a/cpp/tests/copying/scatter_tests.cpp
+++ b/cpp/tests/copying/scatter_tests.cpp
@@ -899,14 +899,14 @@ TEST_F(BooleanMaskScatterScalarFails, NumberOfColumnAndScalarMismatch)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointScatter)
+TYPED_TEST(FixedPointTestAllReps, FixedPointScatter)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index 47b2a95e7b5..a90e0f0f541 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -35,14 +35,14 @@ struct FixedPointTest : public cudf::test::BaseFixture {
 };
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 using RepresentationTypes = ::testing::Types<int32_t, int64_t>;
 
-TYPED_TEST_CASE(FixedPointTestBothReps, RepresentationTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, RepresentationTypes);
 
-TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXConstruction)
+TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -63,7 +63,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXConstruction)
   EXPECT_EQ(1.234567, static_cast<double>(num6));
 }
 
-TYPED_TEST(FixedPointTestBothReps, SimpleNegativeDecimalXXConstruction)
+TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -84,7 +84,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleNegativeDecimalXXConstruction)
   EXPECT_EQ(-1.234567, static_cast<double>(num6));
 }
 
-TYPED_TEST(FixedPointTestBothReps, PaddedDecimalXXConstruction)
+TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -109,7 +109,7 @@ TYPED_TEST(FixedPointTestBothReps, PaddedDecimalXXConstruction)
   EXPECT_EQ(0.000123, static_cast<double>(y));
 }
 
-TYPED_TEST(FixedPointTestBothReps, SimpleBinaryFPConstruction)
+TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction)
 {
   using binary_fp = fixed_point<TypeParam, Radix::BASE_2>;
 
@@ -138,7 +138,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleBinaryFPConstruction)
   EXPECT_EQ(1.4375, static_cast<double>(num9));
 }
 
-TYPED_TEST(FixedPointTestBothReps, MoreSimpleBinaryFPConstruction)
+TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction)
 {
   using binary_fp = fixed_point<TypeParam, Radix::BASE_2>;
 
@@ -149,7 +149,7 @@ TYPED_TEST(FixedPointTestBothReps, MoreSimpleBinaryFPConstruction)
   EXPECT_EQ(2.0625, static_cast<double>(num1));
 }
 
-TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXMath)
+TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -174,7 +174,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXMath)
   EXPECT_EQ(a - b, a);
 }
 
-TYPED_TEST(FixedPointTestBothReps, ComparisonOperators)
+TYPED_TEST(FixedPointTestAllReps, ComparisonOperators)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -193,7 +193,7 @@ TYPED_TEST(FixedPointTestBothReps, ComparisonOperators)
   EXPECT_TRUE(SIX / TWO >= ONE);
 }
 
-TYPED_TEST(FixedPointTestBothReps, DecimalXXTrickyDivision)
+TYPED_TEST(FixedPointTestAllReps, DecimalXXTrickyDivision)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -223,7 +223,7 @@ TYPED_TEST(FixedPointTestBothReps, DecimalXXTrickyDivision)
   EXPECT_EQ(static_cast<int32_t>(n), 20);
 }
 
-TYPED_TEST(FixedPointTestBothReps, DecimalXXRounding)
+TYPED_TEST(FixedPointTestAllReps, DecimalXXRounding)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -251,7 +251,7 @@ TYPED_TEST(FixedPointTestBothReps, DecimalXXRounding)
   EXPECT_TRUE(FIVE_0 * THREE_0 != TEN_1);
 }
 
-TYPED_TEST(FixedPointTestBothReps, ArithmeticWithDifferentScales)
+TYPED_TEST(FixedPointTestAllReps, ArithmeticWithDifferentScales)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -276,7 +276,7 @@ TYPED_TEST(FixedPointTestBothReps, ArithmeticWithDifferentScales)
   EXPECT_EQ(c - d, zz);
 }
 
-TYPED_TEST(FixedPointTestBothReps, RescaledTest)
+TYPED_TEST(FixedPointTestAllReps, RescaledTest)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -296,7 +296,7 @@ TYPED_TEST(FixedPointTestBothReps, RescaledTest)
   EXPECT_EQ(num5, num6.rescaled(scale_type{-5}));
 }
 
-TYPED_TEST(FixedPointTestBothReps, RescaledRounding)
+TYPED_TEST(FixedPointTestAllReps, RescaledRounding)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -311,7 +311,7 @@ TYPED_TEST(FixedPointTestBothReps, RescaledRounding)
   EXPECT_EQ(-1000, static_cast<TypeParam>(num3.rescaled(scale_type{3})));
 }
 
-TYPED_TEST(FixedPointTestBothReps, BoolConversion)
+TYPED_TEST(FixedPointTestAllReps, BoolConversion)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
@@ -468,7 +468,7 @@ struct cast_to_int32_fn {
   int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast<int32_t>(fp); }
 };
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointColumnWrapper)
+TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper)
 {
   using namespace numeric;
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
@@ -489,7 +489,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointColumnWrapper)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, w);
 }
 
-TYPED_TEST(FixedPointTestBothReps, NoScaleOrWrongTypeID)
+TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID)
 {
   auto null_mask = cudf::create_null_mask(0, cudf::mask_state::ALL_NULL);
 
@@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestBothReps, NoScaleOrWrongTypeID)
     cudf::logic_error);
 }
 
-TYPED_TEST(FixedPointTestBothReps, SimpleFixedPointColumnWrapper)
+TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper)
 {
   using RepType = cudf::device_storage_type_t<TypeParam>;
 
@@ -554,7 +554,7 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
+TYPED_TEST(FixedPointTestAllReps, ExtremelyLargeNegativeScale)
 {
   // This is testing fixed_point values with an extremely large negative scale. The fixed_point
   // implementation should be able to handle any scale representable by an int32_t
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 2627ab6d48d..78101a3b1fe 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -40,14 +40,14 @@ struct FixedPointTest : public cudf::test::BaseFixture {
 };
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 using RepresentationTypes = ::testing::Types<int32_t, int64_t, __int128_t>;
 
-TYPED_TEST_CASE(FixedPointTestBothReps, RepresentationTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, RepresentationTypes);
 
-TYPED_TEST(FixedPointTestBothReps, DecimalXXThrust)
+TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp
index 9740bfa1954..8ca73e03bbc 100644
--- a/cpp/tests/groupby/count_scan_tests.cpp
+++ b/cpp/tests/groupby/count_scan_tests.cpp
@@ -156,12 +156,12 @@ TEST_F(groupby_count_scan_string_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupByCountScan)
+TYPED_TEST(FixedPointTestAllReps, GroupByCountScan)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp
index 2d45de04607..2d695957326 100644
--- a/cpp/tests/groupby/count_tests.cpp
+++ b/cpp/tests/groupby/count_tests.cpp
@@ -169,12 +169,12 @@ TEST_F(groupby_count_string_test, basic)
 // clang-format on
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupByCount)
+TYPED_TEST(FixedPointTestAllReps, GroupByCount)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index 70a48da69e8..962fdcde51a 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -129,12 +129,12 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxScanDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index b5710d3f4bc..eb000cb73df 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -224,12 +224,12 @@ TEST_F(groupby_dictionary_max_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -253,7 +253,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp
index ef548407761..e4f2091781e 100644
--- a/cpp/tests/groupby/min_scan_tests.cpp
+++ b/cpp/tests/groupby/min_scan_tests.cpp
@@ -143,12 +143,12 @@ TEST_F(groupby_min_scan_string_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMinScanDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 1544e867595..161c69714ae 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -224,12 +224,12 @@ TEST_F(groupby_dictionary_min_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -252,7 +252,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index 2f1928747ae..af8e8ff2eb4 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -133,12 +133,12 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX      = TypeParam;
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 458937ff2e4..9f4aaa1336f 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -152,12 +152,12 @@ TYPED_TEST(groupby_sum_test, dictionary)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX    = TypeParam;
@@ -187,7 +187,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX    = TypeParam;
diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp
index 452f3adfdbb..de6eefb989a 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -453,7 +453,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns)
     } else {
       return row * 2;
     }
-  });
+        });
   auto valid_sequence1 = cudf::detail::make_counting_transform_iterator(
     0, [inputRows](auto row) { return (row < inputRows - 1); });
   cudf::test::fixed_width_column_wrapper<TypeParam, typename decltype(sequence1)::value_type>
@@ -698,7 +698,7 @@ TEST_F(MergeTest, KeysWithNulls)
   cudf::size_type nrows = 13200;  // Ensures that thrust::merge uses more than one tile/block
   auto data_iter        = thrust::make_counting_iterator<int32_t>(0);
   auto valids1          = cudf::detail::make_counting_transform_iterator(
-    0, [](auto row) { return (row % 10 == 0) ? false : true; });
+             0, [](auto row) { return (row % 10 == 0) ? false : true; });
   cudf::test::fixed_width_column_wrapper<int32_t> data1(data_iter, data_iter + nrows, valids1);
   auto valids2 = cudf::detail::make_counting_transform_iterator(
     0, [](auto row) { return (row % 15 == 0) ? false : true; });
@@ -874,15 +874,15 @@ TEST_F(MergeTest, StructsNestedWithNulls)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointMerge)
+TYPED_TEST(FixedPointTestAllReps, FixedPointMerge)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 37dbb913781..9a919c63d28 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1041,12 +1041,12 @@ TYPED_TEST(ReductionTest, UniqueCount)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductZeroScale)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProductZeroScale)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1070,7 +1070,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductZeroScale)
   EXPECT_EQ(result_fp, _24);
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProduct)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProduct)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1090,7 +1090,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProduct)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductWithNulls)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProductWithNulls)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1110,7 +1110,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductWithNulls)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSum)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSum)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1131,7 +1131,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSum)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumAlternate)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumAlternate)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1155,7 +1155,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumAlternate)
   EXPECT_EQ(result_scalar->fixed_point_value(), TEN);
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumFractional)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumFractional)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1175,7 +1175,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumFractional)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumLarge)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumLarge)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1198,7 +1198,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumLarge)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMin)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMin)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1218,7 +1218,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMin)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMinLarge)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMinLarge)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1239,7 +1239,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMinLarge)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMax)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMax)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1259,7 +1259,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMax)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMaxLarge)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMaxLarge)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1280,7 +1280,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMaxLarge)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNUnique)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionNUnique)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1299,7 +1299,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNUnique)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumOfSquares)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumOfSquares)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1319,7 +1319,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumOfSquares)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianOddNumberOfElements)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMedianOddNumberOfElements)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1339,7 +1339,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianOddNumberOfElements)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianEvenNumberOfElements)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMedianEvenNumberOfElements)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1359,7 +1359,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianEvenNumberOfElements
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionQuantile)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionQuantile)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -1381,7 +1381,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionQuantile)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNthElement)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionNthElement)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp
index 58ef08f6052..70da4aaf4d4 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -539,14 +539,14 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReplace)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReplace)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp
index 386fd9d08ee..d1e97bb3e84 100644
--- a/cpp/tests/reshape/interleave_columns_tests.cpp
+++ b/cpp/tests/reshape/interleave_columns_tests.cpp
@@ -345,12 +345,12 @@ TEST_F(InterleaveStringsColumnsTest, MultiColumnStringMixNullableMix)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointInterleave)
+TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp
index 38fc5abb250..df340c772ed 100644
--- a/cpp/tests/search/search_test.cpp
+++ b/cpp/tests/search/search_test.cpp
@@ -1817,12 +1817,12 @@ TEST_F(SearchTest, multi_contains_empty_input_set_string)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointLowerBound)
+TYPED_TEST(FixedPointTestAllReps, FixedPointLowerBound)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1846,7 +1846,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointLowerBound)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect);
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointUpperBound)
+TYPED_TEST(FixedPointTestAllReps, FixedPointUpperBound)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 6e668068f94..48cab98cb3e 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -673,14 +673,14 @@ TEST_F(SortByKey, ValueKeysSizeMismatch)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointSortedOrderGather)
+TYPED_TEST(FixedPointTestAllReps, FixedPointSortedOrderGather)
 {
   using namespace numeric;
   using decimalXX = TypeParam;

From 95a107c717a295a3d15bbf997c077f72c5d78678 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 3 Aug 2021 20:47:16 +0000
Subject: [PATCH 032/112] test group_by for only decimal32/64

---
 cpp/tests/groupby/max_tests.cpp      | 9 +++++----
 cpp/tests/groupby/min_tests.cpp      | 9 +++++----
 cpp/tests/groupby/sum_scan_tests.cpp | 7 ++++---
 cpp/tests/groupby/sum_tests.cpp      | 9 +++++----
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index eb000cb73df..cfc15a8fe56 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -224,12 +224,13 @@ TEST_F(groupby_dictionary_max_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestAllReps : public cudf::test::BaseFixture {
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
+using RepTypes = ::testing::Types<int32_t, int64_t>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
 
-TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -253,7 +254,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestAllReps, GroupByHashMaxDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 161c69714ae..e297f21afe8 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -224,12 +224,13 @@ TEST_F(groupby_dictionary_min_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestAllReps : public cudf::test::BaseFixture {
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
+using RepTypes = ::testing::Types<int32_t, int64_t>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
 
-TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -252,7 +253,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestAllReps, GroupByHashMinDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index af8e8ff2eb4..85a038af678 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -133,12 +133,13 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
 }
 
 template <typename T>
-struct FixedPointTestAllReps : public cudf::test::BaseFixture {
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
+using RepTypes = ::testing::Types<int32_t, int64_t>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
 
-TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumScanDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX      = TypeParam;
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 9f4aaa1336f..27c63c3baef 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -152,12 +152,13 @@ TYPED_TEST(groupby_sum_test, dictionary)
 }
 
 template <typename T>
-struct FixedPointTestAllReps : public cudf::test::BaseFixture {
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
+using RepTypes = ::testing::Types<int32_t, int64_t>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
 
-TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX    = TypeParam;
@@ -187,7 +188,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
   }
 }
 
-TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue)
+TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashSumDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX    = TypeParam;

From 0d8aa3640e9401a3207b4d5ac55a2649c84c48be Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 4 Aug 2021 02:34:34 +0000
Subject: [PATCH 033/112] Using cuda::std:: for utility functions

---
 .../detail/utilities/device_operators.cuh     |  4 +-
 .../cudf/detail/utilities/integer_utils.hpp   |  2 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  4 +-
 cpp/include/cudf/fixed_point/temporary.hpp    | 53 +------------------
 cpp/include/cudf/utilities/traits.hpp         |  2 +-
 cpp/src/reductions/scan/scan_exclusive.cu     |  4 +-
 cpp/src/reductions/scan/scan_inclusive.cu     |  4 +-
 cpp/src/round/round.cu                        | 12 ++---
 cpp/src/strings/convert/utilities.cuh         |  4 +-
 cpp/tests/reductions/reduction_tests.cpp      |  2 +-
 10 files changed, 21 insertions(+), 70 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 535f8d52ab4..f8792061612 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -99,7 +99,7 @@ struct DeviceMin {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    return numeric::detail::numeric_limits::max<T>();
+    return cuda::std::numeric_limits<T>::max();
   }
 
   template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
@@ -137,7 +137,7 @@ struct DeviceMax {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    return numeric::detail::numeric_limits::lowest<T>();
+    return cuda::std::numeric_limits<T>::lowest();
   }
 
   template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 365ee1e91f4..ddedab3944c 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -155,7 +155,7 @@ constexpr inline bool is_a_power_of_two(I val) noexcept
 template <typename T>
 constexpr inline auto absolute_value(T value) -> T
 {
-  if constexpr (numeric::detail::is_signed<T>()) return numeric::detail::abs(value);
+  if constexpr (cuda::std::is_signed<T>()) return numeric::detail::abs(value);
   return value;
 }
 
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index f4254ffe4ba..930fef40747 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -57,7 +57,7 @@ constexpr inline auto is_supported_representation_type()
 template <typename T>
 constexpr inline auto is_supported_construction_value_type()
 {
-  return numeric::detail::is_integral<T>() || cuda::std::is_floating_point<T>::value;
+  return cuda::std::is_integral<T>() || cuda::std::is_floating_point<T>::value;
 }
 
 // Helper functions for `fixed_point` type
@@ -279,7 +279,7 @@ class fixed_point {
    * @return The `fixed_point` number in base 10 (aka human readable format)
    */
   template <typename U,
-            typename cuda::std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+            typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   explicit constexpr operator U() const
   {
     // Don't cast to U until converting to Rep because in certain cases casting to U before shifting
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 12b10fee91d..49c83090da7 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -20,45 +20,14 @@
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
 //       Jitify is needed for several algorithms (binaryop, rolling, etc)
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
 #include <algorithm>
-#include <limits>
 #include <string>
 
 namespace numeric {
 namespace detail {
-namespace numeric_limits {
-
-template <typename T>
-static constexpr auto max() -> T
-{
-  if constexpr (std::is_same_v<T, __int128_t>) {
-    // 170,141,183,460,469,231,731,687,303,715,884,105,727
-    __int128_t max = 1;
-    for (int i = 0; i < 126; ++i)
-      max *= 2;
-    return max + (max - 1);
-  }
-
-  return std::numeric_limits<T>::max();
-}
-
-template <typename T>
-static constexpr auto lowest() -> T
-{
-  if constexpr (std::is_same_v<T, __int128_t>) {
-    // -170,141,183,460,469,231,731,687,303,715,884,105,728
-    __int128_t lowest = -1;
-    for (int i = 0; i < 127; ++i)
-      lowest *= 2;
-    return lowest;
-  }
-
-  return std::numeric_limits<T>::lowest();
-}
-
-}  // namespace numeric_limits
 
 template <typename T>
 auto to_string(T value) -> std::string
@@ -69,7 +38,7 @@ auto to_string(T value) -> std::string
     if (sign) {
       value += 1;  // avoid overflowing if value == _int128_t lowest
       value *= -1;
-      if (value == detail::numeric_limits::max<__int128_t>())
+      if (value == cuda::std::numeric_limits<__int128_t>::max())
         return "-170141183460469231731687303715884105728";
       value += 1;  // can add back the one, no need to avoid overflow anymore
     }
@@ -104,23 +73,5 @@ CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs)
   return lhs > rhs ? lhs : rhs;
 }
 
-template <typename T>
-constexpr auto is_signed()
-{
-  return std::is_signed<T>::value || std::is_same_v<T, __int128_t>;
-}
-
-template <typename T>
-constexpr auto is_integral()
-{
-  return cuda::std::is_integral<T>::value || cuda::std::is_same_v<T, __int128_t>;
-}
-
-template <typename T>
-constexpr auto is_arithmetic()
-{
-  return numeric::detail::is_integral<T>() || cuda::std::is_floating_point_v<T>;
-}
-
 }  // namespace detail
 }  // namespace numeric
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index dbb06865f20..388a2e8aace 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -152,7 +152,7 @@ constexpr inline bool is_equality_comparable()
 template <typename T>
 constexpr inline bool is_numeric()
 {
-  return numeric::detail::is_integral<T>() or std::is_floating_point<T>::value;
+  return cuda::std::is_integral<T>() or std::is_floating_point<T>::value;
 }
 
 struct is_numeric_impl {
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 5c3810743a9..200ba5a7a15 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -50,7 +50,7 @@ struct scan_dispatcher {
    * @param mr Device memory resource used to allocate the returned column's device memory
    * @return Output column with scan results
    */
-  template <typename T, typename std::enable_if_t<numeric::detail::is_arithmetic<T>()>* = nullptr>
+  template <typename T, typename std::enable_if_t<cuda::std::is_arithmetic<T>::value>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      null_policy,
                                      rmm::cuda_stream_view stream,
@@ -72,7 +72,7 @@ struct scan_dispatcher {
   }
 
   template <typename T, typename... Args>
-  std::enable_if_t<not numeric::detail::is_arithmetic<T>(), std::unique_ptr<column>> operator()(
+  std::enable_if_t<not cuda::std::is_arithmetic<T>::value, std::unique_ptr<column>> operator()(
     Args&&...)
   {
     CUDF_FAIL("Non-arithmetic types not supported for exclusive scan");
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 5ba500b10a7..ef804f244e9 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -122,11 +122,11 @@ struct scan_dispatcher {
   template <typename T>
   static constexpr bool is_supported()
   {
-    return numeric::detail::is_arithmetic<T>() || is_string_supported<T>();
+    return cuda::std::is_arithmetic<T>() || is_string_supported<T>();
   }
 
   // for arithmetic types
-  template <typename T, std::enable_if_t<numeric::detail::is_arithmetic<T>()>* = nullptr>
+  template <typename T, std::enable_if_t<cuda::std::is_arithmetic<T>::value>* = nullptr>
   auto inclusive_scan(column_view const& input_view,
                       null_policy,
                       rmm::cuda_stream_view stream,
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index ab2acc91c9d..3a6a2beda45 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -86,7 +86,7 @@ struct half_up_zero {
     return generic_round(e);
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -105,7 +105,7 @@ struct half_up_positive {
     return integer_part + generic_round(fractional_part * n) / n;
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -122,7 +122,7 @@ struct half_up_negative {
     return generic_round(e / n) * n;
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down = (e / n) * n;  // result from rounding down
@@ -139,7 +139,7 @@ struct half_even_zero {
     return generic_round_half_even(e);
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -158,7 +158,7 @@ struct half_even_positive {
     return integer_part + generic_round_half_even(fractional_part * n) / n;
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U)
   {
     assert(false);  // Should never get here. Just for compilation
@@ -175,7 +175,7 @@ struct half_even_negative {
     return generic_round_half_even(e / n) * n;
   }
 
-  template <typename U = T, typename std::enable_if_t<numeric::detail::is_integral<U>()>* = nullptr>
+  template <typename U = T, typename std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
   __device__ U operator()(U e)
   {
     auto const down_over_n = e / n;            // use this to determine HALF_EVEN case
diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh
index 6a6c92ba7c7..0006592e599 100644
--- a/cpp/src/strings/convert/utilities.cuh
+++ b/cpp/src/strings/convert/utilities.cuh
@@ -64,7 +64,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer)
     *d_buffer = '0';
     return 1;
   }
-  bool const is_negative = numeric::detail::is_signed<IntegerType>() ? (value < 0) : false;
+  bool const is_negative = cuda::std::is_signed<IntegerType>() ? (value < 0) : false;
 
   constexpr IntegerType base = 10;
   constexpr int MAX_DIGITS   = 20;  // largest 64-bit integer is 20 digits
@@ -98,7 +98,7 @@ constexpr size_type count_digits(IntegerType value)
 {
   // TODO definitely broken
   if (value == 0) return 1;
-  bool is_negative = numeric::detail::is_signed<IntegerType>() ? (value < 0) : false;
+  bool is_negative = cuda::std::is_signed<IntegerType>() ? (value < 0) : false;
   // abs(std::numeric_limits<IntegerType>::min()) is negative;
   // for all integer types, the max() and min() values have the same number of digits
   value = (value == std::numeric_limits<IntegerType>::min())
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 9a919c63d28..a2194bc1b55 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1454,7 +1454,7 @@ TEST_F(Decimal128Only, Decimal128ProductReduction3)
   auto const values   = std::vector(127, -2);
   auto const scale    = scale_type{0};
   auto const column   = fp_wrapper{values.cbegin(), values.cend(), scale};
-  auto const lowest   = numeric::detail::numeric_limits::lowest<RepType>();
+  auto const lowest   = cuda::std::numeric_limits<RepType>::lowest();
   auto const expected = decimal128{scaled_integer<RepType>{lowest, scale}};
 
   auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};

From 73b36825ce9cac4d7309010cbc007d571e77a325 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 4 Aug 2021 16:19:11 +0000
Subject: [PATCH 034/112] cudf::fill(_in_place) fix for decimal128

---
 cpp/src/filling/fill.cu          | 14 ++++++-----
 cpp/tests/filling/fill_tests.cpp | 43 ++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index b62d2ed4f8f..ab1bca86444 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -77,8 +77,7 @@ struct in_place_fill_range_dispatch {
     auto unscaled = static_cast<cudf::fixed_point_scalar<T> const&>(value).value();
     using RepType = typename T::rep;
     auto s        = cudf::numeric_scalar<RepType>(unscaled, value.is_valid());
-    auto view     = cudf::bit_cast(destination, s.type());
-    in_place_fill<RepType>(view, begin, end, s, stream);
+    in_place_fill<RepType>(destination, begin, end, s, stream);
   }
 
   template <typename T, typename... Args>
@@ -93,13 +92,15 @@ struct out_of_place_fill_range_dispatch {
   cudf::column_view const& input;
 
   template <typename T, typename... Args>
-  std::enable_if_t<not cudf::is_rep_layout_compatible<T>(), std::unique_ptr<cudf::column>>
+  std::enable_if_t<not cudf::is_rep_layout_compatible<T>() and not cudf::is_fixed_point<T>(),
+                   std::unique_ptr<cudf::column>>
   operator()(Args...)
   {
     CUDF_FAIL("Unsupported type in fill.");
   }
 
-  template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
+  template <typename T,
+            CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>() or cudf::is_fixed_point<T>())>
   std::unique_ptr<cudf::column> operator()(
     cudf::size_type begin,
     cudf::size_type end,
@@ -116,8 +117,9 @@ struct out_of_place_fill_range_dispatch {
           0);
       }
 
-      auto ret_view = p_ret->mutable_view();
-      in_place_fill<T>(ret_view, begin, end, value, stream);
+      auto ret_view    = p_ret->mutable_view();
+      using DeviceType = cudf::device_storage_type_t<T>;
+      in_place_fill<DeviceType>(ret_view, begin, end, value, stream);
     }
 
     return p_ret;
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 75c0cad20e7..3173a23d493 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -363,4 +363,47 @@ TEST_F(FillErrorTestFixture, DTypeMismatch)
   EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::logic_error);
 }
 
+template <typename T>
+class FixedPointAllReps : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(FixedPointAllReps, cudf::test::FixedPointTypes);
+
+TYPED_TEST(FixedPointAllReps, OutOfPlaceFill)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3, -4}) {
+    auto const scale    = scale_type{i};
+    auto const column   = fp_wrapper{{4104, 42, 1729, 55}, scale};
+    auto const expected = fp_wrapper{{42, 42, 42, 42}, scale};
+    auto const scalar   = cudf::make_fixed_point_scalar<decimalXX>(42, scale);
+
+    auto const result = cudf::fill(column, 0, 4, *scalar);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected);
+  }
+}
+
+TYPED_TEST(FixedPointAllReps, InPlaceFill)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3, -4}) {
+    auto const scale    = scale_type{i};
+    auto column         = fp_wrapper{{4104, 42, 1729, 55}, scale};
+    auto const expected = fp_wrapper{{42, 42, 42, 42}, scale};
+    auto const scalar   = cudf::make_fixed_point_scalar<decimalXX>(42, scale);
+
+    auto mut_column = cudf::mutable_column_view{column};
+    cudf::fill_in_place(mut_column, 0, 4, *scalar);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(column, expected);
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From bcd18361208d206c4bcde30f6000962b2a80a8bb Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 5 Aug 2021 05:35:17 +0000
Subject: [PATCH 035/112] Remove TODOs

---
 cpp/include/cudf/fixed_point/fixed_point.hpp   | 1 -
 cpp/include/cudf/utilities/type_dispatcher.hpp | 7 ++++++-
 cpp/src/strings/convert/convert_fixed_point.cu | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 930fef40747..4891bc4ba2d 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -53,7 +53,6 @@ constexpr inline auto is_supported_representation_type()
          cuda::std::is_same_v<T, __int128_t>;
 }
 
-// TODO make a temporary::is_integral function
 template <typename T>
 constexpr inline auto is_supported_construction_value_type()
 {
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index e0e7254588d..40e03dc62c3 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -121,7 +121,12 @@ constexpr bool type_id_matches_device_storage_type(type_id id)
          (id == type_id::DECIMAL128 && std::is_same_v<T, __int128_t>) || id == type_to_id<T>();
 }
 
-// TODO docs
+/**
+ * @brief Checks if `id` is fixed_point (DECIMAL32/64/128)
+ *
+ * @return    `true` if `id` is `DECIMAL32`, `DECIMAL64` or `DECIMAL128`
+ * @return    `false` otherwise
+ */
 constexpr bool is_fixed_point(cudf::type_id id)
 {
   return id == type_id::DECIMAL32 or  //
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 9d0a6a3fdd2..20fbcc5f1b2 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -115,7 +115,7 @@ struct string_to_decimal_check_fn {
     return (exp_ten < scale)
              ? true
              : value <= static_cast<uint64_t>(
-                          std::numeric_limits<DecimalType>::max() /  // TODO probably broken
+                          cuda::std::numeric_limits<DecimalType>::max() /
                           static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
   }
 };

From 84f394bb9d2036962c9c6e93569b12fc10bcf0b0 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Mon, 23 Aug 2021 16:50:13 +0000
Subject: [PATCH 036/112] Initial string conversion changes

---
 .../strings/detail/convert/fixed_point.cuh    | 12 ++---
 .../strings/convert/convert_fixed_point.cu    | 17 ++++---
 cpp/tests/strings/fixed_point_tests.cpp       | 49 +++++++++++--------
 3 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
index 53774ed948d..f437bebcda4 100644
--- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -24,22 +24,22 @@ namespace detail {
 /**
  * @brief Return the integer component of a decimal string.
  *
- * This is reads everything up to the exponent 'e' notation.
+ * This reads everything up to the exponent 'e' notation.
  * The return includes the integer digits and any exponent offset.
  *
  * @param[in,out] iter Start of characters to parse
  * @param[in] end End of characters to parse
  * @return Integer component and exponent offset.
  */
-__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
-                                                                char const* iter_end,
-                                                                const char decimal_pt_char = '.')
+__device__ inline thrust::pair<__uint128_t, int32_t> parse_integer(char const*& iter,
+                                                                   char const* iter_end,
+                                                                   const char decimal_pt_char = '.')
 {
   // highest value where another decimal digit cannot be appended without an overflow;
   // this preserves the most digits when scaling the final result
-  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
+  constexpr auto decimal_max = (cuda::std::numeric_limits<__uint128_t>::max() - 9L) / 10L;
 
-  uint64_t value     = 0;  // for checking overflow
+  __uint128_t value  = 0;  // for checking overflow
   int32_t exp_offset = 0;
   bool decimal_found = false;
 
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 20fbcc5f1b2..fb168f5dcd3 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -83,7 +83,7 @@ struct string_to_decimal_check_fn {
   int32_t const scale;
 
   string_to_decimal_check_fn(column_device_view const& d_strings, int32_t scale)
-    : d_strings(d_strings), scale(scale)
+    : d_strings{d_strings}, scale{scale}
   {
   }
 
@@ -112,11 +112,16 @@ struct string_to_decimal_check_fn {
     exp_ten += exp_offset;
 
     // finally, check for overflow based on the exp_ten and scale values
-    return (exp_ten < scale)
-             ? true
-             : value <= static_cast<uint64_t>(
-                          cuda::std::numeric_limits<DecimalType>::max() /
-                          static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
+    if (exp_ten < scale) {
+      // temporary bug fix
+      // TODO: fix once David's refactor/comprehensive bug fix is done
+      return (value / static_cast<__uint128_t>(exp10(static_cast<double>(scale - exp_ten)))) <=
+             static_cast<__uint128_t>(cuda::std::numeric_limits<DecimalType>::max());
+    } else {
+      return value <= static_cast<__uint128_t>(
+                        cuda::std::numeric_limits<DecimalType>::max() /
+                        static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
+    }
   }
 };
 
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index d8b570cee8b..674ad0d5bc6 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -182,38 +182,45 @@ TEST_F(StringsConvertTest, IsFixedPoint)
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{1}});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  cudf::test::strings_column_wrapper big_numbers({
-    "2147483647",
-    "-2147483647",
-    "2147483648",
-    "9223372036854775807",
-    "-9223372036854775807",
-    "9223372036854775808",
-    "100E2147483648",
-  });
-  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                          cudf::data_type{cudf::type_id::DECIMAL32});
-  auto const expected32 =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, false, false, false, false, false});
+  cudf::test::strings_column_wrapper big_numbers({"2147483647",
+                                                  "-2147483647",
+                                                  "2147483648",
+                                                  "9223372036854775807",
+                                                  "-9223372036854775807",
+                                                  "9223372036854775808",
+                                                  "100E2147483648",
+                                                  "170141183460469231731687303715884105727",
+                                                  "170141183460469231731687303715884105728"});
+
+  results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                                        cudf::data_type{cudf::type_id::DECIMAL32});
+  auto const expected32 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32);
 
-  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                          cudf::data_type{cudf::type_id::DECIMAL64});
-  auto const expected64 =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, false, false});
+  results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                                        cudf::data_type{cudf::type_id::DECIMAL64});
+  auto const expected64 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
+  results                = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                                         cudf::data_type{cudf::type_id::DECIMAL128});
+  auto const expected128 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, false, true, false});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected128);
+
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}});
-  auto const expected32_scaled =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, false});
+  auto const expected32_scaled = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}});
-  auto const expected64_scaled =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, false, false, false, false});
+  auto const expected64_scaled = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled);
 }

From 7031551cde71a6069a819dab374556e6d54b134f Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 24 Aug 2021 18:43:35 +0000
Subject: [PATCH 037/112] Final string changes

---
 cpp/include/cudf/strings/detail/convert/fixed_point.cuh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
index f437bebcda4..5f6ceb41588 100644
--- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -142,11 +142,9 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int
   exp_ten += exp_offset;
 
   // shift the output value based on the exp_ten and the scale values
-  if (exp_ten < scale) {
-    value = value / static_cast<uint64_t>(exp10(static_cast<double>(scale - exp_ten)));
-  } else {
-    value = value * static_cast<uint64_t>(exp10(static_cast<double>(exp_ten - scale)));
-  }
+  value = exp_ten < scale
+            ? value / static_cast<__uint128_t>(exp10(static_cast<double>(scale - exp_ten)))
+            : value * static_cast<__uint128_t>(exp10(static_cast<double>(exp_ten - scale)));
 
   return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
 }

From ea97b9d00d3b2733b23e949193d9893ab76d1299 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 25 Aug 2021 20:49:52 +0000
Subject: [PATCH 038/112] Enhance casting tests for decimal128

---
 cpp/tests/unary/cast_tests.cpp | 204 +++++++++++++++++++++++++++------
 1 file changed, 167 insertions(+), 37 deletions(-)

diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index c71f6aa2019..43dca211ded 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -784,67 +784,197 @@ TYPED_TEST(FixedPointTests, FixedPointToFixedPointSameTypeidDownPositive)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeid)
+TYPED_TEST(FixedPointTests, Decimal32ToDecimalXX)
 {
   using namespace numeric;
-  using decimalA    = TypeParam;
-  using RepTypeA    = cudf::device_storage_type_t<decimalA>;
-  using RepTypeB    = std::conditional_t<std::is_same_v<RepTypeA, int32_t>, int64_t, int32_t>;
-  using fp_wrapperA = cudf::test::fixed_point_column_wrapper<RepTypeA>;
-  using fp_wrapperB = cudf::test::fixed_point_column_wrapper<RepTypeB>;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int32_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal64ToDecimalXX)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int64_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal128ToDecimalXX)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = __int128_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithSmallerScale)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int32_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-5));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithSmallerScale)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int64_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const input    = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}};
-  auto const expected = fp_wrapperA{{1729, 17290, 172900, 1729000}, scale_type{-3}};
-  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalA>(-3));
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-5));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidDown)
+TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithSmallerScale)
 {
   using namespace numeric;
-  using decimalA    = TypeParam;
-  using RepTypeA    = cudf::device_storage_type_t<decimalA>;
-  using RepTypeB    = std::conditional_t<std::is_same_v<RepTypeA, int32_t>, int64_t, int32_t>;
-  using fp_wrapperA = cudf::test::fixed_point_column_wrapper<RepTypeA>;
-  using fp_wrapperB = cudf::test::fixed_point_column_wrapper<RepTypeB>;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = __int128_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const input    = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}};
-  auto const expected = fp_wrapperA{{172900, 1729000, 17290000, 172900000}, scale_type{-5}};
-  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalA>(-5));
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-5));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidUp)
+TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScale)
 {
   using namespace numeric;
-  using decimalA    = TypeParam;
-  using RepTypeA    = cudf::device_storage_type_t<decimalA>;
-  using RepTypeB    = std::conditional_t<std::is_same_v<RepTypeA, int32_t>, int64_t, int32_t>;
-  using fp_wrapperA = cudf::test::fixed_point_column_wrapper<RepTypeA>;
-  using fp_wrapperB = cudf::test::fixed_point_column_wrapper<RepTypeB>;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int32_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const input    = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}};
-  auto const expected = fp_wrapperA{{1, 17, 172, 1729}, scale_type{0}};
-  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalA>(0));
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidUpNullMask)
+TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithLargerScale)
 {
   using namespace numeric;
-  using decimalA    = TypeParam;
-  using RepTypeA    = cudf::device_storage_type_t<decimalA>;
-  using RepTypeB    = std::conditional_t<std::is_same_v<RepTypeA, int32_t>, int64_t, int32_t>;
-  using fp_wrapperA = cudf::test::fixed_point_column_wrapper<RepTypeA>;
-  using fp_wrapperB = cudf::test::fixed_point_column_wrapper<RepTypeB>;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int64_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const vec      = std::vector<int32_t>{1729, 17290, 172900, 1729000};
-  auto const input    = fp_wrapperB{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
-  auto const expected = fp_wrapperA{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
-  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalA>(0));
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
+
+TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = __int128_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const input    = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int32_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
+  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithLargerScaleAndNullMask)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = int64_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
+  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScaleAndNullMask)
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepTypeFrom    = __int128_t;
+  using RepTypeTo      = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
+  using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
+
+  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
+  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
\ No newline at end of file

From 655cceedc8762fcbc309a8ac08715bba53b583a2 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 26 Aug 2021 02:27:56 +0000
Subject: [PATCH 039/112] Merge conflict fixes

---
 cpp/include/cudf/strings/detail/convert/fixed_point.cuh | 4 ++--
 cpp/src/strings/convert/convert_fixed_point.cu          | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
index 325bb4d0967..aa3f544202f 100644
--- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -17,7 +17,7 @@
 #include <thrust/optional.h>
 #include <thrust/pair.h>
 
-#include <type_traits>
+#include <cuda/std/type_traits>
 
 namespace cudf {
 namespace strings {
@@ -137,7 +137,7 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int
   // if string begins with a sign, continue with next character
   if (sign != 0) ++iter;
 
-  using UnsignedDecimalType = std::make_unsigned_t<DecimalType>;
+  using UnsignedDecimalType = cuda::std::make_unsigned_t<DecimalType>;
   auto [value, exp_offset]  = parse_integer<UnsignedDecimalType>(iter, iter_end);
   if (value == 0) { return DecimalType{0}; }
 
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index bbaae08c6b8..23e027be208 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -37,6 +37,8 @@
 #include <thrust/optional.h>
 #include <thrust/transform.h>
 
+#include <cuda/std/type_traits>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -97,7 +99,7 @@ struct string_to_decimal_check_fn {
 
     auto const iter_end = d_str.data() + d_str.size_bytes();
 
-    using UnsignedDecimalType = std::make_unsigned_t<DecimalType>;
+    using UnsignedDecimalType = cuda::std::make_unsigned_t<DecimalType>;
     auto [value, exp_offset]  = parse_integer<UnsignedDecimalType>(iter, iter_end);
 
     // only exponent notation is expected here

From 2a894bd497906f67c76a31226571ab847edac46a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 26 Aug 2021 20:55:25 +0000
Subject: [PATCH 040/112] Missed STRINGS fixes

---
 cpp/src/strings/convert/convert_fixed_point.cu | 3 ++-
 cpp/tests/strings/fixed_point_tests.cpp        | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 23e027be208..524f6c614e8 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -37,6 +37,7 @@
 #include <thrust/optional.h>
 #include <thrust/transform.h>
 
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
 namespace cudf {
@@ -117,7 +118,7 @@ struct string_to_decimal_check_fn {
     // finally, check for overflow based on the exp_ten and scale values
     return (exp_ten < scale) or
            value <= static_cast<UnsignedDecimalType>(
-                      std::numeric_limits<DecimalType>::max() /
+                      cuda::std::numeric_limits<DecimalType>::max() /
                       static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
   }
 };
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 511b4756ed7..8fffa8ce157 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -206,10 +206,10 @@ TEST_F(StringsConvertTest, IsFixedPoint)
     {true, true, true, true, true, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
-  results                = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                                         cudf::data_type{cudf::type_id::DECIMAL128});
-  auto const expected128 = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, true, true, true, false, true, false});
+  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                          cudf::data_type{cudf::type_id::DECIMAL128});
+  auto const expected128 =
+    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, true, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected128);
 
   results = cudf::strings::is_fixed_point(

From d8813211cff33184e0c5293dc8c4fc810ade4064 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 26 Aug 2021 22:24:08 +0000
Subject: [PATCH 041/112] Enhance STRINGS_TEST

---
 cpp/tests/strings/fixed_point_tests.cpp | 42 ++++++++++++-------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 8fffa8ce157..2b6883a080d 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -182,54 +182,52 @@ TEST_F(StringsConvertTest, IsFixedPoint)
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{1}});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  cudf::test::strings_column_wrapper big_numbers({
-    "2147483647",
-    "-2147483647",
-    "2147483648",
-    "9223372036854775807",
-    "-9223372036854775807",
-    "9223372036854775808",
-    "9223372036854775808000",
-    "100E2147483648",
-    // "170141183460469231731687303715884105727",
-    // "170141183460469231731687303715884105728"  TODO add these back
-  });
+  cudf::test::strings_column_wrapper big_numbers({"2147483647",
+                                                  "-2147483647",
+                                                  "2147483648",
+                                                  "9223372036854775807",
+                                                  "-9223372036854775807",
+                                                  "9223372036854775808",
+                                                  "9223372036854775808000",
+                                                  "100E2147483648",
+                                                  "170141183460469231731687303715884105727",
+                                                  "170141183460469231731687303715884105728"});
   results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                                         cudf::data_type{cudf::type_id::DECIMAL32});
   auto const expected32 = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, false, false, false, false, false, false});
+    {true, true, false, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32);
 
   results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                                         cudf::data_type{cudf::type_id::DECIMAL64});
   auto const expected64 = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, true, true, false, false, false});
+    {true, true, true, true, true, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
-  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                          cudf::data_type{cudf::type_id::DECIMAL128});
-  auto const expected128 =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, true, false});
+  results                = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                                         cudf::data_type{cudf::type_id::DECIMAL128});
+  auto const expected128 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, true, false, true, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected128);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}});
   auto const expected32_scaled = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, true, true, true, false, false});
+    {true, true, true, true, true, true, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{10}});
-  auto const expected64_scaled_positive =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, true, false});
+  auto const expected64_scaled_positive = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, true, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled_positive);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}});
   auto const expected64_scaled = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, false, false, false, false, false});
+    {true, true, true, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled);
 }

From 1380a0cebc75c08bf693cc986309b7fb7add7843 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 26 Aug 2021 22:46:05 +0000
Subject: [PATCH 042/112] Enhance ROUND tests

---
 cpp/tests/round/round_tests.cpp | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index 825703274e2..b4050625570 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -587,6 +587,39 @@ TEST_F(RoundTests, Int64AtBoundaryHalfUp)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected5, result5->view());
 }
 
+TEST_F(RoundTests, FixedPointAtBoundaryTestHalfUp)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const m = std::numeric_limits<RepType>::max();  // 170141183460469231731687303715884105727
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 100000}, scale_type{5}};
+    auto const result   = cudf::round(input, -5, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 100000000000}, scale_type{11}};
+    auto const result   = cudf::round(input, -11, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 1000000000000000}, scale_type{15}};
+    auto const result   = cudf::round(input, -15, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
 TEST_F(RoundTests, BoolTestHalfUp)
 {
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<bool>;

From b5d449331207c43c3f64b123c44389f6279ef9c7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 26 Aug 2021 22:57:39 +0000
Subject: [PATCH 043/112] Fix FIXED_POINT_TESTs

---
 cpp/tests/fixed_point/fixed_point_tests.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index 30b3284e032..a90e0f0f541 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -524,8 +524,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
   auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};
 
   auto const type    = cudf::data_type{cudf::type_id::DECIMAL32, 6};
-  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type);
-  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type);
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -547,8 +547,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
   auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};
 
   auto const type    = cudf::data_type{cudf::type_id::DECIMAL64, 100};
-  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type);
-  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type);
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -570,10 +570,10 @@ TYPED_TEST(FixedPointTestAllReps, ExtremelyLargeNegativeScale)
   auto const expected2 = fp_wrapper{{5}, scale_type{-201}};
 
   auto const type1   = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
-  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type1);
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);
 
   auto const type2   = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
-  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type2);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());

From 87151969f407f5179abb1124c26b4005ce329e11 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 27 Aug 2021 16:35:19 +0000
Subject: [PATCH 044/112] Enhance GROUPBY_TEST for decimal128

---
 cpp/tests/groupby/max_tests.cpp      | 14 +++++++---
 cpp/tests/groupby/min_tests.cpp      | 14 +++++++---
 cpp/tests/groupby/sum_scan_tests.cpp | 41 ++++++++++++++++++++++------
 cpp/tests/groupby/sum_tests.cpp      | 23 ++++++++++------
 4 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 0f2ebfe7788..44dac359935 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -228,13 +228,12 @@ TEST_F(groupby_dictionary_max_test, basic)
 }
 
 template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-using RepTypes = ::testing::Types<int32_t, int64_t>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMaxDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -258,6 +257,13 @@ TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMaxDecimalAsValue)
   }
 }
 
+template <typename T>
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+};
+
+using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+
 TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMaxDecimalAsValue)
 {
   using namespace numeric;
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 041ed37d71a..f801104d6ea 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -228,13 +228,12 @@ TEST_F(groupby_dictionary_min_test, basic)
 }
 
 template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-using RepTypes = ::testing::Types<int32_t, int64_t>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMinDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
@@ -257,6 +256,13 @@ TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortMinDecimalAsValue)
   }
 }
 
+template <typename T>
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+};
+
+using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+
 TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMinDecimalAsValue)
 {
   using namespace numeric;
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index d5d52bf4272..d62759b2327 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -133,19 +133,19 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
 }
 
 template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-using RepTypes = ::testing::Types<int32_t, int64_t>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumScanDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
 {
   using namespace numeric;
-  using decimalXX      = TypeParam;
-  using RepType        = cudf::device_storage_type_t<decimalXX>;
-  using fp_wrapper     = fixed_point_column_wrapper<RepType>;
-  using out_fp_wrapper = fixed_point_column_wrapper<int64_t>;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = fixed_point_column_wrapper<RepType>;
+  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
+  using out_fp_wrapper = fixed_point_column_wrapper<SumType>;
 
   for (auto const i : {2, 1, 0, -1, -2}) {
     auto const scale = scale_type{i};
@@ -162,5 +162,30 @@ TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumScanDecimalAsValue)
   }
 }
 
+// struct Decimal128Only : public cudf::test::BaseFixture {
+// };
+
+// TEST_F(Decimal128Only, GroupBySortSumScanDecimalAsValue)
+// {
+//   using namespace numeric;
+//   using RepType        = cudf::device_storage_type_t<decimal128>;
+//   using fp_wrapper     = fixed_point_column_wrapper<RepType>;
+//   using out_fp_wrapper = fixed_point_column_wrapper<RepType>;
+
+//   for (auto const i : {2, 1, 0, -1, -2}) {
+//     auto const scale = scale_type{i};
+//     // clang-format off
+//     auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+//     auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
+
+//     auto const expect_keys     = key_wrapper    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+//     auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};
+//     // clang-format on
+
+//     auto agg2 = cudf::make_sum_aggregation<groupby_scan_aggregation>();
+//     test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2));
+//   }
+// }
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index ca010d7572a..f81a63b179c 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -157,19 +157,19 @@ TYPED_TEST(groupby_sum_test, dictionary)
 }
 
 template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-using RepTypes = ::testing::Types<int32_t, int64_t>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+TYPED_TEST_CASE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
 {
   using namespace numeric;
-  using decimalXX    = TypeParam;
-  using RepType      = cudf::device_storage_type_t<decimalXX>;
-  using fp_wrapper   = cudf::test::fixed_point_column_wrapper<RepType>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
+  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<SumType>;
   using K            = int32_t;
 
   for (auto const i : {2, 1, 0, -1, -2}) {
@@ -193,6 +193,13 @@ TYPED_TEST(FixedPointTest_32_64_Reps, GroupBySortSumDecimalAsValue)
   }
 }
 
+template <typename T>
+struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
+};
+
+using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
+TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
+
 TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashSumDecimalAsValue)
 {
   using namespace numeric;

From 7952e90ac7df99ad9a09c0a87c022d2809939be2 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 27 Aug 2021 16:42:23 +0000
Subject: [PATCH 045/112] Delete commented out code

---
 cpp/tests/groupby/sum_scan_tests.cpp | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index d62759b2327..6b813f8b6db 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -162,30 +162,5 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
   }
 }
 
-// struct Decimal128Only : public cudf::test::BaseFixture {
-// };
-
-// TEST_F(Decimal128Only, GroupBySortSumScanDecimalAsValue)
-// {
-//   using namespace numeric;
-//   using RepType        = cudf::device_storage_type_t<decimal128>;
-//   using fp_wrapper     = fixed_point_column_wrapper<RepType>;
-//   using out_fp_wrapper = fixed_point_column_wrapper<RepType>;
-
-//   for (auto const i : {2, 1, 0, -1, -2}) {
-//     auto const scale = scale_type{i};
-//     // clang-format off
-//     auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
-//     auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
-
-//     auto const expect_keys     = key_wrapper    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
-//     auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};
-//     // clang-format on
-
-//     auto agg2 = cudf::make_sum_aggregation<groupby_scan_aggregation>();
-//     test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2));
-//   }
-// }
-
 }  // namespace test
 }  // namespace cudf

From 10d58a3e5f2f2a334e156af2fae5f194f536c4a1 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 31 Aug 2021 20:12:54 +0000
Subject: [PATCH 046/112] Support hash groupby decimal128 (by making it sort) -
 initial change

---
 cpp/src/groupby/hash/groupby.cu |  7 ++++---
 cpp/tests/groupby/max_tests.cpp |  9 +--------
 cpp/tests/groupby/min_tests.cpp |  9 +--------
 cpp/tests/groupby/sum_tests.cpp | 18 ++++++------------
 4 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 87f83c6edd6..e94c119596f 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -647,9 +647,10 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
   return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-    return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-      return is_hash_aggregation(a->kind);
-    });
+    return (r.values.type().id() != cudf::type_id::DECIMAL128) and
+           std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
+             return is_hash_aggregation(a->kind);
+           });
   });
 }
 
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index a8fdbef8384..ef72fff45ac 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -281,14 +281,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue)
   }
 }
 
-template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
-};
-
-using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
-
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMaxDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashMaxDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index c47b51b9ce1..044f97c3cac 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -280,14 +280,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue)
   }
 }
 
-template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
-};
-
-using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
-
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashMinDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashMinDecimalAsValue)
 {
   using namespace numeric;
   using decimalXX  = TypeParam;
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index f81a63b179c..ed42386b694 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -193,20 +193,14 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
   }
 }
 
-template <typename T>
-struct FixedPointTest_32_64_Reps : public cudf::test::BaseFixture {
-};
-
-using RepTypes = ::testing::Types<numeric::decimal32, numeric::decimal64>;
-TYPED_TEST_CASE(FixedPointTest_32_64_Reps, RepTypes);
-
-TYPED_TEST(FixedPointTest_32_64_Reps, GroupByHashSumDecimalAsValue)
+TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue)
 {
   using namespace numeric;
-  using decimalXX    = TypeParam;
-  using RepType      = cudf::device_storage_type_t<decimalXX>;
-  using fp_wrapper   = cudf::test::fixed_point_column_wrapper<RepType>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
+  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<SumType>;
   using K            = int32_t;
 
   for (auto const i : {2, 1, 0, -1, -2}) {

From 60ce655d4e53162b7b7ad2b9c078a18e736dd001 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 31 Aug 2021 23:28:05 +0000
Subject: [PATCH 047/112] has_atomic_support

---
 cpp/src/groupby/hash/groupby.cu | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index e94c119596f..ede631d2f54 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -50,6 +50,8 @@
 #include <unordered_set>
 #include <utility>
 
+#include <cuda/std/atomic>
+
 namespace cudf {
 namespace groupby {
 namespace detail {
@@ -634,6 +636,20 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
 
 }  // namespace
 
+// TODO move this to more appropriate file
+struct has_atomic_support_type_dispatcher {
+  template <typename T>
+  bool operator()()
+  {
+    return cuda::std::atomic<T>::is_always_lock_free;
+  }
+};
+
+bool has_atomic_support(cudf::data_type const& type)
+{
+  return type_dispatcher(type, has_atomic_support_type_dispatcher{});
+}
+
 /**
  * @brief Indicates if a set of aggregation requests can be satisfied with a
  * hash-based groupby implementation.
@@ -647,7 +663,7 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
   return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-    return (r.values.type().id() != cudf::type_id::DECIMAL128) and
+    return has_atomic_support(r.values.type()) and
            std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
              return is_hash_aggregation(a->kind);
            });

From 28aca7d8e6e544da9ffc37173b1a8e5fb6c20db0 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 1 Sep 2021 19:09:07 +0000
Subject: [PATCH 048/112] TEMPORARY - will revert later

---
 .../Modules/JitifyPreprocessKernels.cmake     |   3 +-
 cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake |   4 +-
 cpp/src/binaryop/binaryop.cpp                 | 190 +++++++++---------
 3 files changed, 99 insertions(+), 98 deletions(-)

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index 7e2ec5254d3..e854f4fa1a3 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -55,8 +55,7 @@ function(jit_preprocess_files)
 endfunction()
 
 jit_preprocess_files(SOURCE_DIRECTORY      ${CUDF_SOURCE_DIR}/src
-                     FILES                 binaryop/jit/kernel.cu
-                                           transform/jit/masked_udf_kernel.cu
+                     FILES                 transform/jit/masked_udf_kernel.cu
                                            transform/jit/kernel.cu
                                            rolling/jit/kernel.cu
                      )
diff --git a/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake b/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake
index 63d6d26802c..ef5db9ca91d 100644
--- a/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetLibcudacxx.cmake
@@ -17,8 +17,8 @@
 function(find_and_configure_libcudacxx VERSION)
     CPMFindPackage(NAME     libcudacxx
         VERSION             ${VERSION}
-        GIT_REPOSITORY      https://github.com/NVIDIA/libcudacxx.git
-        GIT_TAG             ${VERSION}
+        GIT_REPOSITORY      https://gitlab-master.nvidia.com/nvhpc/libcudacxx.git
+        GIT_TAG             staging/1.6.0
         GIT_SHALLOW         TRUE
         DOWNLOAD_ONLY       TRUE
     )
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 785fba0b1a9..15c1747dc0e 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -20,7 +20,7 @@
 #include "compiled/binary_ops.hpp"
 #include "jit/util.hpp"
 
-#include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
+// #include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
 
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
@@ -134,41 +134,42 @@ void binary_operation(mutable_column_view& out,
                       OperatorType op_type,
                       rmm::cuda_stream_view stream)
 {
-  if (is_null_dependent(op)) {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, op_type));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs),
-               out.null_mask(),
-               lhs.null_mask(),
-               lhs.offset(),
-               rhs.is_valid());
-  } else {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, op_type));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs));
-  }
+  // if (is_null_dependent(op)) {
+  //   std::string kernel_name =
+  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity")  //
+  //       .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
+  //                    cudf::jit::get_type_name(lhs.type()),
+  //                    cudf::jit::get_type_name(rhs.type()),
+  //                    get_operator_name(op, op_type));
+
+  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+  //     ->launch(out.size(),
+  //              cudf::jit::get_data_ptr(out),
+  //              cudf::jit::get_data_ptr(lhs),
+  //              cudf::jit::get_data_ptr(rhs),
+  //              out.null_mask(),
+  //              lhs.null_mask(),
+  //              lhs.offset(),
+  //              rhs.is_valid());
+  // } else {
+  //   std::string kernel_name =
+  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_s")  //
+  //       .instantiate(cudf::jit::get_type_name(out.type()),            // list of template
+  //       arguments
+  //                    cudf::jit::get_type_name(lhs.type()),
+  //                    cudf::jit::get_type_name(rhs.type()),
+  //                    get_operator_name(op, op_type));
+
+  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+  //     ->launch(out.size(),
+  //              cudf::jit::get_data_ptr(out),
+  //              cudf::jit::get_data_ptr(lhs),
+  //              cudf::jit::get_data_ptr(rhs));
+  // }
 }
 
 void binary_operation(mutable_column_view& out,
@@ -195,42 +196,43 @@ void binary_operation(mutable_column_view& out,
                       binary_operator op,
                       rmm::cuda_stream_view stream)
 {
-  if (is_null_dependent(op)) {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, OperatorType::Direct));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs),
-               out.null_mask(),
-               lhs.null_mask(),
-               rhs.offset(),
-               rhs.null_mask(),
-               rhs.offset());
-  } else {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, OperatorType::Direct));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs));
-  }
+  // if (is_null_dependent(op)) {
+  //   std::string kernel_name =
+  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity")  //
+  //       .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
+  //                    cudf::jit::get_type_name(lhs.type()),
+  //                    cudf::jit::get_type_name(rhs.type()),
+  //                    get_operator_name(op, OperatorType::Direct));
+
+  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+  //     ->launch(out.size(),
+  //              cudf::jit::get_data_ptr(out),
+  //              cudf::jit::get_data_ptr(lhs),
+  //              cudf::jit::get_data_ptr(rhs),
+  //              out.null_mask(),
+  //              lhs.null_mask(),
+  //              rhs.offset(),
+  //              rhs.null_mask(),
+  //              rhs.offset());
+  // } else {
+  //   std::string kernel_name =
+  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+  //       .instantiate(cudf::jit::get_type_name(out.type()),            // list of template
+  //       arguments
+  //                    cudf::jit::get_type_name(lhs.type()),
+  //                    cudf::jit::get_type_name(rhs.type()),
+  //                    get_operator_name(op, OperatorType::Direct));
+
+  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+  //     ->launch(out.size(),
+  //              cudf::jit::get_data_ptr(out),
+  //              cudf::jit::get_data_ptr(lhs),
+  //              cudf::jit::get_data_ptr(rhs));
+  // }
 }
 
 void binary_operation(mutable_column_view& out,
@@ -239,28 +241,28 @@ void binary_operation(mutable_column_view& out,
                       const std::string& ptx,
                       rmm::cuda_stream_view stream)
 {
-  std::string const output_type_name = cudf::jit::get_type_name(out.type());
-
-  std::string ptx_hash =
-    "prog_binop." + std::to_string(std::hash<std::string>{}(ptx + output_type_name));
-  std::string cuda_source =
-    cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
-
-  std::string kernel_name =
-    jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-      .instantiate(output_type_name,                                // list of template arguments
-                   cudf::jit::get_type_name(lhs.type()),
-                   cudf::jit::get_type_name(rhs.type()),
-                   get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));
-
-  cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-    .get_kernel(
-      kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                  //
-    ->launch(out.size(),
-             cudf::jit::get_data_ptr(out),
-             cudf::jit::get_data_ptr(lhs),
-             cudf::jit::get_data_ptr(rhs));
+  // std::string const output_type_name = cudf::jit::get_type_name(out.type());
+
+  // std::string ptx_hash =
+  //   "prog_binop." + std::to_string(std::hash<std::string>{}(ptx + output_type_name));
+  // std::string cuda_source =
+  //   cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
+
+  // std::string kernel_name =
+  //   jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+  //     .instantiate(output_type_name,                                // list of template arguments
+  //                  cudf::jit::get_type_name(lhs.type()),
+  //                  cudf::jit::get_type_name(rhs.type()),
+  //                  get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));
+
+  // cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+  //   .get_kernel(
+  //     kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
+  //   ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                  //
+  //   ->launch(out.size(),
+  //            cudf::jit::get_data_ptr(out),
+  //            cudf::jit::get_data_ptr(lhs),
+  //            cudf::jit::get_data_ptr(rhs));
 }
 }  // namespace jit
 

From fe446a427e5c50b30b9c37781553f0190630ce4a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 9 Sep 2021 01:36:09 +0000
Subject: [PATCH 049/112] Block group_by mean for decimal types

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index bcf9fa386d5..e5d62ef584f 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -951,10 +951,12 @@ struct target_type_impl<Source, aggregation::ALL> {
 // Except for chrono types where result is chrono. (Use FloorDiv)
 // TODO: MEAN should be only be enabled for duration types - not for timestamps
 template <typename Source, aggregation::Kind k>
-struct target_type_impl<
-  Source,
-  k,
-  std::enable_if_t<is_fixed_width<Source>() && !is_chrono<Source>() && (k == aggregation::MEAN)>> {
+struct target_type_impl<Source,
+                        k,
+                        std::enable_if_t<is_fixed_width<Source>()        //
+                                         and not is_chrono<Source>()     //
+                                         and not is_fixed_point<Source>  //
+                                         and (k == aggregation::MEAN)>> {
   using type = double;
 };
 

From efd0b62534260a2f6b0f42f990b7591b91274643 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 9 Sep 2021 05:17:15 +0000
Subject: [PATCH 050/112] Revert non-comprehensive fix

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 8745228a519..1d6399d4d00 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -953,10 +953,8 @@ struct target_type_impl<Source, aggregation::ALL> {
 template <typename Source, aggregation::Kind k>
 struct target_type_impl<Source,
                         k,
-                        std::enable_if_t<is_fixed_width<Source>()        //
-                                         and not is_chrono<Source>()     //
-                                         and not is_fixed_point<Source>  //
-                                         and (k == aggregation::MEAN)>> {
+                        std::enable_if_t<is_fixed_width<Source>() and not is_chrono<Source>() and
+                                         (k == aggregation::MEAN)>> {
   using type = double;
 };
 

From 5622a842c426944838784b8d9f36912306e8491e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 17 Sep 2021 22:16:45 +0000
Subject: [PATCH 051/112] binary op changes

---
 .../Modules/JitifyPreprocessKernels.cmake     |   3 +-
 cpp/src/binaryop/binaryop.cpp                 | 437 ++++--------------
 cpp/tests/binaryop/binop-integration-test.cpp |  83 ++--
 3 files changed, 143 insertions(+), 380 deletions(-)

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index e854f4fa1a3..7e2ec5254d3 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -55,7 +55,8 @@ function(jit_preprocess_files)
 endfunction()
 
 jit_preprocess_files(SOURCE_DIRECTORY      ${CUDF_SOURCE_DIR}/src
-                     FILES                 transform/jit/masked_udf_kernel.cu
+                     FILES                 binaryop/jit/kernel.cu
+                                           transform/jit/masked_udf_kernel.cu
                                            transform/jit/kernel.cu
                                            rolling/jit/kernel.cu
                      )
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 15c1747dc0e..486ace3424f 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -20,7 +20,7 @@
 #include "compiled/binary_ops.hpp"
 #include "jit/util.hpp"
 
-// #include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
+#include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
 
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
@@ -134,42 +134,41 @@ void binary_operation(mutable_column_view& out,
                       OperatorType op_type,
                       rmm::cuda_stream_view stream)
 {
-  // if (is_null_dependent(op)) {
-  //   std::string kernel_name =
-  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity")  //
-  //       .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-  //                    cudf::jit::get_type_name(lhs.type()),
-  //                    cudf::jit::get_type_name(rhs.type()),
-  //                    get_operator_name(op, op_type));
-
-  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-  //     ->launch(out.size(),
-  //              cudf::jit::get_data_ptr(out),
-  //              cudf::jit::get_data_ptr(lhs),
-  //              cudf::jit::get_data_ptr(rhs),
-  //              out.null_mask(),
-  //              lhs.null_mask(),
-  //              lhs.offset(),
-  //              rhs.is_valid());
-  // } else {
-  //   std::string kernel_name =
-  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_s")  //
-  //       .instantiate(cudf::jit::get_type_name(out.type()),            // list of template
-  //       arguments
-  //                    cudf::jit::get_type_name(lhs.type()),
-  //                    cudf::jit::get_type_name(rhs.type()),
-  //                    get_operator_name(op, op_type));
-
-  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-  //     ->launch(out.size(),
-  //              cudf::jit::get_data_ptr(out),
-  //              cudf::jit::get_data_ptr(lhs),
-  //              cudf::jit::get_data_ptr(rhs));
-  // }
+  if (is_null_dependent(op)) {
+    std::string kernel_name =
+      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity")  //
+        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
+                     cudf::jit::get_type_name(lhs.type()),
+                     cudf::jit::get_type_name(rhs.type()),
+                     get_operator_name(op, op_type));
+
+    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+      ->launch(out.size(),
+               cudf::jit::get_data_ptr(out),
+               cudf::jit::get_data_ptr(lhs),
+               cudf::jit::get_data_ptr(rhs),
+               out.null_mask(),
+               lhs.null_mask(),
+               lhs.offset(),
+               rhs.is_valid());
+  } else {
+    std::string kernel_name =
+      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s")  //
+        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
+                     cudf::jit::get_type_name(lhs.type()),
+                     cudf::jit::get_type_name(rhs.type()),
+                     get_operator_name(op, op_type));
+
+    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+      ->launch(out.size(),
+               cudf::jit::get_data_ptr(out),
+               cudf::jit::get_data_ptr(lhs),
+               cudf::jit::get_data_ptr(rhs));
+  }
 }
 
 void binary_operation(mutable_column_view& out,
@@ -196,43 +195,42 @@ void binary_operation(mutable_column_view& out,
                       binary_operator op,
                       rmm::cuda_stream_view stream)
 {
-  // if (is_null_dependent(op)) {
-  //   std::string kernel_name =
-  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity")  //
-  //       .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-  //                    cudf::jit::get_type_name(lhs.type()),
-  //                    cudf::jit::get_type_name(rhs.type()),
-  //                    get_operator_name(op, OperatorType::Direct));
-
-  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-  //     ->launch(out.size(),
-  //              cudf::jit::get_data_ptr(out),
-  //              cudf::jit::get_data_ptr(lhs),
-  //              cudf::jit::get_data_ptr(rhs),
-  //              out.null_mask(),
-  //              lhs.null_mask(),
-  //              rhs.offset(),
-  //              rhs.null_mask(),
-  //              rhs.offset());
-  // } else {
-  //   std::string kernel_name =
-  //     jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-  //       .instantiate(cudf::jit::get_type_name(out.type()),            // list of template
-  //       arguments
-  //                    cudf::jit::get_type_name(lhs.type()),
-  //                    cudf::jit::get_type_name(rhs.type()),
-  //                    get_operator_name(op, OperatorType::Direct));
-
-  //   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-  //     .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-  //     ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-  //     ->launch(out.size(),
-  //              cudf::jit::get_data_ptr(out),
-  //              cudf::jit::get_data_ptr(lhs),
-  //              cudf::jit::get_data_ptr(rhs));
-  // }
+  if (is_null_dependent(op)) {
+    std::string kernel_name =
+      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity")  //
+        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
+                     cudf::jit::get_type_name(lhs.type()),
+                     cudf::jit::get_type_name(rhs.type()),
+                     get_operator_name(op, OperatorType::Direct));
+
+    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+      ->launch(out.size(),
+               cudf::jit::get_data_ptr(out),
+               cudf::jit::get_data_ptr(lhs),
+               cudf::jit::get_data_ptr(rhs),
+               out.null_mask(),
+               lhs.null_mask(),
+               rhs.offset(),
+               rhs.null_mask(),
+               rhs.offset());
+  } else {
+    std::string kernel_name =
+      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
+                     cudf::jit::get_type_name(lhs.type()),
+                     cudf::jit::get_type_name(rhs.type()),
+                     get_operator_name(op, OperatorType::Direct));
+
+    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
+      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+      ->launch(out.size(),
+               cudf::jit::get_data_ptr(out),
+               cudf::jit::get_data_ptr(lhs),
+               cudf::jit::get_data_ptr(rhs));
+  }
 }
 
 void binary_operation(mutable_column_view& out,
@@ -241,28 +239,28 @@ void binary_operation(mutable_column_view& out,
                       const std::string& ptx,
                       rmm::cuda_stream_view stream)
 {
-  // std::string const output_type_name = cudf::jit::get_type_name(out.type());
-
-  // std::string ptx_hash =
-  //   "prog_binop." + std::to_string(std::hash<std::string>{}(ptx + output_type_name));
-  // std::string cuda_source =
-  //   cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
-
-  // std::string kernel_name =
-  //   jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-  //     .instantiate(output_type_name,                                // list of template arguments
-  //                  cudf::jit::get_type_name(lhs.type()),
-  //                  cudf::jit::get_type_name(rhs.type()),
-  //                  get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));
-
-  // cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-  //   .get_kernel(
-  //     kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-  //   ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                  //
-  //   ->launch(out.size(),
-  //            cudf::jit::get_data_ptr(out),
-  //            cudf::jit::get_data_ptr(lhs),
-  //            cudf::jit::get_data_ptr(rhs));
+  std::string const output_type_name = cudf::jit::get_type_name(out.type());
+
+  std::string ptx_hash =
+    "prog_binop." + std::to_string(std::hash<std::string>{}(ptx + output_type_name));
+  std::string cuda_source =
+    cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
+
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+      .instantiate(output_type_name,                                // list of template arguments
+                   cudf::jit::get_type_name(lhs.type()),
+                   cudf::jit::get_type_name(rhs.type()),
+                   get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));
+
+  cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
+    .get_kernel(
+      kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                  //
+    ->launch(out.size(),
+             cudf::jit::get_data_ptr(out),
+             cudf::jit::get_data_ptr(lhs),
+             cudf::jit::get_data_ptr(rhs));
 }
 }  // namespace jit
 
@@ -422,241 +420,6 @@ void fixed_point_binary_operation_validation(binary_operator op,
 }
 
 namespace jit {
-/**
- * @brief Function to compute binary operation of one `column_view` and one `scalar`
- *
- * @param lhs Left-hand side `scalar` used in the binary operation
- * @param rhs Right-hand side `column_view` used in the binary operation
- * @param op `binary_operator` to be used to combine `lhs` and `rhs`
- * @param mr Device memory resource to use for device memory allocation
- * @param stream CUDA stream used for device memory operations
- * @return std::unique_ptr<column> Resulting output column from the binary operation
- */
-std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
-                                                     column_view const& rhs,
-                                                     binary_operator op,
-                                                     cudf::data_type output_type,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
-{
-  using namespace numeric;
-
-  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
-
-  if (rhs.is_empty())
-    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
-                                                     : cudf::data_type{rhs.type().id(), scale};
-  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view    = out->mutable_view();
-
-  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
-    // Adjust scalar/column so they have they same scale
-    if (rhs.type().scale() < lhs.type().scale()) {
-      auto const diff = lhs.type().scale() - rhs.type().scale();
-      if (lhs.type().id() == type_id::DECIMAL32) {
-        auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal32> const&>(lhs).value();
-        auto const scale  = scale_type{rhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
-        binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-      } else if (lhs.type().id() == type_id::DECIMAL64) {
-        auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal64> const&>(lhs).value();
-        auto const scale  = scale_type{rhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
-        binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-      } else {
-        CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-        auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal128> const&>(lhs).value();
-        auto const scale  = scale_type{rhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal128>(val * factor, scale);
-        binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-      }
-    } else {
-      auto const diff   = rhs.type().scale() - lhs.type().scale();
-      auto const result = [&] {
-        if (lhs.type().id() == type_id::DECIMAL32) {
-          auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else if (lhs.type().id() == type_id::DECIMAL64) {
-          auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        }
-      }();
-      binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
-    }
-  } else {
-    binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  }
-  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
-}
-
-/**
- * @brief Function to compute binary operation of one `column_view` and one `scalar`
- *
- * @param lhs Left-hand side `column_view` used in the binary operation
- * @param rhs Right-hand side `scalar` used in the binary operation
- * @param op `binary_operator` to be used to combine `lhs` and `rhs`
- * @param mr Device memory resource to use for device memory allocation
- * @param stream CUDA stream used for device memory operations
- * @return std::unique_ptr<column> Resulting output column from the binary operation
- */
-std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
-                                                     scalar const& rhs,
-                                                     binary_operator op,
-                                                     cudf::data_type output_type,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
-{
-  using namespace numeric;
-
-  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
-
-  if (lhs.is_empty())
-    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
-                                                     : cudf::data_type{lhs.type().id(), scale};
-  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view    = out->mutable_view();
-
-  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
-    // Adjust scalar/column so they have they same scale
-    if (rhs.type().scale() > lhs.type().scale()) {
-      auto const diff = rhs.type().scale() - lhs.type().scale();
-      if (rhs.type().id() == type_id::DECIMAL32) {
-        auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal32> const&>(rhs).value();
-        auto const scale  = scale_type{lhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
-        binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-      } else if (rhs.type().id() == type_id::DECIMAL64) {
-        auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal64> const&>(rhs).value();
-        auto const scale  = scale_type{rhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
-        binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-      } else {
-        CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-        auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-        auto const val    = static_cast<fixed_point_scalar<decimal128> const&>(rhs).value();
-        auto const scale  = scale_type{rhs.type().scale()};
-        auto const scalar = make_fixed_point_scalar<decimal128>(val * factor, scale);
-        binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-      }
-    } else {
-      auto const diff   = lhs.type().scale() - rhs.type().scale();
-      auto const result = [&] {
-        if (rhs.type().id() == type_id::DECIMAL32) {
-          auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else if (rhs.type().id() == type_id::DECIMAL64) {
-          auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        }
-      }();
-      binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
-    }
-  } else {
-    binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  }
-  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
-}
-
-/**
- * @brief Function to compute binary operation of two `column_view`s
- *
- * @param lhs Left-hand side `column_view` used in the binary operation
- * @param rhs Right-hand side `column_view` used in the binary operation
- * @param op `binary_operator` to be used to combine `lhs` and `rhs`
- * @param mr Device memory resource to use for device memory allocation
- * @param stream CUDA stream used for device memory operations
- * @return std::unique_ptr<column> Resulting output column from the binary operation
- */
-std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
-                                                     column_view const& rhs,
-                                                     binary_operator op,
-                                                     cudf::data_type output_type,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
-{
-  using namespace numeric;
-
-  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
-
-  if (lhs.is_empty() or rhs.is_empty())
-    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
-                                                     : cudf::data_type{lhs.type().id(), scale};
-  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view    = out->mutable_view();
-
-  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
-    if (rhs.type().scale() < lhs.type().scale()) {
-      auto const diff   = lhs.type().scale() - rhs.type().scale();
-      auto const result = [&] {
-        if (lhs.type().id() == type_id::DECIMAL32) {
-          auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else if (lhs.type().id() == type_id::DECIMAL64) {
-          auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
-        }
-      }();
-      binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
-    } else {
-      auto const diff   = rhs.type().scale() - lhs.type().scale();
-      auto const result = [&] {
-        if (lhs.type().id() == type_id::DECIMAL32) {
-          auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else if (lhs.type().id() == type_id::DECIMAL64) {
-          auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        } else {
-          CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL128, "Unexpected DTYPE");
-          auto const factor = numeric::detail::ipow<__int128_t, Radix::BASE_10>(diff);
-          auto const scalar = make_fixed_point_scalar<decimal128>(factor, scale_type{-diff});
-          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
-        }
-      }();
-      binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
-    }
-  } else {
-    binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  }
-  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
-}
 
 std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          column_view const& rhs,
@@ -669,8 +432,8 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
-    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
+  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) TODO
+  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
@@ -697,8 +460,8 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
-    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
+  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
@@ -727,8 +490,8 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
-    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
+  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) // TODO
+  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 0c546424d0b..a0847f3eff2 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2053,7 +2053,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2086,7 +2086,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2108,7 +2108,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2127,7 +2127,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2146,7 +2146,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2163,7 +2163,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv3)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2183,7 +2183,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2202,7 +2202,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2221,7 +2221,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd3)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2238,7 +2238,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2255,7 +2255,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd5)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, lhs->type(), static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2272,8 +2272,8 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd6)
   auto const expected2 = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
   auto const type1     = cudf::data_type{cudf::type_to_id<decimalXX>(), 0};
   auto const type2     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
-  auto const result1   = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type1);
-  auto const result2   = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type2);
+  auto const result1   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1);
+  auto const result2   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -2305,7 +2305,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiplyScalar)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::MUL, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2324,7 +2324,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpSimplePlus)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2340,8 +2340,8 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimple)
   auto const col2     = fp_wrapper<RepType>{{100, 200, 300, 400}, scale_type{-2}};
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const result = cudf::jit::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2357,7 +2357,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0)
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
   auto const result =
-    cudf::jit::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+    cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2372,8 +2372,8 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0Null)
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result = cudf::jit::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2388,8 +2388,8 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale2Null)
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result = cudf::jit::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2415,8 +2415,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(iota_3).type(),
                                                    static_cast<cudf::column_view>(zeros_3).type());
-  auto const iota_3_after_add =
-    cudf::jit::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
+  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
 
@@ -2427,15 +2426,15 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater)
 
   auto const btype = cudf::data_type{type_id::BOOL8};
   auto const equal_result =
-    cudf::jit::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
+    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
 
   auto const less_result =
-    cudf::jit::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
+    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
 
   auto const greater_result =
-    cudf::jit::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
+    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
 }
 
@@ -2454,7 +2453,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMaxSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2474,7 +2473,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMinSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2490,7 +2489,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullEqualsSimple)
   auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
   auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
 
-  auto const result = cudf::jit::binary_operation(
+  auto const result = cudf::binary_operation(
     col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
@@ -2507,7 +2506,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div)
   auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2523,7 +2522,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div2)
   auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2539,7 +2538,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div3)
   auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2555,7 +2554,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div4)
   auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2572,7 +2571,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div6)
   auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2589,7 +2588,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div7)
   auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2605,7 +2604,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div8)
   auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2621,7 +2620,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div9)
   auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2637,7 +2636,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div10)
   auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2653,7 +2652,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div11)
   auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2667,9 +2666,9 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
   auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
   auto const float_type    = data_type{type_id::FLOAT32};
-  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
                cudf::logic_error);
-  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
                cudf::logic_error);
 }
 

From 5ebd1bbef5e8a41aa1d3f623639bb90fd624eb20 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Fri, 17 Sep 2021 23:33:46 +0000
Subject: [PATCH 052/112] add checks to jit binary op

---
 cpp/src/binaryop/binaryop.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 486ace3424f..97716e3b07f 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -432,11 +432,10 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) TODO
-  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
-
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
+  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
+  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
   CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
   CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
 
@@ -460,11 +459,10 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
-  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
-
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
+  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
+  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
   CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
   CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
 
@@ -490,11 +488,10 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) // TODO
-  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
-
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
+  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
+  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
   CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
   CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
 

From cb4e38910edc512fd1971783f947b2c2574584e3 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 21 Sep 2021 04:12:14 +0000
Subject: [PATCH 053/112] Final changes for binary ops

---
 cpp/src/binaryop/binaryop.cpp              | 50 ++++++++++++----------
 cpp/tests/binaryop/binop-compiled-test.cpp | 26 ++++-------
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 97716e3b07f..9c8817a6b4d 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -266,6 +266,28 @@ void binary_operation(mutable_column_view& out,
 
 // Compiled Binary operation
 namespace compiled {
+
+template <typename Lhs, typename Rhs>
+void fixed_point_binary_operation_validation(binary_operator op,
+                                             Lhs lhs,
+                                             Rhs rhs,
+                                             thrust::optional<cudf::data_type> output_type = {})
+{
+  CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type.");
+  CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type.");
+  CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op),
+               "Unsupported fixed_point binary operation");
+  CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch");
+  if (output_type.has_value()) {
+    if (binops::is_comparison_binop(op))
+      CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
+                   "Comparison operations require boolean output type.");
+    else
+      CUDF_EXPECTS(is_fixed_point(output_type.value()),
+                   "fixed_point binary operations require fixed_point output type.");
+  }
+}
+
 /**
  * @copydoc cudf::binary_operation(column_view const&, column_view const&,
  * binary_operator, data_type, rmm::mr::device_memory_resource*)
@@ -291,6 +313,11 @@ std::unique_ptr<column> binary_operation(LhsType const& lhs,
   if (not cudf::binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op))
     CUDF_FAIL("Unsupported operator for these types");
 
+  if (cudf::is_fixed_point(lhs.type()) or cudf::is_fixed_point(rhs.type())) {
+    cudf::binops::compiled::fixed_point_binary_operation_validation(
+      op, lhs.type(), rhs.type(), output_type);
+  }
+
   auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
   if constexpr (std::is_same_v<LhsType, column_view>)
@@ -398,27 +425,6 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
   }
 };
 
-template <typename Lhs, typename Rhs>
-void fixed_point_binary_operation_validation(binary_operator op,
-                                             Lhs lhs,
-                                             Rhs rhs,
-                                             thrust::optional<cudf::data_type> output_type = {})
-{
-  CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op),
-               "Unsupported fixed_point binary operation");
-  CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch");
-  if (output_type.has_value()) {
-    if (binops::is_comparison_binop(op))
-      CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
-                   "Comparison operations require boolean output type.");
-    else
-      CUDF_EXPECTS(is_fixed_point(output_type.value()),
-                   "fixed_point binary operations require fixed_point output type.");
-  }
-}
-
 namespace jit {
 
 std::unique_ptr<column> binary_operation(scalar const& lhs,
@@ -617,7 +623,7 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
                                                          cudf::data_type const& lhs,
                                                          cudf::data_type const& rhs)
 {
-  cudf::detail::fixed_point_binary_operation_validation(op, lhs, rhs);
+  cudf::binops::compiled::fixed_point_binary_operation_validation(op, lhs, rhs);
 
   auto const scale = binary_operation_fixed_point_scale(op, lhs.scale(), rhs.scale());
   return cudf::data_type{lhs.id(), scale};
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 25d2f1d2c24..8fa82a07db7 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -115,7 +115,6 @@ using Add_types =
                     cudf::test::Types<duration_us, duration_us, duration_D>,
                     // cudf::test::Types<duration_s, int16_t, int64_t>, //valid
                     cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<int, numeric::decimal32, numeric::decimal32>,
                     // Extras
                     cudf::test::Types<duration_D, duration_D, duration_D>,
                     cudf::test::Types<timestamp_D, timestamp_D, duration_D>,
@@ -144,8 +143,7 @@ using Sub_types =
                     cudf::test::Types<timestamp_s, timestamp_D, duration_s>,  // t - d
                     cudf::test::Types<duration_ns, duration_us, duration_s>,  // d - d
                     cudf::test::Types<duration_us, duration_us, duration_s>,  // d - d
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<int, numeric::decimal32, numeric::decimal32>>;
+                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Sub : public BinaryOperationCompiledTest<T> {
 };
@@ -166,9 +164,7 @@ using Mul_types =
                     cudf::test::Types<duration_s, u_int64_t, duration_s>,
                     cudf::test::Types<duration_ms, duration_D, int16_t>,
                     cudf::test::Types<duration_ns, duration_us, uint8_t>,
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<int, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<numeric::decimal32, int, int>>;
+                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Mul : public BinaryOperationCompiledTest<T> {
 };
@@ -192,8 +188,7 @@ using Div_types =
                     cudf::test::Types<double, duration_D, duration_ns>,
                     cudf::test::Types<float, duration_ms, duration_ns>,
                     cudf::test::Types<u_int64_t, duration_us, duration_ns>,
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<int, numeric::decimal32, numeric::decimal32>>;
+                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Div : public BinaryOperationCompiledTest<T> {
 };
@@ -209,13 +204,10 @@ TYPED_TEST(BinaryOperationCompiledTest_Div, Vector_Vector)
 // n n / n
 // t
 // d
-using TrueDiv_types =
-  cudf::test::Types<cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
-                    cudf::test::Types<double, int8_t, int64_t>,
-                    cudf::test::Types<int8_t, bool, u_int32_t>,
-                    cudf::test::Types<u_int64_t, float, int16_t>,
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<int, numeric::decimal32, numeric::decimal32>>;
+using TrueDiv_types = cudf::test::Types<cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
+                                        cudf::test::Types<double, int8_t, int64_t>,
+                                        cudf::test::Types<int8_t, bool, u_int32_t>,
+                                        cudf::test::Types<u_int64_t, float, int16_t>>;
 template <typename T>
 struct BinaryOperationCompiledTest_TrueDiv : public BinaryOperationCompiledTest<T> {
 };
@@ -519,9 +511,7 @@ using Null_types =
                     cudf::test::Types<timestamp_s, timestamp_D, timestamp_s>,
                     cudf::test::Types<duration_ns, duration_us, duration_s>,
                     // cudf::test::Types<std::string, std::string, std::string>, // only fixed-width
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<numeric::decimal32, uint32_t, numeric::decimal32>,
-                    cudf::test::Types<int64_t, numeric::decimal64, int64_t>>;
+                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
 
 template <typename T>
 struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest<T> {

From 4c81f57fd36309eecf8d286ee42428931d5cc0de Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Tue, 21 Sep 2021 04:22:28 +0000
Subject: [PATCH 054/112] Add more binop tests

---
 cpp/tests/binaryop/binop-compiled-test.cpp | 102 ++++++++++++---------
 1 file changed, 57 insertions(+), 45 deletions(-)

diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 8fa82a07db7..206b0252abf 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -107,20 +107,22 @@ struct BinaryOperationCompiledTest : public BinaryOperationTest {
 // t      	     	t + d
 // d      	d + t	d + d
 
-using Add_types =
-  cudf::test::Types<cudf::test::Types<bool, bool, float>,
-                    cudf::test::Types<int16_t, double, uint8_t>,
-                    cudf::test::Types<timestamp_s, timestamp_s, duration_s>,
-                    cudf::test::Types<timestamp_ns, duration_ms, timestamp_us>,
-                    cudf::test::Types<duration_us, duration_us, duration_D>,
-                    // cudf::test::Types<duration_s, int16_t, int64_t>, //valid
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-                    // Extras
-                    cudf::test::Types<duration_D, duration_D, duration_D>,
-                    cudf::test::Types<timestamp_D, timestamp_D, duration_D>,
-                    cudf::test::Types<timestamp_s, timestamp_D, duration_s>,
-                    cudf::test::Types<timestamp_ms, timestamp_ms, duration_s>,
-                    cudf::test::Types<timestamp_ns, timestamp_ms, duration_ns>>;
+using Add_types = cudf::test::Types<
+  cudf::test::Types<bool, bool, float>,
+  cudf::test::Types<int16_t, double, uint8_t>,
+  cudf::test::Types<timestamp_s, timestamp_s, duration_s>,
+  cudf::test::Types<timestamp_ns, duration_ms, timestamp_us>,
+  cudf::test::Types<duration_us, duration_us, duration_D>,
+  // cudf::test::Types<duration_s, int16_t, int64_t>, //valid
+  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
+  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
+  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>,
+  // Extras
+  cudf::test::Types<duration_D, duration_D, duration_D>,
+  cudf::test::Types<timestamp_D, timestamp_D, duration_D>,
+  cudf::test::Types<timestamp_s, timestamp_D, duration_s>,
+  cudf::test::Types<timestamp_ms, timestamp_ms, duration_s>,
+  cudf::test::Types<timestamp_ns, timestamp_ms, duration_ns>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Add : public BinaryOperationCompiledTest<T> {
 };
@@ -137,13 +139,15 @@ TYPED_TEST(BinaryOperationCompiledTest_Add, Vector_Vector)
 // t      	t - t	t - d
 // d      	     	d - d
 
-using Sub_types =
-  cudf::test::Types<cudf::test::Types<int32_t, bool, float>,                  // n - n
-                    cudf::test::Types<duration_D, timestamp_D, timestamp_D>,  // t - t
-                    cudf::test::Types<timestamp_s, timestamp_D, duration_s>,  // t - d
-                    cudf::test::Types<duration_ns, duration_us, duration_s>,  // d - d
-                    cudf::test::Types<duration_us, duration_us, duration_s>,  // d - d
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
+using Sub_types = cudf::test::Types<
+  cudf::test::Types<int32_t, bool, float>,                  // n - n
+  cudf::test::Types<duration_D, timestamp_D, timestamp_D>,  // t - t
+  cudf::test::Types<timestamp_s, timestamp_D, duration_s>,  // t - d
+  cudf::test::Types<duration_ns, duration_us, duration_s>,  // d - d
+  cudf::test::Types<duration_us, duration_us, duration_s>,  // d - d
+  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
+  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
+  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Sub : public BinaryOperationCompiledTest<T> {
 };
@@ -159,12 +163,14 @@ TYPED_TEST(BinaryOperationCompiledTest_Sub, Vector_Vector)
 // n n * n	     	n * d
 // t
 // d d * n
-using Mul_types =
-  cudf::test::Types<cudf::test::Types<int32_t, u_int64_t, float>,
-                    cudf::test::Types<duration_s, u_int64_t, duration_s>,
-                    cudf::test::Types<duration_ms, duration_D, int16_t>,
-                    cudf::test::Types<duration_ns, duration_us, uint8_t>,
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
+using Mul_types = cudf::test::Types<
+  cudf::test::Types<int32_t, u_int64_t, float>,
+  cudf::test::Types<duration_s, u_int64_t, duration_s>,
+  cudf::test::Types<duration_ms, duration_D, int16_t>,
+  cudf::test::Types<duration_ns, duration_us, uint8_t>,
+  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
+  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
+  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Mul : public BinaryOperationCompiledTest<T> {
 };
@@ -180,15 +186,17 @@ TYPED_TEST(BinaryOperationCompiledTest_Mul, Vector_Vector)
 // n n / n
 // t
 // d d / n	     	d / d
-using Div_types =
-  cudf::test::Types<cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
-                    cudf::test::Types<double, int8_t, int64_t>,
-                    cudf::test::Types<duration_ms, duration_s, u_int32_t>,
-                    cudf::test::Types<duration_ns, duration_D, int16_t>,
-                    cudf::test::Types<double, duration_D, duration_ns>,
-                    cudf::test::Types<float, duration_ms, duration_ns>,
-                    cudf::test::Types<u_int64_t, duration_us, duration_ns>,
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
+using Div_types = cudf::test::Types<
+  cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
+  cudf::test::Types<double, int8_t, int64_t>,
+  cudf::test::Types<duration_ms, duration_s, u_int32_t>,
+  cudf::test::Types<duration_ns, duration_D, int16_t>,
+  cudf::test::Types<double, duration_D, duration_ns>,
+  cudf::test::Types<float, duration_ms, duration_ns>,
+  cudf::test::Types<u_int64_t, duration_us, duration_ns>,
+  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
+  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
+  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
 template <typename T>
 struct BinaryOperationCompiledTest_Div : public BinaryOperationCompiledTest<T> {
 };
@@ -459,7 +467,9 @@ using Comparison_types =
                     cudf::test::Types<bool, duration_ns, duration_ns>,
                     cudf::test::Types<bool, duration_us, duration_s>,
                     cudf::test::Types<bool, std::string, std::string>,
-                    cudf::test::Types<bool, numeric::decimal32, numeric::decimal32>>;
+                    cudf::test::Types<bool, numeric::decimal32, numeric::decimal32>,
+                    cudf::test::Types<bool, numeric::decimal64, numeric::decimal64>,
+                    cudf::test::Types<bool, numeric::decimal128, numeric::decimal128>>;
 
 template <typename T>
 struct BinaryOperationCompiledTest_Comparison : public BinaryOperationCompiledTest<T> {
@@ -504,14 +514,16 @@ TYPED_TEST(BinaryOperationCompiledTest_Comparison, GreaterEqual_Vector_Vector)
 // d          .
 // s             .
 // dc .             .
-using Null_types =
-  cudf::test::Types<cudf::test::Types<int16_t, int8_t, int16_t>,
-                    cudf::test::Types<uint16_t, uint32_t, uint16_t>,
-                    cudf::test::Types<double, uint64_t, double>,
-                    cudf::test::Types<timestamp_s, timestamp_D, timestamp_s>,
-                    cudf::test::Types<duration_ns, duration_us, duration_s>,
-                    // cudf::test::Types<std::string, std::string, std::string>, // only fixed-width
-                    cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>>;
+using Null_types = cudf::test::Types<
+  cudf::test::Types<int16_t, int8_t, int16_t>,
+  cudf::test::Types<uint16_t, uint32_t, uint16_t>,
+  cudf::test::Types<double, uint64_t, double>,
+  cudf::test::Types<timestamp_s, timestamp_D, timestamp_s>,
+  cudf::test::Types<duration_ns, duration_us, duration_s>,
+  // cudf::test::Types<std::string, std::string, std::string>, // only fixed-width
+  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
+  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
+  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
 
 template <typename T>
 struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest<T> {

From 58b23cd2c46dbccc72fb0300ba1c45b1d5033b6d Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 22 Sep 2021 05:33:30 +0000
Subject: [PATCH 055/112] Temporary fix for chrono groupby min_tests

---
 cpp/include/cudf/detail/utilities/device_operators.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index f8792061612..1ebecdf1a83 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -99,6 +99,7 @@ struct DeviceMin {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
+    if constexpr (cudf::is_chrono<T>()) return std::numeric_limits<T>::max();
     return cuda::std::numeric_limits<T>::max();
   }
 

From 1f3284fd37851fef70da3776fa9924de4221b8b1 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 22 Sep 2021 15:42:07 +0000
Subject: [PATCH 056/112] decimal128 comparison tests

---
 cpp/tests/binaryop/binop-integration-test.cpp | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index a0847f3eff2..d2c39454f90 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2672,6 +2672,38 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
                cudf::logic_error);
 }
 
+template <typename T>
+struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture {
+};
+
+using Decimal64And128Types = cudf::test::Types<numeric::decimal64, numeric::decimal128>;
+TYPED_TEST_CASE(FixedPointTest_64_128_Reps, Decimal64And128Types);
+
+TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  for (auto const rhs_value : {10000000000000000, 100000000000000000}) {
+    auto const lhs       = fp_wrapper<RepType>{{33041, 97290, 36438, 25379, 48473}, scale_type{2}};
+    auto const rhs       = make_fixed_point_scalar<decimalXX>(rhs_value, scale_type{0});
+    auto const trues     = wrapper<bool>{{1, 1, 1, 1, 1}};
+    auto const falses    = wrapper<bool>{{0, 0, 0, 0, 0}};
+    auto const bool_type = cudf::data_type{type_id::BOOL8};
+
+    auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type);
+    auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
+    auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
+    auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, a->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, b->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, c->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, d->view());
+  }
+}
+
 }  // namespace binop
 }  // namespace test
 }  // namespace cudf

From 7713bc4aeac00c8ac02f8c431a24fd950e3a0531 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 22 Sep 2021 15:59:00 +0000
Subject: [PATCH 057/112] Enhance decimal128 comparison tests

---
 cpp/tests/binaryop/binop-integration-test.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index d2c39454f90..495b6c09698 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2696,11 +2696,19 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
     auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
     auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
     auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type);
+    auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
+    auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, a->view());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, b->view());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, c->view());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, d->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, e->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, f->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, g->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, h->view());
   }
 }
 

From 2de00b8b3acd8c2fe6cc38df39b85c195c613e81 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 22 Sep 2021 16:15:41 +0000
Subject: [PATCH 058/112] small cleanup

---
 cpp/tests/binaryop/binop-integration-test.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 495b6c09698..6b7d8ead299 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2701,14 +2701,14 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
     auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
     auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
 
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, a->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, b->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, c->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, d->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, e->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(trues, f->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, g->view());
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(falses, h->view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
   }
 }
 

From ea36188cc65ecebfcec41af22b0bbeebc04b3229 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Thu, 23 Sep 2021 02:21:14 +0000
Subject: [PATCH 059/112] cleanup

---
 cpp/src/aggregation/aggregation.cu | 3 +--
 cpp/src/groupby/hash/groupby.cu    | 7 ++++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index b4d4b99b87a..02998b84ffd 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -28,8 +28,7 @@ void initialize_with_identity(mutable_table_view& table,
   // kernel per column
   for (size_type i = 0; i < table.num_columns(); ++i) {
     auto col = table.column(i);
-    dispatch_type_and_aggregation(
-      col.type(), aggs[i], identity_initializer{}, col, stream);  // TODO SFINAE for decimal
+    dispatch_type_and_aggregation(col.type(), aggs[i], identity_initializer{}, col, stream);
   }
 }
 
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 2260b39b3c7..77b23e06a09 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -628,7 +628,6 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
 
 }  // namespace
 
-// TODO move this to more appropriate file
 struct has_atomic_support_type_dispatcher {
   template <typename T>
   bool operator()()
@@ -637,6 +636,12 @@ struct has_atomic_support_type_dispatcher {
   }
 };
 
+/**
+ * @brief Indicates whether `type` has support for atomics
+ *
+ * @param type  The `data_type` that is being checked
+ * @return      `true` if `type` has support for atomics, `false` otherwise
+ */
 bool has_atomic_support(cudf::data_type const& type)
 {
   return type_dispatcher(type, has_atomic_support_type_dispatcher{});

From d093ae82778469b1cead190044b43113cc23e1ce Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Tue, 5 Oct 2021 05:17:09 -0500
Subject: [PATCH 060/112] Fix rounding issues with DECIMAL128

---
 cpp/src/round/round.cu          | 10 +++++-----
 cpp/tests/round/round_tests.cpp | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 3a6a2beda45..b4472c5b61b 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -46,26 +46,26 @@ inline double __device__ generic_round_half_even(double d) { return rint(d); }
 inline float __device__ generic_modf(float a, float* b) { return modff(a, b); }
 inline double __device__ generic_modf(double a, double* b) { return modf(a, b); }
 
-template <typename T, typename std::enable_if_t<std::is_signed<T>::value>* = nullptr>
+template <typename T, typename std::enable_if_t<cuda::std::is_signed<T>::value>* = nullptr>
 T __device__ generic_abs(T value)
 {
-  return abs(value);
+  return value < 0 ? -value : value;
 }
 
-template <typename T, typename std::enable_if_t<not std::is_signed<T>::value>* = nullptr>
+template <typename T, typename std::enable_if_t<not cuda::std::is_signed<T>::value>* = nullptr>
 T __device__ generic_abs(T value)
 {
   return value;
 }
 
-template <typename T, typename std::enable_if_t<std::is_signed<T>::value>* = nullptr>
+template <typename T, typename std::enable_if_t<cuda::std::is_signed<T>::value>* = nullptr>
 int16_t __device__ generic_sign(T value)
 {
   return value < 0 ? -1 : 1;
 }
 
 // this is needed to suppress warning: pointless comparison of unsigned integer with zero
-template <typename T, typename std::enable_if_t<not std::is_signed<T>::value>* = nullptr>
+template <typename T, typename std::enable_if_t<not cuda::std::is_signed<T>::value>* = nullptr>
 int16_t __device__ generic_sign(T)
 {
   return 1;
diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index b4050625570..4d1f66443c2 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -587,6 +587,21 @@ TEST_F(RoundTests, Int64AtBoundaryHalfUp)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected5, result5->view());
 }
 
+TEST_F(RoundTests, FixedPoint128HalfUp)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    auto const input    = fp_wrapper{{-160714515306}, scale_type{-13}};
+    auto const expected = fp_wrapper{{-16071451531}, scale_type{-12}};
+    auto const result   = cudf::round(input, 12, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
 TEST_F(RoundTests, FixedPointAtBoundaryTestHalfUp)
 {
   using namespace numeric;

From 7eedaea94ae18681c269ab909fe979ca19fae89a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <choekstra@nvidia.com>
Date: Wed, 6 Oct 2021 15:46:33 +0000
Subject: [PATCH 061/112] Use numeric::detail::abs in round.cu

---
 cpp/src/round/round.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index b4472c5b61b..36dd2dabd72 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/round.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/fixed_point/temporary.hpp>
 #include <cudf/round.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
@@ -49,7 +50,7 @@ inline double __device__ generic_modf(double a, double* b) { return modf(a, b);
 template <typename T, typename std::enable_if_t<cuda::std::is_signed<T>::value>* = nullptr>
 T __device__ generic_abs(T value)
 {
-  return value < 0 ? -value : value;
+  return numeric::detail::abs(value);
 }
 
 template <typename T, typename std::enable_if_t<not cuda::std::is_signed<T>::value>* = nullptr>

From a8109278c20acb96e4eab396a5d7b20530b0dc11 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 20 Oct 2021 16:35:05 -0400
Subject: [PATCH 062/112] Add cuda:: and if constexpr check

---
 cpp/include/cudf/detail/utilities/device_operators.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 1ebecdf1a83..8867cc9bf68 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -107,7 +107,7 @@ struct DeviceMin {
   static constexpr T identity()
   {
     CUDF_FAIL("fixed_point does not yet support DeviceMin identity");
-    return std::numeric_limits<T>::max();
+    return cuda::std::numeric_limits<T>::max();
   }
 
   // @brief identity specialized for string_view
@@ -138,6 +138,7 @@ struct DeviceMax {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
+    if constexpr (cudf::is_chrono<T>()) return std::numeric_limits<T>::lowest();
     return cuda::std::numeric_limits<T>::lowest();
   }
 
@@ -145,7 +146,7 @@ struct DeviceMax {
   static constexpr T identity()
   {
     CUDF_FAIL("fixed_point does not yet support DeviceMax identity");
-    return std::numeric_limits<T>::lowest();
+    return cuda::std::numeric_limits<T>::lowest();
   }
 
   template <typename T, typename std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>

From 9286b43f82d99123ef6ab9f8e4d71cd1aa440881 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 20 Oct 2021 16:47:48 -0400
Subject: [PATCH 063/112] Clang format :)

---
 cpp/src/binaryop/binaryop.cpp                 |  2 +-
 cpp/src/io/json/json_gpu.cu                   |  8 +++----
 cpp/src/io/parquet/reader_impl.cu             |  6 ++---
 .../strings/convert/convert_fixed_point.cu    | 22 +++++++++----------
 cpp/tests/binaryop/binop-integration-test.cpp |  4 ++--
 cpp/tests/copying/concatenate_tests.cu        | 14 ++++++------
 cpp/tests/merge/merge_test.cpp                |  4 ++--
 cpp/tests/reductions/reduction_tests.cpp      |  4 ++--
 cpp/tests/reductions/scan_tests.cpp           |  2 +-
 cpp/tests/strings/fixed_point_tests.cpp       |  6 ++---
 cpp/tests/transform/row_bit_count_test.cu     |  8 +++----
 11 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index d9d1866edea..c09962219a9 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -597,7 +597,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 
   auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr);
   auto out      = make_fixed_width_column(
-         output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+    output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
 
   // Check for 0 sized data
   if (lhs.is_empty() or rhs.is_empty()) return out;
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index 9358e7f7709..ec10e32e55d 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -62,9 +62,9 @@ __device__ std::pair<char const*, char const*> limit_range_to_brackets(char cons
   auto const data_begin = thrust::next(thrust::find_if(
     thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; }));
   auto const data_end   = thrust::next(thrust::find_if(thrust::seq,
-                                                       thrust::make_reverse_iterator(end),
-                                                       thrust::make_reverse_iterator(data_begin),
-                                                       [](auto c) { return c == ']' || c == '}'; }))
+                                                     thrust::make_reverse_iterator(end),
+                                                     thrust::make_reverse_iterator(data_begin),
+                                                     [](auto c) { return c == ']' || c == '}'; }))
                           .base();
   return {data_begin, data_end};
 }
@@ -565,7 +565,7 @@ __global__ void detect_data_types_kernel(
       bool is_negative       = (*desc.value_begin == '-');
       char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+'));
       cudf::size_type* ptr   = cudf::io::gpu::infer_integral_field_counter(
-          data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]);
+        data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]);
       atomicAdd(ptr, 1);
     } else if (is_like_float(
                  value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) {
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 6d4fbcdf4e4..2e606610c4e 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -752,9 +752,9 @@ class aggregate_metadata {
           // Check if the path exists in our selected_columns and if not, add it.
           auto const& name_to_find = path[depth];
           auto found_col           = std::find_if(
-                      array_to_find_in->begin(),
-                      array_to_find_in->end(),
-                      [&name_to_find](column_name_info const& col) { return col.name == name_to_find; });
+            array_to_find_in->begin(),
+            array_to_find_in->end(),
+            [&name_to_find](column_name_info const& col) { return col.name == name_to_find; });
           if (found_col == array_to_find_in->end()) {
             auto& col        = array_to_find_in->emplace_back(name_to_find);
             array_to_find_in = &col.children;
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 1d94c94f5b3..ba96f8ebe89 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -139,11 +139,11 @@ struct dispatch_to_fixed_point_fn {
 
     // create output column
     auto results   = make_fixed_point_column(output_type,
-                                             input.size(),
-                                             cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                             input.null_count(),
-                                             stream,
-                                             mr);
+                                           input.size(),
+                                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                           input.null_count(),
+                                           stream,
+                                           mr);
     auto d_results = results->mutable_view().data<DecimalType>();
 
     // convert strings into decimal values
@@ -211,7 +211,7 @@ struct decimal_to_string_size_fn {
 
     auto const abs_value = numeric::detail::abs(value);
     auto const exp_ten   = static_cast<int64_t>(exp10(
-        static_cast<double>(-scale)));  // TODO probably broken (might need numeric::detail::exp10)
+      static_cast<double>(-scale)));  // TODO probably broken (might need numeric::detail::exp10)
     auto const fraction  = count_digits(abs_value % exp_ten);
     auto const num_zeros = std::max(0, (-scale - fraction));
     return static_cast<int32_t>(value < 0) +    // sign if negative
@@ -349,11 +349,11 @@ struct dispatch_is_fixed_point_fn {
 
     // create output column
     auto results   = make_numeric_column(data_type{type_id::BOOL8},
-                                         input.size(),
-                                         cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                         input.null_count(),
-                                         stream,
-                                         mr);
+                                       input.size(),
+                                       cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                       input.null_count(),
+                                       stream,
+                                       mr);
     auto d_results = results->mutable_view().data<bool>();
 
     // check strings for valid fixed-point chars
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 6b7d8ead299..fa3d9d13f0d 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2034,7 +2034,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd)
 
   auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
     return decimalXX{i, scale_type{0}};
-       });
+  });
   auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
   auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
   auto expected   = std::vector<decimalXX>(sz);
@@ -2067,7 +2067,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply)
 
   auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
     return decimalXX{i, scale_type{0}};
-       });
+  });
   auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
   auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
   auto expected   = std::vector<decimalXX>(sz);
diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu
index 38a7fa4db58..98ffb121a9d 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -361,7 +361,7 @@ TEST_F(OverflowTest, OverflowTest)
     auto offsets    = cudf::test::fixed_width_column_wrapper<offset_type>{0, size};
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size);
     auto col        = cudf::make_strings_column(
-             1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
+      1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
 
     table_view tbl({*col});
     EXPECT_THROW(cudf::concatenate(std::vector<table_view>({tbl, tbl, tbl, tbl, tbl, tbl})),
@@ -376,7 +376,7 @@ TEST_F(OverflowTest, OverflowTest)
     auto many_offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, size + 1);
     auto chars        = cudf::test::fixed_width_column_wrapper<int8_t>{0, 1, 2};
     auto col          = cudf::make_strings_column(
-               size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{});
+      size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{});
 
     table_view tbl({*col});
     EXPECT_THROW(cudf::concatenate(std::vector<table_view>({tbl, tbl, tbl, tbl, tbl, tbl})),
@@ -484,7 +484,7 @@ TEST_F(OverflowTest, Presliced)
     cudf::test::fixed_width_column_wrapper<int> offsets(offset_gen, offset_gen + num_rows + 1);
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size);
     auto col        = cudf::make_strings_column(
-             num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
+      num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::split(*col, {(num_rows / 2) - 1});
 
@@ -515,7 +515,7 @@ TEST_F(OverflowTest, Presliced)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size);
     auto col        = cudf::make_strings_column(
-             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     // should pass (with 2 rows to spare)
     // leaving this disabled as it typically runs out of memory on a T4
@@ -684,7 +684,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto col        = cudf::make_strings_column(
-             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::slice(*col, {16, 32});
 
@@ -712,7 +712,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto col        = cudf::make_lists_column(
-             num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     auto sliced = cudf::slice(*col, {16, 32});
 
@@ -740,7 +740,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices)
                            offsets->mutable_view().begin<offset_type>());
     auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, inner_size);
     auto list_col   = cudf::make_lists_column(
-        num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
+      num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{});
 
     // struct
     std::vector<std::unique_ptr<column>> children;
diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp
index de6eefb989a..64ab3d137d6 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -453,7 +453,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns)
     } else {
       return row * 2;
     }
-        });
+  });
   auto valid_sequence1 = cudf::detail::make_counting_transform_iterator(
     0, [inputRows](auto row) { return (row < inputRows - 1); });
   cudf::test::fixed_width_column_wrapper<TypeParam, typename decltype(sequence1)::value_type>
@@ -698,7 +698,7 @@ TEST_F(MergeTest, KeysWithNulls)
   cudf::size_type nrows = 13200;  // Ensures that thrust::merge uses more than one tile/block
   auto data_iter        = thrust::make_counting_iterator<int32_t>(0);
   auto valids1          = cudf::detail::make_counting_transform_iterator(
-             0, [](auto row) { return (row % 10 == 0) ? false : true; });
+    0, [](auto row) { return (row % 10 == 0) ? false : true; });
   cudf::test::fixed_width_column_wrapper<int32_t> data1(data_iter, data_iter + nrows, valids1);
   auto valids2 = cudf::detail::make_counting_transform_iterator(
     0, [](auto row) { return (row % 15 == 0) ? false : true; });
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index a1ac942cdab..6f292ac6d98 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1390,7 +1390,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointReductionQuantile)
     for (auto const i : {0, 1, 2, 3, 4}) {
       auto const expected = decimalXX{scaled_integer<RepType>{i + 1, scale}};
       auto const result   = cudf::reduce(
-          column, cudf::make_quantile_aggregation({i / 4.0}, cudf::interpolation::LINEAR), out_type);
+        column, cudf::make_quantile_aggregation({i / 4.0}, cudf::interpolation::LINEAR), out_type);
       auto const result_scalar = static_cast<cudf::scalar_type_t<decimalXX>*>(result.get());
       EXPECT_EQ(result_scalar->fixed_point_value(), expected);
     }
@@ -1413,7 +1413,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointReductionNthElement)
     for (auto const i : {0, 1, 2, 3}) {
       auto const expected = decimalXX{scaled_integer<RepType>{values[i], scale}};
       auto const result   = cudf::reduce(
-          column, cudf::make_nth_element_aggregation(i, cudf::null_policy::INCLUDE), out_type);
+        column, cudf::make_nth_element_aggregation(i, cudf::null_policy::INCLUDE), out_type);
       auto const result_scalar = static_cast<cudf::scalar_type_t<decimalXX>*>(result.get());
       EXPECT_EQ(result_scalar->fixed_point_value(), expected);
     }
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index aedc76c879c..87b329b36c1 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -410,7 +410,7 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
     return std::string(s);
   });
   auto validity   = cudf::detail::make_counting_transform_iterator(
-      0, [](auto idx) -> bool { return (idx % 23) != 22; });
+    0, [](auto idx) -> bool { return (idx % 23) != 22; });
   strings_column_wrapper col(data_begin, data_begin + row_count, validity);
 
   thrust::host_vector<std::string> v(data_begin, data_begin + row_count);
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 2b6883a080d..3ceaba2637b 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -193,19 +193,19 @@ TEST_F(StringsConvertTest, IsFixedPoint)
                                                   "170141183460469231731687303715884105727",
                                                   "170141183460469231731687303715884105728"});
   results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                                        cudf::data_type{cudf::type_id::DECIMAL32});
+                                          cudf::data_type{cudf::type_id::DECIMAL32});
   auto const expected32 = cudf::test::fixed_width_column_wrapper<bool>(
     {true, true, false, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32);
 
   results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                                        cudf::data_type{cudf::type_id::DECIMAL64});
+                                          cudf::data_type{cudf::type_id::DECIMAL64});
   auto const expected64 = cudf::test::fixed_width_column_wrapper<bool>(
     {true, true, true, true, true, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
   results                = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
-                                                         cudf::data_type{cudf::type_id::DECIMAL128});
+                                          cudf::data_type{cudf::type_id::DECIMAL128});
   auto const expected128 = cudf::test::fixed_width_column_wrapper<bool>(
     {true, true, true, true, true, true, true, false, true, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected128);
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 73f4e1cb465..44a5ad44cea 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -489,10 +489,10 @@ TEST_F(RowBitCount, NestedTypes)
                                                                l4_offsets.end());
     auto const l4_size = l4_offsets.size() - 1;
     auto l4            = cudf::make_lists_column(static_cast<cudf::size_type>(l4_size),
-                                                 l4_offsets_col.release(),
-                                                 innermost_struct.release(),
-                                                 cudf::UNKNOWN_NULL_COUNT,
-                                                 rmm::device_buffer{});
+                                      l4_offsets_col.release(),
+                                      innermost_struct.release(),
+                                      cudf::UNKNOWN_NULL_COUNT,
+                                      rmm::device_buffer{});
 
     // inner struct
     std::vector<std::unique_ptr<column>> inner_struct_children;

From 4ad26f4fee0fd615943c243906741141defad3be Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 00:31:39 -0400
Subject: [PATCH 064/112] Cleanup

---
 cpp/include/cudf_test/column_wrapper.hpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index f7ec3738b90..ccfb9d24a7d 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -511,9 +511,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
 
     auto const size      = cudf::distance(begin, end);
     auto const elements  = thrust::host_vector<Rep>(begin, end);
-    auto const id        = std::is_same_v<Rep, int32_t>   ? type_id::DECIMAL32
-                           : std::is_same_v<Rep, int64_t> ? type_id::DECIMAL64
-                                                          : type_id::DECIMAL128;
+    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10> >();
     auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{
@@ -577,9 +575,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
 
     auto const size      = cudf::distance(begin, end);
     auto const elements  = thrust::host_vector<Rep>(begin, end);
-    auto const id        = std::is_same_v<Rep, int32_t>   ? type_id::DECIMAL32
-                           : std::is_same_v<Rep, int64_t> ? type_id::DECIMAL64
-                                                          : type_id::DECIMAL128;
+    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10> >();
     auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{

From 3892e7346fe8922d7a17654526c5a56dbdf27986 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 00:56:07 -0400
Subject: [PATCH 065/112] Cleanup

---
 .../cudf/column/column_device_view.cuh        | 46 ++-----------------
 1 file changed, 5 insertions(+), 41 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 1ab259be00a..505ff33ec72 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -421,57 +421,21 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   }
 
   /**
-   * @brief Returns a `numeric::decimal32` element at the specified index for a `fixed_point`
-   * column.
+   * @brief Returns a `numeric::fixed_point` element at the specified index for a `fixed_point` column.
    *
    * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
    * then any attempt to use the result will lead to undefined behavior.
    *
    * @param element_index Position of the desired element
-   * @return numeric::decimal32 representing the element at this index
+   * @return numeric::fixed_point representing the element at this index
    */
-  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, numeric::decimal32>)>
+  template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
   __device__ T element(size_type element_index) const noexcept
   {
     using namespace numeric;
+    using rep = typename T::rep;
     auto const scale = scale_type{_type.scale()};
-    return decimal32{scaled_integer<int32_t>{data<int32_t>()[element_index], scale}};
-  }
-
-  /**
-   * @brief Returns a `numeric::decimal64` element at the specified index for a `fixed_point`
-   * column.
-   *
-   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
-   * then any attempt to use the result will lead to undefined behavior.
-   *
-   * @param element_index Position of the desired element
-   * @return numeric::decimal64 representing the element at this index
-   */
-  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, numeric::decimal64>)>
-  __device__ T element(size_type element_index) const noexcept
-  {
-    using namespace numeric;
-    auto const scale = scale_type{_type.scale()};
-    return decimal64{scaled_integer<int64_t>{data<int64_t>()[element_index], scale}};
-  }
-
-  /**
-   * @brief Returns a `numeric::decimal128` element at the specified index for a `fixed_point`
-   * column.
-   *
-   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
-   * then any attempt to use the result will lead to undefined behavior.
-   *
-   * @param element_index Position of the desired element
-   * @return numeric::decimal128 representing the element at this index
-   */
-  template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, numeric::decimal128>)>
-  __device__ T element(size_type element_index) const noexcept
-  {
-    using namespace numeric;
-    auto const scale = scale_type{_type.scale()};
-    return decimal128{scaled_integer<__int128_t>{data<__int128_t>()[element_index], scale}};
+    return T{scaled_integer<rep>{data<rep>()[element_index], scale}};
   }
 
   /**

From 8e9bd9020af53bab56d32ec780320530d607dac4 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 00:57:43 -0400
Subject: [PATCH 066/112] Missing clang-format

---
 cpp/include/cudf/column/column_device_view.cuh | 5 +++--
 cpp/include/cudf_test/column_wrapper.hpp       | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 505ff33ec72..6ecb0796283 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -421,7 +421,8 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   }
 
   /**
-   * @brief Returns a `numeric::fixed_point` element at the specified index for a `fixed_point` column.
+   * @brief Returns a `numeric::fixed_point` element at the specified index for a `fixed_point`
+   * column.
    *
    * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
    * then any attempt to use the result will lead to undefined behavior.
@@ -433,7 +434,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   __device__ T element(size_type element_index) const noexcept
   {
     using namespace numeric;
-    using rep = typename T::rep;
+    using rep        = typename T::rep;
     auto const scale = scale_type{_type.scale()};
     return T{scaled_integer<rep>{data<rep>()[element_index], scale}};
   }
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index ccfb9d24a7d..c228bea9257 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -511,7 +511,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
 
     auto const size      = cudf::distance(begin, end);
     auto const elements  = thrust::host_vector<Rep>(begin, end);
-    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10> >();
+    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
     auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{
@@ -575,7 +575,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
 
     auto const size      = cudf::distance(begin, end);
     auto const elements  = thrust::host_vector<Rep>(begin, end);
-    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10> >();
+    auto const id        = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
     auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
 
     wrapped.reset(new cudf::column{

From 41cc23a3a1e0119dbb8c9cd7444f35b227ae8156 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 02:20:24 -0400
Subject: [PATCH 067/112] digits10

---
 cpp/src/io/orc/writer_impl.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 011053aefb9..866be896012 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -43,6 +43,8 @@
 #include <numeric>
 #include <utility>
 
+#include <cuda/std/limits>
+
 namespace cudf {
 namespace io {
 namespace detail {
@@ -124,10 +126,11 @@ constexpr int32_t to_clockscale(cudf::type_id timestamp_id)
  */
 constexpr auto orc_precision(cudf::type_id decimal_id)
 {
+  using namespace numeric;
   switch (decimal_id) {
-    case cudf::type_id::DECIMAL32: return 9;
-    case cudf::type_id::DECIMAL64: return 18;
-    case cudf::type_id::DECIMAL128: return 38;
+    case cudf::type_id::DECIMAL32: return cuda::std::numeric_limits<decimal32::rep>::digits10;
+    case cudf::type_id::DECIMAL64: return cuda::std::numeric_limits<decimal64::rep>::digits10;
+    case cudf::type_id::DECIMAL128: return cuda::std::numeric_limits<decimal128::rep>::digits10;
     default: return 0;
   }
 }

From 921ff12d58251c8db963cd56fa71e24cb3c716c9 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 09:36:06 -0400
Subject: [PATCH 068/112] Clean up

---
 cpp/include/cudf/detail/copy_if.cuh | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 587bf3abf24..fb4c636fcb0 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -217,17 +217,7 @@ struct DeviceType<T, std::enable_if_t<cudf::is_timestamp<T>()>> {
 };
 
 template <typename T>
-struct DeviceType<T, std::enable_if_t<std::is_same_v<numeric::decimal32, T>>> {
-  using type = typename cudf::device_storage_type_t<T>;
-};
-
-template <typename T>
-struct DeviceType<T, std::enable_if_t<std::is_same_v<numeric::decimal64, T>>> {
-  using type = typename cudf::device_storage_type_t<T>;
-};
-
-template <typename T>
-struct DeviceType<T, std::enable_if_t<std::is_same_v<numeric::decimal128, T>>> {
+struct DeviceType<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
   using type = typename cudf::device_storage_type_t<T>;
 };
 

From a5e4187f1e5e160eeeb075d684844ece22b664e7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 15:55:08 -0400
Subject: [PATCH 069/112] IO changes

---
 cpp/src/io/orc/reader_impl.cu     | 10 +++-------
 cpp/src/io/orc/stripe_data.cu     |  2 +-
 cpp/src/io/orc/stripe_enc.cu      |  9 ++++-----
 cpp/src/io/orc/writer_impl.cu     |  6 +++---
 cpp/src/io/parquet/writer_impl.cu |  4 +---
 5 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index d1c8c3661f4..e49cf718740 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -81,7 +81,7 @@ constexpr type_id to_type_id(const orc::SchemaType& schema,
     case orc::DATE:
       // There isn't a (DAYS -> np.dtype) mapping
       return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
-    case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL64;
+    case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL128;
     // Need to update once cuDF plans to support map type
     case orc::MAP:
     case orc::LIST: return type_id::LIST;
@@ -1074,7 +1074,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
       break;
 
     case orc::DECIMAL:
-      if (type == type_id::DECIMAL64) {
+      if (type == type_id::DECIMAL128) {
         scale = -static_cast<int32_t>(_metadata->get_types()[orc_col_id].scale.value_or(0));
       }
       out_col = make_empty_column(data_type(type, scale));
@@ -1215,11 +1215,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       auto col_type = to_type_id(
         _metadata->get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
       CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-      // Remove this once we support Decimal128 data type
-      CUDF_EXPECTS(
-        (col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col.id).precision <= 18),
-        "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
-      if (col_type == type_id::DECIMAL64) {
+      if (col_type == type_id::DECIMAL128) {
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index bcbe77d9df8..5993d12fa6f 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1722,7 +1722,7 @@ __global__ void __launch_bounds__(block_size)
             case DOUBLE:
             case LONG:
             case DECIMAL:
-              static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
+              static_cast<__uint128_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
               break;
             case MAP:
             case LIST: {
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 3e313a7399f..ccfa42deea7 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -116,9 +116,8 @@ static inline __device__ uint64_t zigzag(int64_t v)
   return ((v ^ -s) * 2) + s;
 }
 
-static inline __device__ uint64_t zigzag(__int128_t v)
+static inline __device__ __uint128_t zigzag(__int128_t v)
 {
-  // TODO
   int64_t s = (v < 0) ? 1 : 0;
   return ((v ^ -s) * 2) + s;
 }
@@ -285,11 +284,11 @@ static const __device__ __constant__ uint8_t kByteLengthToRLEv2_W[9] = {
 /**
  * @brief Encode a varint value, return the number of bytes written
  */
-static inline __device__ uint32_t StoreVarint(uint8_t* dst, uint64_t v)
+static inline __device__ uint32_t StoreVarint(uint8_t* dst, __uint128_t v)
 {
   uint32_t bytecnt = 0;
   for (;;) {
-    uint32_t c = (uint32_t)(v & 0x7f);
+    auto c = static_cast<uint32_t>(v & 0x7f);
     v >>= 7u;
     if (v == 0) {
       dst[bytecnt++] = c;
@@ -950,7 +949,7 @@ __global__ void __launch_bounds__(block_size)
           case DECIMAL: {
             if (is_value_valid) {
               auto const id = column.type().id();
-              uint64_t const zz_val =
+              __uint128_t const zz_val =
                 id == type_id::DECIMAL32   ? zigzag(column.element<int32_t>(row))
                 : id == type_id::DECIMAL64 ? zigzag(column.element<int64_t>(row))
                                            : zigzag(column.element<__int128_t>(row));
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 866be896012..67c67f4e432 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1636,13 +1636,13 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
                          if (col.is_null(idx) or not bit_value_or(pushdown_mask, idx, true))
                            return 0u;
 
-                         int64_t const element =
+                         __int128_t const element =
                            col.type().id() == type_id::DECIMAL32   ? col.element<int32_t>(idx)
                            : col.type().id() == type_id::DECIMAL64 ? col.element<int64_t>(idx)
                                                                    : col.element<__int128_t>(idx);
 
-                         int64_t const sign      = (element < 0) ? 1 : 0;
-                         uint64_t zigzaged_value = ((element ^ -sign) * 2) + sign;
+                         __int128_t const sign      = (element < 0) ? 1 : 0;
+                         __uint128_t zigzaged_value = ((element ^ -sign) * 2) + sign;
 
                          uint32_t encoded_length = 1;
                          while (zigzaged_value > 127) {
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index d4a49cb6f6d..de9af2282c0 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -342,9 +342,7 @@ struct leaf_schema_fn {
       col_schema.type        = Type::INT64;
       col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
     } else if (std::is_same_v<T, numeric::decimal128>) {
-      // TODO
-      // col_schema.type        = Type::INT64;
-      // col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
+      CUDF_FAIL("decimal128 currently not supported for parquet writer");
     } else {
       CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }

From d87c9d4acc73842627989162e0757a816f3cbc4f Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 22 Oct 2021 17:08:51 -0400
Subject: [PATCH 070/112] Fix and partial test updates

---
 cpp/src/io/orc/stripe_data.cu |  2 ++
 cpp/tests/io/orc_test.cpp     | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 5993d12fa6f..300bf889f90 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1721,6 +1721,8 @@ __global__ void __launch_bounds__(block_size)
             case INT: static_cast<uint32_t*>(data_out)[row] = s->vals.u32[t + vals_skipped]; break;
             case DOUBLE:
             case LONG:
+              static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
+              break;
             case DECIMAL:
               static_cast<__uint128_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
               break;
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index f2d5952d0ed..c87c15db664 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -341,9 +341,9 @@ TEST_F(OrcWriterTest, MultiColumn)
   auto col3_data = random_values<int32_t>(num_rows);
   auto col4_data = random_values<float>(num_rows);
   auto col5_data = random_values<double>(num_rows);
-  auto col6_vals = random_values<int32_t>(num_rows);
+  auto col6_vals = random_values<int64_t>(num_rows);
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{col6_vals[i], numeric::scale_type{2}};
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{2}};
   });
   auto validity  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
 
@@ -353,7 +353,7 @@ TEST_F(OrcWriterTest, MultiColumn)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), validity};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), validity};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), validity};
-  column_wrapper<numeric::decimal64> col6{col6_data, col6_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> col6{col6_data, col6_data + num_rows, validity};
 
   cudf::test::lists_column_wrapper<int64_t> col7{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}};
@@ -401,7 +401,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int32_t>(num_rows);
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{col6_vals[i], numeric::scale_type{2}};
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{2}};
   });
   auto col0_mask =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); });
@@ -423,7 +423,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), col3_mask};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), col4_mask};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), col5_mask};
-  column_wrapper<numeric::decimal64> col6{col6_data, col6_data + num_rows, col6_mask};
+  column_wrapper<numeric::decimal128> col6{col6_data, col6_data + num_rows, col6_mask};
   cudf::test::lists_column_wrapper<int32_t> col7{
     {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}},
     col0_mask};

From 3b9a61175d406c88311e836aa8109e2d15050838 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 25 Oct 2021 14:20:32 -0400
Subject: [PATCH 071/112] Clean up

---
 cpp/include/cudf/fixed_point/fixed_point.hpp | 3 ++-
 cpp/src/io/orc/stripe_data.cu                | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 8178aecd83d..b356d857f32 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -49,7 +49,8 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 };
 template <typename T>
 constexpr inline auto is_supported_representation_type()
 {
-  return cuda::std::is_same_v<T, int32_t> || cuda::std::is_same_v<T, int64_t> ||
+  return cuda::std::is_same_v<T, int32_t> ||  //
+         cuda::std::is_same_v<T, int64_t> ||  //
          cuda::std::is_same_v<T, __int128_t>;
 }
 
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 300bf889f90..a3cb1581266 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1720,9 +1720,7 @@ __global__ void __launch_bounds__(block_size)
             case FLOAT:
             case INT: static_cast<uint32_t*>(data_out)[row] = s->vals.u32[t + vals_skipped]; break;
             case DOUBLE:
-            case LONG:
-              static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
-              break;
+            case LONG: static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped]; break;
             case DECIMAL:
               static_cast<__uint128_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
               break;

From 5bab167b132177b5ebeff383bd318794c089edd3 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 25 Oct 2021 15:48:37 -0400
Subject: [PATCH 072/112] Update libcudacxx

---
 cpp/cmake/thirdparty/get_libcudacxx.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake
index 772e14c66da..aab75f63bf6 100644
--- a/cpp/cmake/thirdparty/get_libcudacxx.cmake
+++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake
@@ -16,8 +16,8 @@
 
 function(find_and_configure_libcudacxx VERSION)
     rapids_cpm_find(libcudacxx ${VERSION}
-        GIT_REPOSITORY      https://gitlab-master.nvidia.com/nvhpc/libcudacxx.git
-        GIT_TAG             staging/1.6.0
+        GIT_REPOSITORY      https://github.com/NVIDIA/libcudacxx.git
+        GIT_TAG             branch/1.6.0
         GIT_SHALLOW         TRUE
         DOWNLOAD_ONLY       TRUE
     )

From a4c03e57e4fde13cbd148cbd0a93c380641e1c7d Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 25 Oct 2021 21:23:28 -0400
Subject: [PATCH 073/112] Fixing OrcWriterTestDecimal.Decimal64 test

---
 cpp/tests/io/orc_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index c87c15db664..f0e83f3e634 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1138,10 +1138,10 @@ TEST_P(OrcWriterTestDecimal, Decimal64)
   // Using int16_t because scale causes values to overflow if they already require 32 bits
   auto const vals = random_values<int32_t>(num_rows);
   auto data       = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{vals[i], numeric::scale_type{scale}};
+    return numeric::decimal128{vals[i], numeric::scale_type{scale}};
   });
   auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; });
-  column_wrapper<numeric::decimal64> col{data, data + num_rows, mask};
+  column_wrapper<numeric::decimal128> col{data, data + num_rows, mask};
   cudf::table_view tbl({static_cast<cudf::column_view>(col)});
 
   auto filepath = temp_env->get_temp_filepath("Decimal64.orc");

From 976fb743233b937ec05cfa85b0f58e893e100891 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 25 Oct 2021 21:27:31 -0400
Subject: [PATCH 074/112] Fix rest of ORC_TEST

---
 cpp/tests/io/orc_test.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index f0e83f3e634..e3ca8824880 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1185,13 +1185,13 @@ TEST_F(OrcWriterTest, Decimal32)
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath});
   auto result = cudf_io::read_orc(in_opts);
 
-  // Need a 64bit decimal column for comparison since the reader always creates DECIMAL64 columns
-  auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
-    return numeric::decimal64{vals[i], numeric::scale_type{2}};
+  // Need a 128bit decimal column for comparison since the reader always creates DECIMAL128 columns
+  auto data128 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
+    return numeric::decimal128{vals[i], numeric::scale_type{2}};
   });
-  column_wrapper<numeric::decimal64> col64{data64, data64 + num_rows, mask};
+  column_wrapper<numeric::decimal128> col128{data128, data128 + num_rows, mask};
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col128, result.tbl->view().column(0));
 }
 
 TEST_F(OrcStatisticsTest, Overflow)

From c9c7250f5906a439c16b94e4e22ac266104ea3ef Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 26 Oct 2021 14:34:32 -0400
Subject: [PATCH 075/112] ORC changes for decimal128

---
 cpp/src/io/orc/stripe_data.cu | 60 +++++++++++------------------------
 1 file changed, 18 insertions(+), 42 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index a3cb1581266..5b68a425cb4 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -133,6 +133,8 @@ struct orcdec_state_s {
     uint64_t u64[block_size];
     int64_t i64[block_size];
     double f64[block_size];
+    __int128_t i128[block_size];   // TMP
+    __uint128_t u128[block_size];  // TMP
   } vals;
 };
 
@@ -451,29 +453,18 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p
 /**
  * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals)
  */
-inline __device__ int128_s decode_varint128(volatile orc_bytestream_s* bs, int pos)
+inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
 {
-  uint32_t b        = bytestream_readbyte(bs, pos++);
-  int64_t sign_mask = -(int32_t)(b & 1);
-  uint64_t v        = (b >> 1) & 0x3f;
-  uint32_t bitpos   = 6;
-  uint64_t lo       = v;
-  uint64_t hi       = 0;
+  uint32_t b           = bytestream_readbyte(bs, pos++);
+  __int128_t sign_mask = -(int32_t)(b & 1);
+  __int128_t v         = (b >> 1) & 0x3f;
+  uint32_t bitpos      = 6;
   while (b > 0x7f && bitpos < 128) {
     b = bytestream_readbyte(bs, pos++);
     v |= ((uint64_t)(b & 0x7f)) << (bitpos & 0x3f);
-    if (bitpos == 62) {  // 6 + 7 * 8 = 62
-      lo = v;
-      v  = (b & 0x7f) >> 2;  // 64 - 62
-    }
     bitpos += 7;
   }
-  if (bitpos >= 64) {
-    hi = v;
-  } else {
-    lo = v;
-  }
-  return {(uint64_t)(lo ^ sign_mask), (int64_t)(hi ^ sign_mask)};
+  return v ^ sign_mask;
 }
 
 /**
@@ -1046,8 +1037,8 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
         uint32_t pos = lastpos;
         pos += varint_length<uint4>(bs, pos);
         if (pos > maxpos) break;
-        vals.i64[n] = lastpos;
-        lastpos     = pos;
+        vals.i64[2 * n] = lastpos;
+        lastpos         = pos;
       }
       scratch->num_vals = n;
       bytestream_flush_bytes(bs, lastpos - bs->pos);
@@ -1055,11 +1046,11 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
     __syncthreads();
     uint32_t num_vals_to_read = scratch->num_vals;
     if (t >= num_vals_read and t < num_vals_to_read) {
-      auto const pos = static_cast<int>(vals.i64[t]);
-      int128_s v     = decode_varint128(bs, pos);
+      auto const pos = static_cast<int>(vals.i64[2 * t]);
+      __int128_t v   = decode_varint128(bs, pos);
 
       if (col_scale & orc_decimal2float64_scale) {
-        double f      = Int128ToDouble_rn(v.lo, v.hi);
+        double f      = v;
         int32_t scale = (t < numvals) ? val_scale : 0;
         if (scale >= 0)
           vals.f64[t] = f / kPow10[min(scale, 39)];
@@ -1071,27 +1062,12 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
         // of them will be used to add 0s or remove digits.
         int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
         if (scale >= 0) {
-          scale       = min(scale, 27);
-          vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
+          scale        = min(scale, 27);
+          vals.i128[t] = v * kPow10[scale];
         } else  // if (scale < 0)
         {
-          bool is_negative = (v.hi < 0);
-          uint64_t hi = v.hi, lo = v.lo;
-          scale = min(-scale, 27);
-          if (is_negative) {
-            hi = (~hi) + (lo == 0);
-            lo = (~lo) + 1;
-          }
-          lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
-          hi >>= (int32_t)scale;
-          if (hi != 0) {
-            // Use intermediate float
-            lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
-            hi = 0;
-          } else {
-            lo /= kPow5i[scale];
-          }
-          vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
+          scale        = min(-scale, 27);  // should be irrelevant
+          vals.i128[t] = v / kPow10[scale];
         }
       }
     }
@@ -1722,7 +1698,7 @@ __global__ void __launch_bounds__(block_size)
             case DOUBLE:
             case LONG: static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped]; break;
             case DECIMAL:
-              static_cast<__uint128_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
+              static_cast<__uint128_t*>(data_out)[row] = s->vals.u128[t + vals_skipped];
               break;
             case MAP:
             case LIST: {

From 46bd2d86222206634487a7aa372ec18503c64e0b Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 26 Oct 2021 15:00:46 -0400
Subject: [PATCH 076/112] ORC fixes for decimal128

---
 cpp/src/io/orc/stripe_data.cu |  4 ++--
 cpp/tests/io/orc_test.cpp     | 17 +++++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 5b68a425cb4..3781e31cb9f 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1063,11 +1063,11 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
         int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
         if (scale >= 0) {
           scale        = min(scale, 27);
-          vals.i128[t] = v * kPow10[scale];
+          vals.i128[t] = (v * kPow5i[scale]) << scale;
         } else  // if (scale < 0)
         {
           scale        = min(-scale, 27);  // should be irrelevant
-          vals.i128[t] = v / kPow10[scale];
+          vals.i128[t] = (v / kPow5i[scale]) >> scale;
         }
       }
     }
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index e3ca8824880..8836e57a932 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -345,6 +345,9 @@ TEST_F(OrcWriterTest, MultiColumn)
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
     return numeric::decimal128{col6_vals[i], numeric::scale_type{2}};
   });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{-2}};
+  });
   auto validity  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
 
   column_wrapper<bool> col0{col0_data.begin(), col0_data.end(), validity};
@@ -354,15 +357,16 @@ TEST_F(OrcWriterTest, MultiColumn)
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), validity};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), validity};
   column_wrapper<numeric::decimal128> col6{col6_data, col6_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> col7{col7_data, col7_data + num_rows, validity};
 
-  cudf::test::lists_column_wrapper<int64_t> col7{
+  cudf::test::lists_column_wrapper<int64_t> col8{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}};
 
   auto child_col =
     cudf::test::fixed_width_column_wrapper<int32_t>{48, 27, 25, 31, 351, 351, 29, 15, -1, -99};
-  auto col8 = cudf::test::structs_column_wrapper{child_col};
+  auto col9 = cudf::test::structs_column_wrapper{child_col};
 
-  table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8});
+  table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8, col9});
 
   cudf_io::table_input_metadata expected_metadata(expected);
   expected_metadata.column_metadata[0].set_name("bools");
@@ -371,9 +375,10 @@ TEST_F(OrcWriterTest, MultiColumn)
   expected_metadata.column_metadata[3].set_name("int32s");
   expected_metadata.column_metadata[4].set_name("floats");
   expected_metadata.column_metadata[5].set_name("doubles");
-  expected_metadata.column_metadata[6].set_name("decimal");
-  expected_metadata.column_metadata[7].set_name("lists");
-  expected_metadata.column_metadata[8].set_name("structs");
+  expected_metadata.column_metadata[6].set_name("decimal_pos_scale");
+  expected_metadata.column_metadata[7].set_name("decimal_neg_scale");
+  expected_metadata.column_metadata[8].set_name("lists");
+  expected_metadata.column_metadata[9].set_name("structs");
 
   auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc");
   cudf_io::orc_writer_options out_opts =

From 8a86d76a3d714e82e7130762640cb4b098ce1a44 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 27 Oct 2021 11:22:06 -0400
Subject: [PATCH 077/112] Binary op changes / GROUPBY_TEST working

---
 cpp/src/binaryop/binaryop.cpp                 |  16 +-
 cpp/tests/binaryop/binop-compiled-test.cpp    | 156 ++++++++++--------
 cpp/tests/binaryop/binop-integration-test.cpp |   3 -
 3 files changed, 95 insertions(+), 80 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index c09962219a9..9b940472080 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -273,19 +273,13 @@ void fixed_point_binary_operation_validation(binary_operator op,
                                              Rhs rhs,
                                              thrust::optional<cudf::data_type> output_type = {})
 {
-  CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type.");
+  CUDF_EXPECTS((is_fixed_point(lhs) or is_fixed_point(rhs)),
+               "One of the inputs must have fixed_point data_type.");
   CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op),
                "Unsupported fixed_point binary operation");
-  CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch");
-  if (output_type.has_value()) {
-    if (binops::is_comparison_binop(op))
-      CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
-                   "Comparison operations require boolean output type.");
-    else
-      CUDF_EXPECTS(is_fixed_point(output_type.value()),
-                   "fixed_point binary operations require fixed_point output type.");
-  }
+  if (output_type.has_value() and binops::is_comparison_binop(op))
+    CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
+                 "Comparison operations require boolean output type.");
 }
 
 /**
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 206b0252abf..52708829502 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -107,22 +107,27 @@ struct BinaryOperationCompiledTest : public BinaryOperationTest {
 // t      	     	t + d
 // d      	d + t	d + d
 
-using Add_types = cudf::test::Types<
-  cudf::test::Types<bool, bool, float>,
-  cudf::test::Types<int16_t, double, uint8_t>,
-  cudf::test::Types<timestamp_s, timestamp_s, duration_s>,
-  cudf::test::Types<timestamp_ns, duration_ms, timestamp_us>,
-  cudf::test::Types<duration_us, duration_us, duration_D>,
-  // cudf::test::Types<duration_s, int16_t, int64_t>, //valid
-  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
-  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>,
-  // Extras
-  cudf::test::Types<duration_D, duration_D, duration_D>,
-  cudf::test::Types<timestamp_D, timestamp_D, duration_D>,
-  cudf::test::Types<timestamp_s, timestamp_D, duration_s>,
-  cudf::test::Types<timestamp_ms, timestamp_ms, duration_s>,
-  cudf::test::Types<timestamp_ns, timestamp_ms, duration_ns>>;
+using namespace numeric;
+
+using Add_types = cudf::test::Types<cudf::test::Types<bool, bool, float>,
+                                    cudf::test::Types<int16_t, double, uint8_t>,
+                                    cudf::test::Types<timestamp_s, timestamp_s, duration_s>,
+                                    cudf::test::Types<timestamp_ns, duration_ms, timestamp_us>,
+                                    cudf::test::Types<duration_us, duration_us, duration_D>,
+                                    // cudf::test::Types<duration_s, int16_t, int64_t>, //valid
+                                    cudf::test::Types<decimal32, decimal32, decimal32>,
+                                    cudf::test::Types<decimal64, decimal64, decimal64>,
+                                    cudf::test::Types<decimal128, decimal128, decimal128>,
+                                    cudf::test::Types<int, decimal32, decimal32>,
+                                    cudf::test::Types<int, decimal64, decimal64>,
+                                    cudf::test::Types<int, decimal128, decimal128>,
+                                    // Extras
+                                    cudf::test::Types<duration_D, duration_D, duration_D>,
+                                    cudf::test::Types<timestamp_D, timestamp_D, duration_D>,
+                                    cudf::test::Types<timestamp_s, timestamp_D, duration_s>,
+                                    cudf::test::Types<timestamp_ms, timestamp_ms, duration_s>,
+                                    cudf::test::Types<timestamp_ns, timestamp_ms, duration_ns>>;
+
 template <typename T>
 struct BinaryOperationCompiledTest_Add : public BinaryOperationCompiledTest<T> {
 };
@@ -139,15 +144,19 @@ TYPED_TEST(BinaryOperationCompiledTest_Add, Vector_Vector)
 // t      	t - t	t - d
 // d      	     	d - d
 
-using Sub_types = cudf::test::Types<
-  cudf::test::Types<int32_t, bool, float>,                  // n - n
-  cudf::test::Types<duration_D, timestamp_D, timestamp_D>,  // t - t
-  cudf::test::Types<timestamp_s, timestamp_D, duration_s>,  // t - d
-  cudf::test::Types<duration_ns, duration_us, duration_s>,  // d - d
-  cudf::test::Types<duration_us, duration_us, duration_s>,  // d - d
-  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
-  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
+using Sub_types =
+  cudf::test::Types<cudf::test::Types<int32_t, bool, float>,                  // n - n
+                    cudf::test::Types<duration_D, timestamp_D, timestamp_D>,  // t - t
+                    cudf::test::Types<timestamp_s, timestamp_D, duration_s>,  // t - d
+                    cudf::test::Types<duration_ns, duration_us, duration_s>,  // d - d
+                    cudf::test::Types<duration_us, duration_us, duration_s>,  // d - d
+                    cudf::test::Types<decimal32, decimal32, decimal32>,
+                    cudf::test::Types<decimal64, decimal64, decimal64>,
+                    cudf::test::Types<decimal128, decimal128, decimal128>,
+                    cudf::test::Types<int, decimal32, decimal32>,
+                    cudf::test::Types<int, decimal64, decimal64>,
+                    cudf::test::Types<int, decimal128, decimal128>>;
+
 template <typename T>
 struct BinaryOperationCompiledTest_Sub : public BinaryOperationCompiledTest<T> {
 };
@@ -163,14 +172,20 @@ TYPED_TEST(BinaryOperationCompiledTest_Sub, Vector_Vector)
 // n n * n	     	n * d
 // t
 // d d * n
-using Mul_types = cudf::test::Types<
-  cudf::test::Types<int32_t, u_int64_t, float>,
-  cudf::test::Types<duration_s, u_int64_t, duration_s>,
-  cudf::test::Types<duration_ms, duration_D, int16_t>,
-  cudf::test::Types<duration_ns, duration_us, uint8_t>,
-  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
-  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
+using Mul_types = cudf::test::Types<cudf::test::Types<int32_t, u_int64_t, float>,
+                                    cudf::test::Types<duration_s, u_int64_t, duration_s>,
+                                    cudf::test::Types<duration_ms, duration_D, int16_t>,
+                                    cudf::test::Types<duration_ns, duration_us, uint8_t>,
+                                    cudf::test::Types<decimal32, decimal32, decimal32>,
+                                    cudf::test::Types<decimal64, decimal64, decimal64>,
+                                    cudf::test::Types<decimal128, decimal128, decimal128>,
+                                    cudf::test::Types<int, decimal32, decimal32>,
+                                    cudf::test::Types<int, decimal64, decimal64>,
+                                    cudf::test::Types<int, decimal128, decimal128>,
+                                    cudf::test::Types<decimal32, int, int>,
+                                    cudf::test::Types<decimal64, int, int>,
+                                    cudf::test::Types<decimal128, int, int>>;
+
 template <typename T>
 struct BinaryOperationCompiledTest_Mul : public BinaryOperationCompiledTest<T> {
 };
@@ -186,17 +201,20 @@ TYPED_TEST(BinaryOperationCompiledTest_Mul, Vector_Vector)
 // n n / n
 // t
 // d d / n	     	d / d
-using Div_types = cudf::test::Types<
-  cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
-  cudf::test::Types<double, int8_t, int64_t>,
-  cudf::test::Types<duration_ms, duration_s, u_int32_t>,
-  cudf::test::Types<duration_ns, duration_D, int16_t>,
-  cudf::test::Types<double, duration_D, duration_ns>,
-  cudf::test::Types<float, duration_ms, duration_ns>,
-  cudf::test::Types<u_int64_t, duration_us, duration_ns>,
-  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
-  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
+using Div_types = cudf::test::Types<cudf::test::Types<int16_t, u_int64_t, u_int64_t>,
+                                    cudf::test::Types<double, int8_t, int64_t>,
+                                    cudf::test::Types<duration_ms, duration_s, u_int32_t>,
+                                    cudf::test::Types<duration_ns, duration_D, int16_t>,
+                                    cudf::test::Types<double, duration_D, duration_ns>,
+                                    cudf::test::Types<float, duration_ms, duration_ns>,
+                                    cudf::test::Types<u_int64_t, duration_us, duration_ns>,
+                                    cudf::test::Types<decimal32, decimal32, decimal32>,
+                                    cudf::test::Types<decimal64, decimal64, decimal64>,
+                                    cudf::test::Types<decimal128, decimal128, decimal128>,
+                                    cudf::test::Types<int, decimal32, decimal32>,
+                                    cudf::test::Types<int, decimal64, decimal64>,
+                                    cudf::test::Types<int, decimal128, decimal128>>;
+
 template <typename T>
 struct BinaryOperationCompiledTest_Div : public BinaryOperationCompiledTest<T> {
 };
@@ -216,6 +234,7 @@ using TrueDiv_types = cudf::test::Types<cudf::test::Types<int16_t, u_int64_t, u_
                                         cudf::test::Types<double, int8_t, int64_t>,
                                         cudf::test::Types<int8_t, bool, u_int32_t>,
                                         cudf::test::Types<u_int64_t, float, int16_t>>;
+
 template <typename T>
 struct BinaryOperationCompiledTest_TrueDiv : public BinaryOperationCompiledTest<T> {
 };
@@ -458,18 +477,17 @@ TYPED_TEST(BinaryOperationCompiledTest_Logical, LogicalOr_Vector_Vector)
 
 // Comparison Operations ==, !=, <, >, <=, >=
 // n<!=>n, t<!=>t, d<!=>d, s<!=>s, dc<!=>dc
-using Comparison_types =
-  cudf::test::Types<cudf::test::Types<bool, int8_t, int16_t>,
-                    cudf::test::Types<bool, uint32_t, uint16_t>,
-                    cudf::test::Types<bool, uint64_t, double>,
-                    cudf::test::Types<bool, timestamp_D, timestamp_s>,
-                    cudf::test::Types<bool, timestamp_ns, timestamp_us>,
-                    cudf::test::Types<bool, duration_ns, duration_ns>,
-                    cudf::test::Types<bool, duration_us, duration_s>,
-                    cudf::test::Types<bool, std::string, std::string>,
-                    cudf::test::Types<bool, numeric::decimal32, numeric::decimal32>,
-                    cudf::test::Types<bool, numeric::decimal64, numeric::decimal64>,
-                    cudf::test::Types<bool, numeric::decimal128, numeric::decimal128>>;
+using Comparison_types = cudf::test::Types<cudf::test::Types<bool, int8_t, int16_t>,
+                                           cudf::test::Types<bool, uint32_t, uint16_t>,
+                                           cudf::test::Types<bool, uint64_t, double>,
+                                           cudf::test::Types<bool, timestamp_D, timestamp_s>,
+                                           cudf::test::Types<bool, timestamp_ns, timestamp_us>,
+                                           cudf::test::Types<bool, duration_ns, duration_ns>,
+                                           cudf::test::Types<bool, duration_us, duration_s>,
+                                           cudf::test::Types<bool, std::string, std::string>,
+                                           cudf::test::Types<bool, decimal32, decimal32>,
+                                           cudf::test::Types<bool, decimal64, decimal64>,
+                                           cudf::test::Types<bool, decimal128, decimal128>>;
 
 template <typename T>
 struct BinaryOperationCompiledTest_Comparison : public BinaryOperationCompiledTest<T> {
@@ -514,16 +532,22 @@ TYPED_TEST(BinaryOperationCompiledTest_Comparison, GreaterEqual_Vector_Vector)
 // d          .
 // s             .
 // dc .             .
-using Null_types = cudf::test::Types<
-  cudf::test::Types<int16_t, int8_t, int16_t>,
-  cudf::test::Types<uint16_t, uint32_t, uint16_t>,
-  cudf::test::Types<double, uint64_t, double>,
-  cudf::test::Types<timestamp_s, timestamp_D, timestamp_s>,
-  cudf::test::Types<duration_ns, duration_us, duration_s>,
-  // cudf::test::Types<std::string, std::string, std::string>, // only fixed-width
-  cudf::test::Types<numeric::decimal32, numeric::decimal32, numeric::decimal32>,
-  cudf::test::Types<numeric::decimal64, numeric::decimal64, numeric::decimal64>,
-  cudf::test::Types<numeric::decimal128, numeric::decimal128, numeric::decimal128>>;
+using Null_types =
+  cudf::test::Types<cudf::test::Types<int16_t, int8_t, int16_t>,
+                    cudf::test::Types<uint16_t, uint32_t, uint16_t>,
+                    cudf::test::Types<double, uint64_t, double>,
+                    cudf::test::Types<timestamp_s, timestamp_D, timestamp_s>,
+                    cudf::test::Types<duration_ns, duration_us, duration_s>,
+                    // cudf::test::Types<std::string, std::string, std::string>, // only fixed-width
+                    cudf::test::Types<decimal32, decimal32, decimal32>,
+                    cudf::test::Types<decimal64, decimal64, decimal64>,
+                    cudf::test::Types<decimal128, decimal128, decimal128>,
+                    cudf::test::Types<decimal32, uint32_t, decimal32>,
+                    cudf::test::Types<decimal64, uint32_t, decimal64>,
+                    cudf::test::Types<decimal128, uint32_t, decimal128>,
+                    cudf::test::Types<int64_t, decimal32, decimal32>,
+                    cudf::test::Types<int64_t, decimal64, decimal64>,
+                    cudf::test::Types<int64_t, decimal128, decimal128>>;
 
 template <typename T>
 struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest<T> {
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index fa3d9d13f0d..f47a618fbe0 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -2665,11 +2665,8 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
 
   auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
-  auto const float_type    = data_type{type_id::FLOAT32};
   EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
                cudf::logic_error);
-  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
-               cudf::logic_error);
 }
 
 template <typename T>

From e54d3fa045412ca7f430f7cc68d867932a9ee330 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Thu, 28 Oct 2021 16:17:15 -0400
Subject: [PATCH 078/112] Test for blog

---
 cpp/tests/round/round_tests.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index 4d1f66443c2..5a2e1353fb0 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -284,6 +284,20 @@ TYPED_TEST(RoundTestsFixedPointTypes, SimpleFixedPointTestHalfNegEven3)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(RoundTestsFixedPointTypes, TestForBlog)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const input    = fp_wrapper{{25649999}, scale_type{-5}};
+  auto const expected = fp_wrapper{{256}, scale_type{0}};
+  auto const result   = cudf::round(input);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(RoundTestsFloatingPointTypes, SimpleFloatingPointTestHalfUp0)
 {
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam>;

From 92694b8ec63686f5f2b6d227defe0046b131a0a7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 1 Nov 2021 15:53:39 -0400
Subject: [PATCH 079/112] Merge conflict fix

---
 cpp/src/io/orc/reader_impl.cu | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 4c57aa71a13..a3c108421ef 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -944,15 +944,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       auto col_type = to_type_id(
         _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
       CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-<<<<<<< HEAD
       if (col_type == type_id::DECIMAL128) {
-=======
-      // Remove this once we support Decimal128 data type
-      CUDF_EXPECTS(
-        (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18),
-        "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
-      if (col_type == type_id::DECIMAL64) {
->>>>>>> branch-21.12
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.

From 44d05733dafdc534a6d3d40dd48993ade4b1c48b Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 2 Nov 2021 13:24:12 -0400
Subject: [PATCH 080/112] Temporary fix for CUDACC version macro in libcudacxx.patch

---
 cpp/cmake/libcudacxx.patch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/libcudacxx.patch b/cpp/cmake/libcudacxx.patch
index ef11688311b..3cdc40ef084 100644
--- a/cpp/cmake/libcudacxx.patch
+++ b/cpp/cmake/libcudacxx.patch
@@ -7,7 +7,7 @@ index d55a43688..654142d7e 100644
      #define _LIBCUDACXX_CUDACC_VER_BUILD __CUDACC_VER_BUILD__
      #define _LIBCUDACXX_CUDACC_VER                                                  \
 -        _LIBCUDACXX_CUDACC_VER_MAJOR * 10000 + _LIBCUDACXX_CUDACC_VER_MINOR * 100 + \
-+        _LIBCUDACXX_CUDACC_VER_MAJOR * 10000 + _LIBCUDACXX_CUDACC_VER_MINOR * 1000 + \
++        _LIBCUDACXX_CUDACC_VER_MAJOR * 100000 + _LIBCUDACXX_CUDACC_VER_MINOR * 1000 + \
          _LIBCUDACXX_CUDACC_VER_BUILD
  
      #define _LIBCUDACXX_HAS_NO_LONG_DOUBLE

From 99a82ee5405f98535c254a5c81e81977c100f626 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 2 Nov 2021 14:10:08 -0400
Subject: [PATCH 081/112] Update CONTRIBUTING.md

---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f83d7c5b759..aae62fbd47c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,12 +62,12 @@ The following instructions are for developers and contributors to cuDF OSS devel
 Compilers:
 
 * `gcc`     version 9.3+
-* `nvcc`    version 11.0+
+* `nvcc`    version 11.5+
 * `cmake`   version 3.20.1+
 
 CUDA/GPU:
 
-* CUDA 11.0+
+* CUDA 11.5+
 * NVIDIA driver 450.80.02+
 * Pascal architecture or better
 

From 99ad08bfe1ec58fa437ff00ff95ea6211f991d22 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 2 Nov 2021 16:24:42 -0400
Subject: [PATCH 082/112] Temporarily use robertmaynard/cuCollections fork

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 6764c78ed87..ecf02c22885 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -22,8 +22,8 @@ function(find_and_configure_cucollections)
     rapids_cpm_find(cuco 0.0
         GLOBAL_TARGETS cuco::cuco
         CPM_ARGS
-            GITHUB_REPOSITORY NVIDIA/cuCollections
-            GIT_TAG           62b90b7f7adf272455007b1c857e1d621aaf13ca
+            GITHUB_REPOSITORY robertmaynard/cuCollections
+            GIT_TAG           bf6a90db78516e099d07e845a39012dbcaa8de18
             OPTIONS           "BUILD_TESTS OFF"
                               "BUILD_BENCHMARKS OFF"
                               "BUILD_EXAMPLES OFF"

From 95a24020c1200c6a48f95620debfc3ff6702abc1 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 2 Nov 2021 21:35:42 -0400
Subject: [PATCH 083/112] Sum Aggregation uses same type for accumulator

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 13 ++-----------
 cpp/tests/groupby/sum_scan_tests.cpp                |  6 ++----
 cpp/tests/groupby/sum_tests.cpp                     | 12 ++++--------
 3 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index a10ffcffcfe..c2bd7a4893c 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1118,17 +1118,8 @@ template <typename Source, aggregation::Kind k>
 struct target_type_impl<
   Source,
   k,
-  std::enable_if_t<cudf::is_fixed_point<Source>() &&
-                   not std::is_same_v<Source, numeric::decimal128> && (k == aggregation::SUM)>> {
-  using type = numeric::decimal64;
-};
-
-template <typename Source, aggregation::Kind k>
-struct target_type_impl<
-  Source,
-  k,
-  std::enable_if_t<std::is_same_v<Source, numeric::decimal128> && (k == aggregation::SUM)>> {
-  using type = numeric::decimal128;
+  std::enable_if_t<cudf::is_fixed_point<Source>() && (k == aggregation::SUM)>> {
+  using type = Source;
 };
 
 // Summing/Multiplying float/doubles, use same type accumulator
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index 6b813f8b6db..eab73c01dd9 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -144,8 +144,6 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
   using decimalXX  = TypeParam;
   using RepType    = cudf::device_storage_type_t<decimalXX>;
   using fp_wrapper = fixed_point_column_wrapper<RepType>;
-  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
-  using out_fp_wrapper = fixed_point_column_wrapper<SumType>;
 
   for (auto const i : {2, 1, 0, -1, -2}) {
     auto const scale = scale_type{i};
@@ -153,8 +151,8 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue)
     auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
     auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
 
-    auto const expect_keys     = key_wrapper    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
-    auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};
+    auto const expect_keys     = key_wrapper{1, 1, 1, 2, 2,  2,  2, 3, 3,  3};
+    auto const expect_vals_sum = fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};
     // clang-format on
 
     auto agg2 = cudf::make_sum_aggregation<groupby_scan_aggregation>();
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index ed42386b694..b12372c1e08 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -168,9 +168,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
   using decimalXX  = TypeParam;
   using RepType    = cudf::device_storage_type_t<decimalXX>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
-  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<SumType>;
-  using K            = int32_t;
+  using K          = int32_t;
 
   for (auto const i : {2, 1, 0, -1, -2}) {
     auto const scale = scale_type{i};
@@ -180,7 +178,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue)
     // clang-format on
 
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
-    auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale};
+    auto const expect_vals_sum = fp_wrapper{{9, 19, 17}, scale};
 
     auto agg1 = cudf::make_sum_aggregation<groupby_aggregation>();
     test_single_agg(
@@ -199,9 +197,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue)
   using decimalXX  = TypeParam;
   using RepType    = cudf::device_storage_type_t<decimalXX>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
-  using SumType    = std::conditional_t<std::is_same_v<decimal128, TypeParam>, __int128_t, int64_t>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<SumType>;
-  using K            = int32_t;
+  using K          = int32_t;
 
   for (auto const i : {2, 1, 0, -1, -2}) {
     auto const scale = scale_type{i};
@@ -211,7 +207,7 @@ TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue)
     // clang-format on
 
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
-    auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale};
+    auto const expect_vals_sum = fp_wrapper{{9, 19, 17}, scale};
 
     auto agg5 = cudf::make_sum_aggregation<groupby_aggregation>();
     test_single_agg(keys, vals, expect_keys, expect_vals_sum, std::move(agg5));

From 5ecd793519815cece2706a06f83f6a1c69596d4b Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 3 Nov 2021 10:14:21 -0400
Subject: [PATCH 084/112] ORC changes

---
 cpp/src/io/orc/stripe_data.cu | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 3781e31cb9f..5cd569f0d0a 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -45,11 +45,6 @@ inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding
 static __device__ __constant__ int64_t kORCTimeToUTC =
   1420070400;  // Seconds from January 1st, 1970 to January 1st, 2015
 
-struct int128_s {
-  uint64_t lo;
-  int64_t hi;
-};
-
 struct orc_bytestream_s {
   const uint8_t* base;
   uint32_t pos;
@@ -1022,6 +1017,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
                                       volatile orcdec_state_s::values& vals,
                                       int val_scale,
                                       int numvals,
+                                      TypeKind dtype_kind,
                                       int col_scale,
                                       int t)
 {
@@ -1049,7 +1045,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
       auto const pos = static_cast<int>(vals.i64[2 * t]);
       __int128_t v   = decode_varint128(bs, pos);
 
-      if (col_scale & orc_decimal2float64_scale) {
+      if (dtype_kind == DOUBLE) {
         double f      = v;
         int32_t scale = (t < numvals) ? val_scale : 0;
         if (scale >= 0)
@@ -1066,7 +1062,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
           vals.i128[t] = (v * kPow5i[scale]) << scale;
         } else  // if (scale < 0)
         {
-          scale        = min(-scale, 27);  // should be irrelevant
+          scale        = min(-scale, 27);
           vals.i128[t] = (v / kPow5i[scale]) >> scale;
         }
       }
@@ -1629,8 +1625,14 @@ __global__ void __launch_bounds__(block_size)
           }
           val_scale = (t < numvals) ? (int)s->vals.i64[skip + t] : 0;
           __syncthreads();
-          numvals = Decode_Decimals(
-            &s->bs, &s->u.rle8, s->vals, val_scale, numvals, s->chunk.decimal_scale, t);
+          numvals = Decode_Decimals(&s->bs,
+                                    &s->u.rle8,
+                                    s->vals,
+                                    val_scale,
+                                    numvals,
+                                    s->chunk.type_kind,
+                                    s->chunk.decimal_scale,
+                                    t);
         }
         __syncthreads();
       } else if (s->chunk.type_kind == FLOAT) {

From f55e0508f4d6b370d60827bb1f2b91f4cd2d1248 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 3 Nov 2021 15:14:56 -0400
Subject: [PATCH 085/112] Full ORC fix

---
 cpp/src/io/orc/orc_gpu.h      |  6 +-----
 cpp/src/io/orc/reader_impl.cu |  4 ++--
 cpp/src/io/orc/stripe_data.cu | 32 ++++++++++++++++----------------
 cpp/tests/io/orc_test.cpp     |  4 ++--
 4 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
index f6a7c3f5f03..ad4450bc6a7 100644
--- a/cpp/src/io/orc/orc_gpu.h
+++ b/cpp/src/io/orc/orc_gpu.h
@@ -84,11 +84,6 @@ struct DictionaryEntry {
   uint32_t len;  // Length in data stream
 };
 
-/**
- * @brief Mask to indicate conversion from decimals to float64
- */
-constexpr int orc_decimal2float64_scale = 0x80;
-
 /**
  * @brief Struct to describe per stripe's column information
  */
@@ -111,6 +106,7 @@ struct ColumnDesc {
   ColumnEncodingKind encoding_kind;        // column encoding kind
   TypeKind type_kind;                      // column data type
   uint8_t dtype_len;          // data type length (for types that can be mapped to different sizes)
+  type_id dtype_id;           // TODO
   int32_t decimal_scale;      // number of fractional decimal digits for decimal type
   type_id timestamp_type_id;  // output timestamp type id (type_id::EMPTY by default)
   column_validity_info parent_validity_info;  // consists of parent column valid_map and null count
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index a3c108421ef..23cd8a0936b 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1114,10 +1114,10 @@ table_with_metadata reader::impl::read(size_type skip_rows,
             chunk.num_child_rows          = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
             auto const decimal_as_float64 = should_convert_decimal_column_to_float(
               _decimal_cols_as_float, _metadata.per_file_metadata[0], columns_level[col_idx].id);
+            chunk.dtype_id      = column_types[col_idx].id();
             chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
                                     .ff.types[columns_level[col_idx].id]
-                                    .scale.value_or(0) |
-                                  (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
+                                    .scale.value_or(0);
 
             chunk.rowgroup_id   = rowgroup_id;
             chunk.dtype_len     = (column_types[col_idx].id() == type_id::STRING)
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 5cd569f0d0a..652c86364bd 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -448,19 +448,19 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p
 /**
  * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals)
  */
-inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
-{
-  uint32_t b           = bytestream_readbyte(bs, pos++);
-  __int128_t sign_mask = -(int32_t)(b & 1);
-  __int128_t v         = (b >> 1) & 0x3f;
-  uint32_t bitpos      = 6;
-  while (b > 0x7f && bitpos < 128) {
-    b = bytestream_readbyte(bs, pos++);
-    v |= ((uint64_t)(b & 0x7f)) << (bitpos & 0x3f);
-    bitpos += 7;
-  }
-  return v ^ sign_mask;
-}
+ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
+ {
+   auto byte                  = bytestream_readbyte(bs, pos++);
+   __int128_t const sign_mask = -(int32_t)(byte & 1);
+   __int128_t value           = (byte >> 1) & 0x3f;
+   uint32_t bitpos            = 6;
+   while (byte & 0x80 && bitpos < 128) {
+     byte = bytestream_readbyte(bs, pos++);
+     value |= ((__uint128_t)(byte & 0x7f)) << bitpos;
+     bitpos += 7;
+   }
+   return value ^ sign_mask;
+ }
 
 /**
  * @brief Decodes an unsigned 32-bit varint
@@ -1017,7 +1017,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
                                       volatile orcdec_state_s::values& vals,
                                       int val_scale,
                                       int numvals,
-                                      TypeKind dtype_kind,
+                                      type_id dtype_id,
                                       int col_scale,
                                       int t)
 {
@@ -1045,7 +1045,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
       auto const pos = static_cast<int>(vals.i64[2 * t]);
       __int128_t v   = decode_varint128(bs, pos);
 
-      if (dtype_kind == DOUBLE) {
+      if (dtype_id == type_id::FLOAT64) {
         double f      = v;
         int32_t scale = (t < numvals) ? val_scale : 0;
         if (scale >= 0)
@@ -1630,7 +1630,7 @@ __global__ void __launch_bounds__(block_size)
                                     s->vals,
                                     val_scale,
                                     numvals,
-                                    s->chunk.type_kind,
+                                    s->chunk.dtype_id,
                                     s->chunk.decimal_scale,
                                     t);
         }
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 7e02e66d090..b29138f262c 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -343,10 +343,10 @@ TEST_F(OrcWriterTest, MultiColumn)
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int64_t>(num_rows);
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col6_vals[i], numeric::scale_type{2}};
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{12}};
   });
   auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col6_vals[i], numeric::scale_type{-2}};
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}};
   });
   auto validity  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
 

From 216385ae9d91950a78f70e07b8cd739b092db51a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 3 Nov 2021 16:10:54 -0400
Subject: [PATCH 086/112] clang-format

---
 cpp/src/io/orc/stripe_data.cu | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 652c86364bd..f121e1108dc 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -448,19 +448,19 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p
 /**
  * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals)
  */
- inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
- {
-   auto byte                  = bytestream_readbyte(bs, pos++);
-   __int128_t const sign_mask = -(int32_t)(byte & 1);
-   __int128_t value           = (byte >> 1) & 0x3f;
-   uint32_t bitpos            = 6;
-   while (byte & 0x80 && bitpos < 128) {
-     byte = bytestream_readbyte(bs, pos++);
-     value |= ((__uint128_t)(byte & 0x7f)) << bitpos;
-     bitpos += 7;
-   }
-   return value ^ sign_mask;
- }
+inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos)
+{
+  auto byte                  = bytestream_readbyte(bs, pos++);
+  __int128_t const sign_mask = -(int32_t)(byte & 1);
+  __int128_t value           = (byte >> 1) & 0x3f;
+  uint32_t bitpos            = 6;
+  while (byte & 0x80 && bitpos < 128) {
+    byte = bytestream_readbyte(bs, pos++);
+    value |= ((__uint128_t)(byte & 0x7f)) << bitpos;
+    bitpos += 7;
+  }
+  return value ^ sign_mask;
+}
 
 /**
  * @brief Decodes an unsigned 32-bit varint

From 7ba47c70345a2d45b07f4f3f3e6e70270f344c06 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Thu, 4 Nov 2021 13:01:01 -0400
Subject: [PATCH 087/112] Reapply temporary fix

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 911195e2f56..89e45809010 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -15,15 +15,16 @@
 # This function finds cucollections and sets any additional necessary environment variables.
 function(find_and_configure_cucollections)
 
-  # Find or install cuCollections
-  rapids_cpm_find(
-    # cuCollections doesn't have a version yet
-    cuco 0.0
-    GLOBAL_TARGETS cuco::cuco
-    CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
-    GIT_TAG 62b90b7f7adf272455007b1c857e1d621aaf13ca
-    OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
-  )
+    # Find or install cuCollections
+    rapids_cpm_find(cuco 0.0
+        GLOBAL_TARGETS cuco::cuco
+        CPM_ARGS
+            GITHUB_REPOSITORY robertmaynard/cuCollections
+            GIT_TAG           bf6a90db78516e099d07e845a39012dbcaa8de18
+            OPTIONS           "BUILD_TESTS OFF"
+                              "BUILD_BENCHMARKS OFF"
+                              "BUILD_EXAMPLES OFF"
+    )
 endfunction()
 
 find_and_configure_cucollections()

From 1034057301540a4f3355ce0ae16b16c481a6bb77 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Thu, 4 Nov 2021 13:02:07 -0400
Subject: [PATCH 088/112] Perf improvement for rescale

---
 cpp/src/unary/cast_ops.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 8428efabbd2..e852b00796a 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -176,7 +176,7 @@ std::unique_ptr<column> rescale(column_view input,
 {
   using namespace numeric;
 
-  if (input.type().scale() > scale) {
+  if (input.type().scale() >= scale) {
     auto const scalar = make_fixed_point_scalar<T>(0, scale_type{scale});
     auto const type   = cudf::data_type{cudf::type_to_id<T>(), scale};
     return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr);

From d6e9ee810acc06a6da3fed9c9b1ffc538dd23fbf Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 3 Nov 2021 16:27:06 -0700
Subject: [PATCH 089/112] Default to dec64; make dec128 selectable; fix tests;
 add options test

---
 cpp/include/cudf/io/orc.hpp               | 30 ++++++++++
 cpp/src/io/orc/aggregate_orc_metadata.hpp |  4 +-
 cpp/src/io/orc/reader_impl.cu             | 70 ++++++++++++++---------
 cpp/src/io/orc/reader_impl.hpp            |  1 +
 cpp/src/io/orc/stripe_data.cu             | 38 ++++++++----
 cpp/tests/io/orc_test.cpp                 | 57 ++++++++++++++----
 python/cudf/cudf/_lib/cpp/io/orc.pxd      |  4 ++
 7 files changed, 152 insertions(+), 52 deletions(-)

diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 2a95b85465b..33f0232b4d0 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -70,6 +70,9 @@ class orc_reader_options {
   // Columns that should be converted from Decimal to Float64
   std::vector<std::string> _decimal_cols_as_float;
 
+  // Columns that should be read as Decimal128
+  std::vector<std::string> _decimal128_columns;
+
   friend orc_reader_options_builder;
 
   /**
@@ -143,6 +146,11 @@ class orc_reader_options {
     return _decimal_cols_as_float;
   }
 
+  /**
+   * @brief Columns that should be read as 128-bit Decimal
+   */
+  std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
+
   // Setters
 
   /**
@@ -216,6 +224,16 @@ class orc_reader_options {
   {
     _decimal_cols_as_float = std::move(val);
   }
+
+  /**
+   * @brief Set columns that should be read as 128-bit Decimal
+   *
+   * @param val Vector of column names.
+   */
+  void set_decimal128_columns(std::vector<std::string> val)
+  {
+    _decimal128_columns = std::move(val);
+  }
 };
 
 class orc_reader_options_builder {
@@ -332,6 +350,18 @@ class orc_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Columns that should be read as 128-bit Decimal
+   *
+   * @param val Vector of column names.
+   * @return this for chaining.
+   */
+  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
+  {
+    options._decimal128_columns = std::move(val);
+    return *this;
+  }
+
   /**
    * @brief move orc_reader_options member once it's built.
    */
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 356d20843e8..5132906a5fc 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -86,7 +86,7 @@ class aggregate_orc_metadata {
   /**
    * @brief Returns the name of the given column from the given source.
    */
-  auto column_name(const int source_idx, const int column_id) const
+  std::string const& column_name(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
@@ -98,7 +98,7 @@ class aggregate_orc_metadata {
    *
    * Full name includes ancestor columns' names.
    */
-  auto column_path(const int source_idx, const int column_id) const
+  std::string const& column_path(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 23cd8a0936b..d35f2db90fc 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -56,7 +56,7 @@ namespace {
 constexpr type_id to_type_id(const orc::SchemaType& schema,
                              bool use_np_dtypes,
                              type_id timestamp_type_id,
-                             bool decimals_as_float64)
+                             type_id decimal_type_id)
 {
   switch (schema.kind) {
     case orc::BOOLEAN: return type_id::BOOL8;
@@ -78,7 +78,7 @@ constexpr type_id to_type_id(const orc::SchemaType& schema,
     case orc::DATE:
       // There isn't a (DAYS -> np.dtype) mapping
       return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
-    case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL128;
+    case orc::DECIMAL: return decimal_type_id;
     // Need to update once cuDF plans to support map type
     case orc::MAP:
     case orc::LIST: return type_id::LIST;
@@ -227,15 +227,26 @@ size_t gather_stream_info(const size_t stripe_index,
 }
 
 /**
- * @brief Determines if a column should be converted from decimal to float
+ * @brief Determines cuDF type of an ORC Decimal column.
  */
-bool should_convert_decimal_column_to_float(const std::vector<std::string>& columns_to_convert,
-                                            cudf::io::orc::metadata& metadata,
-                                            int column_index)
+auto decimal_column_type(const std::vector<std::string>& float64_columns,
+                         const std::vector<std::string>& decimal128_columns,
+                         cudf::io::orc::metadata& metadata,
+                         int column_index)
 {
-  return (std::find(columns_to_convert.begin(),
-                    columns_to_convert.end(),
-                    metadata.column_name(column_index)) != columns_to_convert.end());
+  auto const& column_name = metadata.column_name(column_index);
+  auto is_column_in       = [&](const std::vector<std::string>& cols) {
+    return std::find(cols.cbegin(), cols.cend(), column_name) != cols.end();
+  };
+
+  auto const user_selected_float64    = is_column_in(float64_columns);
+  auto const user_selected_decimal128 = is_column_in(decimal128_columns);
+  CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128,
+               "Both decimal128 and float64 types selected for column " + column_name);
+
+  if (user_selected_float64) return type_id::FLOAT64;
+  if (user_selected_decimal128) return type_id::DECIMAL128;
+  return type_id::DECIMAL64;
 }
 
 }  // namespace
@@ -729,12 +740,12 @@ std::unique_ptr<column> reader::impl::create_empty_column(const size_type orc_co
                                                           rmm::cuda_stream_view stream)
 {
   schema_info.name = _metadata.column_name(0, orc_col_id);
-  // If the column type is orc::DECIMAL see if the user
-  // desires it to be converted to float64 or not
-  auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-    _decimal_cols_as_float, _metadata.per_file_metadata[0], orc_col_id);
-  auto const type = to_type_id(
-    _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+  auto const type  = to_type_id(
+    _metadata.get_schema(orc_col_id),
+    _use_np_dtypes,
+    _timestamp_type.id(),
+    decimal_column_type(
+      _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id));
   int32_t scale = 0;
   std::vector<std::unique_ptr<column>> child_columns;
   std::unique_ptr<column> out_col = nullptr;
@@ -785,7 +796,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const size_type orc_co
       break;
 
     case orc::DECIMAL:
-      if (type == type_id::DECIMAL128) {
+      if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) {
         scale = -static_cast<int32_t>(_metadata.get_types()[orc_col_id].scale.value_or(0));
       }
       out_col = make_empty_column(data_type(type, scale));
@@ -876,8 +887,9 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   // Enable or disable the conversion to numpy-compatible dtypes
   _use_np_dtypes = options.is_enabled_use_np_dtypes();
 
-  // Control decimals conversion (float64 or int64 with optional scale)
+  // Control decimals conversion
   _decimal_cols_as_float = options.get_decimal_cols_as_float();
+  decimal128_columns     = options.get_decimal128_columns();
 }
 
 timezone_table reader::impl::compute_timezone_table(
@@ -937,14 +949,18 @@ table_with_metadata reader::impl::read(size_type skip_rows,
     // Get a list of column data types
     std::vector<data_type> column_types;
     for (auto& col : columns_level) {
-      // If the column type is orc::DECIMAL see if the user
-      // desires it to be converted to float64 or not
-      auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-        _decimal_cols_as_float, _metadata.per_file_metadata[0], col.id);
       auto col_type = to_type_id(
-        _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+        _metadata.get_col_type(col.id),
+        _use_np_dtypes,
+        _timestamp_type.id(),
+        decimal_column_type(
+          _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id));
       CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-      if (col_type == type_id::DECIMAL128) {
+      CUDF_EXPECTS(
+        (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18),
+        "Precision of column " + std::string{_metadata.column_name(0, col.id)} +
+          " is over 18, use 128-bit Decimal.");
+      if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) {
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.
@@ -1111,11 +1127,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                 .kind;
             // num_child_rows for a struct column will be same, for other nested types it will be
             // calculated.
-            chunk.num_child_rows          = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
-            auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-              _decimal_cols_as_float, _metadata.per_file_metadata[0], columns_level[col_idx].id);
-            chunk.dtype_id      = column_types[col_idx].id();
-            chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
+            chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
+            chunk.dtype_id       = column_types[col_idx].id();
+            chunk.decimal_scale  = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
                                     .ff.types[columns_level[col_idx].id]
                                     .scale.value_or(0);
 
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index c9de2211d48..64e7cbc74e5 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -222,6 +222,7 @@ class reader::impl {
   bool _use_index     = true;
   bool _use_np_dtypes = true;
   std::vector<std::string> _decimal_cols_as_float;
+  std::vector<std::string> decimal128_columns;
   data_type _timestamp_type{type_id::EMPTY};
   reader_column_meta _col_meta;
 };
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index f121e1108dc..7496fd6facd 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1053,17 +1053,25 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
         else
           vals.f64[t] = f * kPow10[min(-scale, 39)];
       } else {
-        // Since cuDF column stores just one scale, value needs to
-        // be adjusted to col_scale from val_scale. So the difference
-        // of them will be used to add 0s or remove digits.
-        int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
-        if (scale >= 0) {
-          scale        = min(scale, 27);
-          vals.i128[t] = (v * kPow5i[scale]) << scale;
-        } else  // if (scale < 0)
-        {
-          scale        = min(-scale, 27);
-          vals.i128[t] = (v / kPow5i[scale]) >> scale;
+        auto const scaled_value = [&]() {
+          // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from
+          // val_scale. So the difference of them will be used to add 0s or remove digits.
+          int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
+          if (scale >= 0) {
+            scale = min(scale, 27);
+            return (v * kPow5i[scale]) << scale;
+          } else  // if (scale < 0)
+          {
+            scale = min(-scale, 27);
+            return (v / kPow5i[scale]) >> scale;
+          }
+        }();
+        if (dtype_id == type_id::DECIMAL64) {
+          vals.i64[t] = scaled_value;
+        } else {
+          {
+            vals.i128[t] = scaled_value;
+          }
         }
       }
     }
@@ -1700,7 +1708,13 @@ __global__ void __launch_bounds__(block_size)
             case DOUBLE:
             case LONG: static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped]; break;
             case DECIMAL:
-              static_cast<__uint128_t*>(data_out)[row] = s->vals.u128[t + vals_skipped];
+              if (s->chunk.dtype_id == type_id::FLOAT64 or
+                  s->chunk.dtype_id == type_id::DECIMAL64) {
+                static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
+              } else {
+                // decimal128
+                static_cast<__uint128_t*>(data_out)[row] = s->vals.u128[t + vals_skipped];
+              }
               break;
             case MAP:
             case LIST: {
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index b29138f262c..77c4081dbbf 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -387,7 +387,9 @@ TEST_F(OrcWriterTest, MultiColumn)
   cudf_io::write_orc(out_opts);
 
   cudf_io::orc_reader_options in_opts =
-    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false);
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .use_index(false)
+      .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"});
   auto result = cudf_io::read_orc(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
@@ -406,7 +408,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int32_t>(num_rows);
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col6_vals[i], numeric::scale_type{2}};
+    return numeric::decimal64{col6_vals[i], numeric::scale_type{2}};
   });
   auto col0_mask =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); });
@@ -428,7 +430,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), col3_mask};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), col4_mask};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), col5_mask};
-  column_wrapper<numeric::decimal128> col6{col6_data, col6_data + num_rows, col6_mask};
+  column_wrapper<numeric::decimal64> col6{col6_data, col6_data + num_rows, col6_mask};
   cudf::test::lists_column_wrapper<int32_t> col7{
     {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}},
     col0_mask};
@@ -1143,10 +1145,10 @@ TEST_P(OrcWriterTestDecimal, Decimal64)
   // Using int16_t because scale causes values to overflow if they already require 32 bits
   auto const vals = random_values<int32_t>(num_rows);
   auto data       = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{vals[i], numeric::scale_type{scale}};
+    return numeric::decimal64{vals[i], numeric::scale_type{scale}};
   });
   auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; });
-  column_wrapper<numeric::decimal128> col{data, data + num_rows, mask};
+  column_wrapper<numeric::decimal64> col{data, data + num_rows, mask};
   cudf::table_view tbl({static_cast<cudf::column_view>(col)});
 
   auto filepath = temp_env->get_temp_filepath("Decimal64.orc");
@@ -1190,13 +1192,12 @@ TEST_F(OrcWriterTest, Decimal32)
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath});
   auto result = cudf_io::read_orc(in_opts);
 
-  // Need a 128bit decimal column for comparison since the reader always creates DECIMAL128 columns
-  auto data128 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
-    return numeric::decimal128{vals[i], numeric::scale_type{2}};
+  auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
+    return numeric::decimal64{vals[i], numeric::scale_type{2}};
   });
-  column_wrapper<numeric::decimal128> col128{data128, data128 + num_rows, mask};
+  column_wrapper<numeric::decimal64> col64{data64, data64 + num_rows, mask};
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col128, result.tbl->view().column(0));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0));
 }
 
 TEST_F(OrcStatisticsTest, Overflow)
@@ -1412,4 +1413,40 @@ TEST_F(OrcReaderTest, NestedColumnSelection)
   ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name);
 }
 
+TEST_F(OrcReaderTest, DecimalOptions)
+{
+  constexpr auto num_rows = 10;
+  auto col_vals           = random_values<int64_t>(num_rows);
+  auto col_data           = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{col_vals[i], numeric::scale_type{2}};
+  });
+  auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; });
+
+  column_wrapper<numeric::decimal128> col{col_data, col_data + num_rows, mask};
+  table_view expected({col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("dec");
+
+  auto filepath = temp_env->get_temp_filepath("OrcDecimalOptions.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options valid_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .decimal128_columns({"dec", "fake_name"})
+      .decimal_cols_as_float({"decc", "fake_name"});
+  // Should not throw
+  EXPECT_NO_THROW(cudf_io::read_orc(valid_opts));
+
+  cudf_io::orc_reader_options invalid_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .decimal128_columns({"dec", "fake_name"})
+      .decimal_cols_as_float({"dec", "fake_name"});
+  // Should throw, options overlap
+  EXPECT_THROW(cudf_io::read_orc(invalid_opts), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index c855f112692..f0450483345 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -36,6 +36,7 @@ cdef extern from "cudf/io/orc.hpp" \
         void enable_use_np_dtypes(bool val) except+
         void set_timestamp_type(data_type type) except+
         void set_decimal_cols_as_float(vector[string] val) except+
+        void set_decimal128_columns(vector[string] val) except+
 
         @staticmethod
         orc_reader_options_builder builder(
@@ -57,6 +58,9 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& decimal_cols_as_float(
             vector[string] val
         ) except+
+        orc_reader_options_builder& decimal128_columns(
+            vector[string] val
+        ) except+
 
         orc_reader_options build() except+
 

From 4411d8e148faa193e0b21fa8bd3bd44c9df07af1 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 5 Nov 2021 12:55:48 -0700
Subject: [PATCH 090/112] use paths for decimal types API; iron out generated
 column names

---
 cpp/include/cudf/io/orc.hpp   |  8 +++---
 cpp/src/io/orc/orc.cpp        | 16 ++++++++---
 cpp/src/io/orc/reader_impl.cu |  6 ++---
 cpp/src/io/orc/writer_impl.cu |  2 +-
 cpp/tests/io/orc_test.cpp     | 50 +++++++++++++++++++++++++++++++++++
 5 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 33f0232b4d0..fb1199fc166 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -139,7 +139,7 @@ class orc_reader_options {
   data_type get_timestamp_type() const { return _timestamp_type; }
 
   /**
-   * @brief Columns that should be converted from Decimal to Float64.
+   * @brief Fully qualified names of columns that should be converted from Decimal to Float64.
    */
   std::vector<std::string> const& get_decimal_cols_as_float() const
   {
@@ -147,7 +147,7 @@ class orc_reader_options {
   }
 
   /**
-   * @brief Columns that should be read as 128-bit Decimal
+   * @brief Fully qualified names of columns that should be read as 128-bit Decimal.
    */
   std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
 
@@ -218,7 +218,7 @@ class orc_reader_options {
   /**
    * @brief Set columns that should be converted from Decimal to Float64
    *
-   * @param val Vector of column names.
+   * @param val Vector of fully qualified column names.
    */
   void set_decimal_cols_as_float(std::vector<std::string> val)
   {
@@ -228,7 +228,7 @@ class orc_reader_options {
   /**
    * @brief Set columns that should be read as 128-bit Decimal
    *
-   * @param val Vector of column names.
+   * @param val Vector of fully qualified column names.
    */
   void set_decimal128_columns(std::vector<std::string> val)
   {
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 89eac0c9901..44cea6169e4 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -18,6 +18,8 @@
 #include "orc_field_reader.hpp"
 #include "orc_field_writer.hpp"
 
+#include <cudf/lists/lists_column_view.hpp>
+
 #include <thrust/tabulate.h>
 
 #include <string>
@@ -472,10 +474,16 @@ void metadata::init_column_names()
   thrust::tabulate(column_names.begin(), column_names.end(), [&](auto col_id) {
     if (not column_has_parent(col_id)) return std::string{};
     auto const& parent_field_names = ff.types[parent_id(col_id)].fieldNames;
-    // Child columns of lists don't have a name in ORC files, generate placeholder in that case
-    return field_index(col_id) < static_cast<size_type>(parent_field_names.size())
-             ? parent_field_names[field_index(col_id)]
-             : std::to_string(col_id);
+    if (field_index(col_id) < static_cast<size_type>(parent_field_names.size())) {
+      return parent_field_names[field_index(col_id)];
+    }
+
+    // Generate names for list and map child columns
+    if (ff.types[parent_id(col_id)].subtypes.size() == 1) {
+      return std::to_string(lists_column_view::child_column_index);
+    } else {
+      return std::to_string(field_index(col_id));
+    }
   });
 
   column_paths.resize(get_num_columns());
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index d35f2db90fc..f0612dcb42f 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -234,15 +234,15 @@ auto decimal_column_type(const std::vector<std::string>& float64_columns,
                          cudf::io::orc::metadata& metadata,
                          int column_index)
 {
-  auto const& column_name = metadata.column_name(column_index);
+  auto const& column_path = metadata.column_path(column_index);
   auto is_column_in       = [&](const std::vector<std::string>& cols) {
-    return std::find(cols.cbegin(), cols.cend(), column_name) != cols.end();
+    return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end();
   };
 
   auto const user_selected_float64    = is_column_in(float64_columns);
   auto const user_selected_decimal128 = is_column_in(decimal128_columns);
   CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128,
-               "Both decimal128 and float64 types selected for column " + column_name);
+               "Both decimal128 and float64 types selected for column " + column_path);
 
   if (user_selected_float64) return type_id::FLOAT64;
   if (user_selected_decimal128) return type_id::DECIMAL128;
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index e54c21efc47..2bf020d08a2 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1776,7 +1776,7 @@ void writer::impl::write(table_view const& table)
     [&](column_in_metadata& col_meta, std::string default_name) {
       if (col_meta.get_name().empty()) col_meta.set_name(default_name);
       for (size_type i = 0; i < col_meta.num_children(); ++i) {
-        add_default_name(col_meta.child(i), col_meta.get_name() + "." + std::to_string(i));
+        add_default_name(col_meta.child(i), std::to_string(i));
       }
     };
   for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) {
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 77c4081dbbf..4862bc74fab 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1449,4 +1449,54 @@ TEST_F(OrcReaderTest, DecimalOptions)
   EXPECT_THROW(cudf_io::read_orc(invalid_opts), cudf::logic_error);
 }
 
+TEST_F(OrcWriterTest, DecimalOptionsNested)
+{
+  auto const num_rows = 100;
+
+  auto dec_vals  = random_values<int32_t>(num_rows);
+  auto keys_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal64{dec_vals[i], numeric::scale_type{2}};
+  });
+  auto vals_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{dec_vals[i], numeric::scale_type{2}};
+  });
+  auto validity  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+  column_wrapper<numeric::decimal64> keys_col{keys_data, keys_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> vals_col{vals_data, vals_data + num_rows, validity};
+
+  auto struct_col = cudf::test::structs_column_wrapper({keys_col, vals_col}).release();
+
+  std::vector<int> row_offsets(num_rows + 1);
+  std::iota(row_offsets.begin(), row_offsets.end(), 0);
+  cudf::test::fixed_width_column_wrapper<int> offsets(row_offsets.begin(), row_offsets.end());
+
+  auto list_col =
+    cudf::make_lists_column(num_rows,
+                            offsets.release(),
+                            std::move(struct_col),
+                            cudf::UNKNOWN_NULL_COUNT,
+                            cudf::test::detail::make_null_mask(validity, validity + num_rows));
+
+  table_view expected({*list_col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("lists");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("dec64");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("dec128");
+
+  auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options in_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .use_index(false)
+      .decimal128_columns({"lists.1.dec128"});
+  auto result = cudf_io::read_orc(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 61b3677cd3f6e7f2c5dc26be4c55681effa41134 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 5 Nov 2021 13:16:22 -0700
Subject: [PATCH 091/112] small clean up

---
 cpp/src/io/orc/stripe_data.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 7496fd6facd..44f106c4f5c 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -122,14 +122,14 @@ struct orcdec_state_s {
     orc_rowdec_state_s rowdec;
   } u;
   union values {
-    uint8_t u8[block_size * 8];
-    uint32_t u32[block_size * 2];
-    int32_t i32[block_size * 2];
-    uint64_t u64[block_size];
-    int64_t i64[block_size];
-    double f64[block_size];
-    __int128_t i128[block_size];   // TMP
-    __uint128_t u128[block_size];  // TMP
+    uint8_t u8[block_size * 16];
+    uint32_t u32[block_size * 4];
+    int32_t i32[block_size * 4];
+    uint64_t u64[block_size * 2];
+    int64_t i64[block_size * 2];
+    double f64[block_size * 2];
+    __int128_t i128[block_size];
+    __uint128_t u128[block_size];
   } vals;
 };
 

From 7c01f21171de281c48ddd36915ac21437e8fff15 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 16:04:14 -0500
Subject: [PATCH 092/112] ROLLING_TEST fix

---
 cpp/tests/rolling/rolling_test.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index 038d692a323..f136fff83da 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -1203,13 +1203,11 @@ TYPED_TEST(FixedPointTests, MinMaxCountLagLeadNulls)
   using decimalXX  = TypeParam;
   using RepType    = cudf::device_storage_type_t<decimalXX>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
-  using sum_type   = std::conditional_t<std::is_same_v<RepType, __int128_t>, __int128_t, int64_t>;
-  using fpsum_wrapper = cudf::test::fixed_point_column_wrapper<sum_type>;
   using fw_wrapper    = cudf::test::fixed_width_column_wrapper<size_type>;
 
   auto const scale              = scale_type{-1};
   auto const input              = fp_wrapper{{42, 1729, 55, 343, 1, 2}, {1, 0, 1, 0, 1, 1}, scale};
-  auto const expected_sum       = fpsum_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
+  auto const expected_sum       = fp_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_min       = fp_wrapper{{42, 42, 55, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_max       = fp_wrapper{{42, 55, 55, 55, 2, 2}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_lag       = fp_wrapper{{0, 42, 1729, 55, 343, 1}, {0, 1, 0, 1, 0, 1}, scale};

From 63a00043297b97b4752608df8fbb554ae3394ca1 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 16:07:08 -0500
Subject: [PATCH 093/112] clang-format

---
 cpp/tests/rolling/rolling_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index f136fff83da..1a31192f6a4 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -1203,7 +1203,7 @@ TYPED_TEST(FixedPointTests, MinMaxCountLagLeadNulls)
   using decimalXX  = TypeParam;
   using RepType    = cudf::device_storage_type_t<decimalXX>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
-  using fw_wrapper    = cudf::test::fixed_width_column_wrapper<size_type>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<size_type>;
 
   auto const scale              = scale_type{-1};
   auto const input              = fp_wrapper{{42, 1729, 55, 343, 1, 2}, {1, 0, 1, 0, 1, 1}, scale};

From d3c589cd59c672277e6f85366cddad90e6eca22a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 16:12:36 -0500
Subject: [PATCH 094/112] Update meta.yaml

---
 conda/recipes/libcudf/meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 0e0fc816c62..df1f6bb3d37 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -116,6 +116,7 @@ test:
     - test -f $PREFIX/include/cudf/dictionary/update_keys.hpp
     - test -f $PREFIX/include/cudf/filling.hpp
     - test -f $PREFIX/include/cudf/fixed_point/fixed_point.hpp
+    - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp
     - test -f $PREFIX/include/cudf/groupby.hpp
     - test -f $PREFIX/include/cudf/hashing.hpp
     - test -f $PREFIX/include/cudf/interop.hpp

From 27a2e58b490322d4589e5a7b6f2e2b0aa64c9cfb Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 16:25:07 -0500
Subject: [PATCH 095/112] Cmake formatting

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 89e45809010..b9ddba27b7d 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -15,16 +15,15 @@
 # This function finds cucollections and sets any additional necessary environment variables.
 function(find_and_configure_cucollections)
 
-    # Find or install cuCollections
-    rapids_cpm_find(cuco 0.0
-        GLOBAL_TARGETS cuco::cuco
-        CPM_ARGS
-            GITHUB_REPOSITORY robertmaynard/cuCollections
-            GIT_TAG           bf6a90db78516e099d07e845a39012dbcaa8de18
-            OPTIONS           "BUILD_TESTS OFF"
-                              "BUILD_BENCHMARKS OFF"
-                              "BUILD_EXAMPLES OFF"
-    )
+  # Find or install cuCollections
+  rapids_cpm_find(
+    # cuCollections doesn't have a version yet
+    cuco 0.0
+    GLOBAL_TARGETS cuco::cuco
+    CPM_ARGS GITHUB_REPOSITORY robertmaynard/cuCollections
+    GIT_TAG bf6a90db78516e099d07e845a39012dbcaa8de18
+    OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
+  )
 endfunction()
 
 find_and_configure_cucollections()

From 9e2184f9607c2a7667372b6fce50fcee46356cd5 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 17:48:24 -0500
Subject: [PATCH 096/112] Cleaning up has_atomic_support

---
 cpp/include/cudf/utilities/traits.cuh | 67 +++++++++++++++++++++++++++
 cpp/src/groupby/hash/groupby.cu       | 22 +--------
 2 files changed, 69 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/cudf/utilities/traits.cuh

diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh
new file mode 100644
index 00000000000..88fcc5dd8c2
--- /dev/null
+++ b/cpp/include/cudf/utilities/traits.cuh
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <cuda/std/atomic>
+
+namespace cudf {
+
+/**
+ * @addtogroup utility_types
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Indicates whether the type `T` has support for atomics
+ *
+ * @tparam T  The type to verify
+ * @return true `T` has support for atomics
+ * @return false `T` has no support for atomics
+ */
+template <typename T>
+constexpr inline bool has_atomic_support()
+{
+  return cuda::std::atomic<T>::is_always_lock_free;
+}
+
+struct has_atomic_support_impl {
+  template <typename T>
+  constexpr bool operator()()
+  {
+    return has_atomic_support<T>();
+  }
+};
+
+/**
+ * @brief Indicates whether `type` has support for atomics
+ *
+ * @param type The `data_type` to verify
+ * @return true `type` has support for atomics
+ * @return false `type` has no support for atomics
+ */
+constexpr inline bool has_atomic_support(data_type type)
+{
+  return cudf::type_dispatcher(type, has_atomic_support_impl{});
+}
+
+/** @} */
+
+}  // namespace cudf
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index efd80485ac7..f062a132317 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -43,6 +43,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/traits.cuh>
 #include <hash/concurrent_unordered_map.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -622,25 +623,6 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
 
 }  // namespace
 
-struct has_atomic_support_type_dispatcher {
-  template <typename T>
-  bool operator()()
-  {
-    return cuda::std::atomic<T>::is_always_lock_free;
-  }
-};
-
-/**
- * @brief Indicates whether `type` has support for atomics
- *
- * @param type  The `data_type` that is being checked
- * @return      `true` if `type` has support for atomics, `false` otherwise
- */
-bool has_atomic_support(cudf::data_type const& type)
-{
-  return type_dispatcher(type, has_atomic_support_type_dispatcher{});
-}
-
 /**
  * @brief Indicates if a set of aggregation requests can be satisfied with a
  * hash-based groupby implementation.
@@ -654,7 +636,7 @@ bool has_atomic_support(cudf::data_type const& type)
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
   return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-    return has_atomic_support(r.values.type()) and
+    return cudf::has_atomic_support(r.values.type()) and
            std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
              return is_hash_aggregation(a->kind);
            });

From 8634dea7a90a688a14320c1f6c8e057e2c7742de Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 18:28:53 -0500
Subject: [PATCH 097/112] Cleanup

---
 cpp/src/groupby/hash/groupby.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f062a132317..58d2c7f09d6 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -42,8 +42,8 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/traits.cuh>
+#include <cudf/utilities/traits.hpp>
 #include <hash/concurrent_unordered_map.cuh>
 
 #include <rmm/cuda_stream_view.hpp>

From 4b5dbe2e787d89e02de341a499c863a4244418e0 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 19:14:31 -0500
Subject: [PATCH 098/112] Use has_atomic_support

---
 cpp/include/cudf/detail/aggregation/aggregation.cuh | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index e05e83991cd..52447d0ba5b 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -139,7 +140,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same_v<Source, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<Source>()) {
       using Target = target_type_t<Source, aggregation::MIN>;
       atomicMin(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -166,7 +167,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicMin(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }
@@ -189,7 +190,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same_v<Source, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<Source>()) {
       using Target = target_type_t<Source, aggregation::MAX>;
       atomicMax(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -216,7 +217,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicMax(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }
@@ -239,7 +240,7 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (not std::is_same_v<Source, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<Source>()) {
       using Target = target_type_t<Source, aggregation::SUM>;
       atomicAdd(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
@@ -266,7 +267,7 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (not std::is_same_v<DeviceSource, __int128_t>) {
+    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicAdd(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
     }

From 860bcbbaeae050a5bd5b47521a083a9854393772 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Mon, 8 Nov 2021 19:57:51 -0500
Subject: [PATCH 099/112] Fix silent failure

---
 cpp/include/cudf/detail/aggregation/aggregation.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 52447d0ba5b..ad3b1042c95 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -144,6 +144,8 @@ struct update_target_element<
       using Target = target_type_t<Source, aggregation::MIN>;
       atomicMin(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
+    } else {
+      cudf_assert(false and "Source has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
@@ -170,6 +172,8 @@ struct update_target_element<Source,
     if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicMin(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    } else {
+      cudf_assert(false and "DeviceSource has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
@@ -194,6 +198,8 @@ struct update_target_element<
       using Target = target_type_t<Source, aggregation::MAX>;
       atomicMax(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
+    } else {
+      cudf_assert(false and "Source has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
@@ -220,6 +226,8 @@ struct update_target_element<Source,
     if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicMax(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    } else {
+      cudf_assert(false and "DeviceSource has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
@@ -244,6 +252,8 @@ struct update_target_element<
       using Target = target_type_t<Source, aggregation::SUM>;
       atomicAdd(&target.element<Target>(target_index),
                 static_cast<Target>(source.element<Source>(source_index)));
+    } else {
+      cudf_assert(false and "Source has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
@@ -270,6 +280,8 @@ struct update_target_element<Source,
     if constexpr (cudf::has_atomic_support<DeviceSource>()) {
       atomicAdd(&target.element<DeviceTarget>(target_index),
                 static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    } else {
+      cudf_assert(false and "DeviceSource has no atomic support.");
     }
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }

From 89004c7e276c95f5d71b70fd66e689a94b22814f Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 9 Nov 2021 00:14:41 -0500
Subject: [PATCH 100/112] docs cleanup

---
 cpp/include/cudf/utilities/traits.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh
index 88fcc5dd8c2..43587ffa583 100644
--- a/cpp/include/cudf/utilities/traits.cuh
+++ b/cpp/include/cudf/utilities/traits.cuh
@@ -32,9 +32,9 @@ namespace cudf {
 /**
  * @brief Indicates whether the type `T` has support for atomics
  *
- * @tparam T  The type to verify
- * @return true `T` has support for atomics
- * @return false  `T` no support for atomics
+ * @tparam T     The type to verify
+ * @return true  `T` has support for atomics
+ * @return false `T` has no support for atomics
  */
 template <typename T>
 constexpr inline bool has_atomic_support()
@@ -53,8 +53,8 @@ struct has_atomic_support_impl {
 /**
  * @brief Indicates whether `type` has support for atomics
  *
- * @param type The `data_type` to verify
- * @return true `type` has support for atomics
+ * @param type   The `data_type` to verify
+ * @return true  `type` has support for atomics
  * @return false `type` no support for atomics
  */
 constexpr inline bool has_atomic_support(data_type type)

From 12e5b20a22ab36bc2ff5043c785d44bffdf4b700 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 9 Nov 2021 14:55:25 -0500
Subject: [PATCH 101/112] Cleanup

---
 cpp/src/quantiles/quantiles_util.hpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 142b0d18772..a0554833def 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -153,10 +153,7 @@ select_quantile(ValueAccessor get_value, size_type size, double q, interpolation
   }
 }
 
-template <typename Result,
-          typename Iterator,
-          typename std::enable_if_t<not cudf::is_fixed_point<Result>()>* =
-            nullptr>  // TODO revisit if this is needed
+template <typename Result, typename Iterator>
 CUDA_HOST_DEVICE_CALLABLE Result
 select_quantile_data(Iterator begin, size_type size, double q, interpolation interp)
 {

From 3ef6a09cdc1e08d8b0a57337658788dedd8851d8 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 10 Nov 2021 14:46:51 -0500
Subject: [PATCH 102/112] Additional decimal128 string tests

---
 cpp/tests/strings/fixed_point_tests.cpp | 27 +++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 3ceaba2637b..898d2105b63 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -84,6 +84,33 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVeryLarge)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
+TEST_F(StringsConvertTest, ToFixedPointVeryLargeDecimal128)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const strings = cudf::test::strings_column_wrapper(
+    {"1234000000000000000000",
+     "-876000000000000000000",
+     "5432e+17",
+     "-12E016",
+     "250000000000000000",
+     "-2800000000000000",
+     "",
+     "-0.0",
+     "170141183460469231731687303715884105727",
+     "17014118346046923173168730371588410572700000000000000000000"});
+
+  auto const type     = cudf::data_type{cudf::type_to_id<decimal128>(), scale_type{20}};
+  auto const results  = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type);
+  auto const expected = fp_wrapper{
+    {12, -8, 5, 0, 0, 0, 0, 0, 1701411834604692317, cuda::std::numeric_limits<__int128_t>::max()},
+    scale_type{20}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+}
+
 TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVerySmall)
 {
   using DecimalType  = TypeParam;

From ec8e74afb19f158ae638d7a7a85ca8ecf80f4dc2 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Thu, 11 Nov 2021 11:15:05 -0500
Subject: [PATCH 103/112] count_digits

---
 cpp/src/strings/convert/utilities.cuh   | 44 +++++++++----------------
 cpp/tests/strings/fixed_point_tests.cpp | 42 ++++++++++++++++++++---
 2 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh
index 0006592e599..234ecf48f2e 100644
--- a/cpp/src/strings/convert/utilities.cuh
+++ b/cpp/src/strings/convert/utilities.cuh
@@ -96,38 +96,26 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer)
 template <typename IntegerType>
 constexpr size_type count_digits(IntegerType value)
 {
-  // TODO definitely broken
   if (value == 0) return 1;
-  bool is_negative = cuda::std::is_signed<IntegerType>() ? (value < 0) : false;
+  bool const is_negative = cuda::std::is_signed<IntegerType>() ? (value < 0) : false;
   // abs(std::numeric_limits<IntegerType>::min()) is negative;
   // for all integer types, the max() and min() values have the same number of digits
-  value = (value == std::numeric_limits<IntegerType>::min())
-            ? std::numeric_limits<IntegerType>::max()
+  value = (value == cuda::std::numeric_limits<IntegerType>::min())
+            ? cuda::std::numeric_limits<IntegerType>::max()
             : cudf::util::absolute_value(value);
-  // largest 8-byte unsigned value is 18446744073709551615 (20 digits)
-  // clang-format off
-  size_type digits =
-    (value < 10 ? 1 :
-    (value < 100 ? 2 :
-    (value < 1000 ? 3 :
-    (value < 10000 ? 4 :
-    (value < 100000 ? 5 :
-    (value < 1000000 ? 6 :
-    (value < 10000000 ? 7 :
-    (value < 100000000 ? 8 :
-    (value < 1000000000 ? 9 :
-    (value < 10000000000 ? 10 :
-    (value < 100000000000 ? 11 :
-    (value < 1000000000000 ? 12 :
-    (value < 10000000000000 ? 13 :
-    (value < 100000000000000 ? 14 :
-    (value < 1000000000000000 ? 15 :
-    (value < 10000000000000000 ? 16 :
-    (value < 100000000000000000 ? 17 :
-    (value < 1000000000000000000 ? 18 :
-    (value < 10000000000000000000 ? 19 :
-    20)))))))))))))))))));
-  // clang-format on
+
+  auto const digits = [value] {
+    // largest 8-byte  unsigned value is 18446744073709551615 (20 digits)
+    // largest 16-byte unsigned value is 340282366920938463463374607431768211455 (39 digits)
+    auto constexpr max_digits = std::is_same_v<IntegerType, __int128_t> ? 39 : 20;
+
+    size_type digits = 1;
+    __int128_t pow10 = 10;
+    for (; digits < max_digits; ++digits, pow10 *= 10)
+      if (value < pow10) break;
+    return digits;
+  }();
+
   return digits + static_cast<size_type>(is_negative);
 }
 
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 8688ea76800..fe94ffe23d7 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -84,7 +84,7 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVeryLarge)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
-TEST_F(StringsConvertTest, ToFixedPointVeryLargeDecimal128)
+TEST_F(StringsConvertTest, ToFixedPointDecimal128)
 {
   using namespace numeric;
   using RepType    = cudf::device_storage_type_t<decimal128>;
@@ -102,15 +102,47 @@ TEST_F(StringsConvertTest, ToFixedPointVeryLargeDecimal128)
      "170141183460469231731687303715884105727",
      "17014118346046923173168730371588410572700000000000000000000"});
 
-  auto const type     = cudf::data_type{cudf::type_to_id<decimal128>(), scale_type{20}};
+  auto const scale    = scale_type{20};
+  auto const type     = cudf::data_type{cudf::type_to_id<decimal128>(), scale};
   auto const results  = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type);
-  auto const expected = fp_wrapper{
-    {12, -8, 5, 0, 0, 0, 0, 0, 1701411834604692317, cuda::std::numeric_limits<__int128_t>::max()},
-    scale_type{20}};
+  auto const max      = cuda::std::numeric_limits<__int128_t>::max();
+  auto const expected = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0, 1701411834604692317, max}, scale};
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
+TEST_F(StringsConvertTest, FromFixedPointDecimal128)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    auto const input    = fp_wrapper{{110}, numeric::scale_type{-2}};
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected = cudf::test::strings_column_wrapper({"1.10"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input =
+      fp_wrapper({110, cuda::std::numeric_limits<__int128_t>::max()}, numeric::scale_type{2});
+    auto results = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"11000", "17014118346046923173168730371588410572700"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input    = fp_wrapper({-222}, numeric::scale_type{0});
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected = cudf::test::strings_column_wrapper({"-222"});
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+}
+
 TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVerySmall)
 {
   using DecimalType  = TypeParam;

From e365080b2b43929c53f71da592ada402975d2eef Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Thu, 11 Nov 2021 16:50:22 -0500
Subject: [PATCH 104/112] final string changes

---
 cpp/include/cudf/fixed_point/temporary.hpp    | 10 ++++++++
 .../strings/convert/convert_fixed_point.cu    |  5 ++--
 cpp/tests/strings/fixed_point_tests.cpp       | 23 ++++++++++++++-----
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 49c83090da7..90c98130fdc 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
 //       Jitify is needed for several algorithms (binaryop, rolling, etc)
@@ -72,6 +73,15 @@ CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs)
 {
   return lhs > rhs ? lhs : rhs;
 }
+template <typename BaseType>
+constexpr auto exp10(int32_t exponent)
+{
+  CUDF_EXPECTS(exponent >= 0, "Exponent must be greater than 0.");
+  BaseType value = 1;
+  while (exponent > 0)
+    value *= 10, --exponent;
+  return value;
+}
 
 }  // namespace detail
 }  // namespace numeric
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 847e7c3566a..6944a8eb097 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -210,8 +210,7 @@ struct decimal_to_string_size_fn {
     if (scale >= 0) return count_digits(value) + scale;
 
     auto const abs_value = numeric::detail::abs(value);
-    auto const exp_ten   = static_cast<int64_t>(exp10(
-      static_cast<double>(-scale)));  // TODO probably broken (might need numeric::detail::exp10)
+    auto const exp_ten   = numeric::detail::exp10<DecimalType>(-scale);
     auto const fraction  = count_digits(abs_value % exp_ten);
     auto const num_zeros = std::max(0, (-scale - fraction));
     return static_cast<int32_t>(value < 0) +    // sign if negative
@@ -253,7 +252,7 @@ struct decimal_to_string_fn {
     //       fraction = abs(value) % (10^abs(scale))
     if (value < 0) *d_buffer++ = '-';  // add sign
     auto const abs_value = numeric::detail::abs(value);
-    auto const exp_ten   = static_cast<int64_t>(exp10(static_cast<double>(-scale)));
+    auto const exp_ten   = numeric::detail::exp10<DecimalType>(-scale);
     auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten)));
 
     d_buffer += integer_to_string(abs_value / exp_ten, d_buffer);  // add the integer part
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index fe94ffe23d7..b96706c5eb2 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -117,18 +117,29 @@ TEST_F(StringsConvertTest, FromFixedPointDecimal128)
   using RepType    = cudf::device_storage_type_t<decimal128>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
 
+  auto constexpr max = cuda::std::numeric_limits<__int128_t>::max();
+
   {
-    auto const input    = fp_wrapper{{110}, numeric::scale_type{-2}};
-    auto results        = cudf::strings::from_fixed_point(input);
-    auto const expected = cudf::test::strings_column_wrapper({"1.10"});
+    auto const input = fp_wrapper{{110, max}, numeric::scale_type{-2}};
+    auto results     = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"1.10", "1701411834604692317316873037158841057.27"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input = fp_wrapper{{max}, numeric::scale_type{-38}};
+    auto results     = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"1.70141183460469231731687303715884105727"});
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
 
   {
-    auto const input =
-      fp_wrapper({110, cuda::std::numeric_limits<__int128_t>::max()}, numeric::scale_type{2});
-    auto results = cudf::strings::from_fixed_point(input);
+    auto const input = fp_wrapper({110, max}, numeric::scale_type{2});
+    auto results     = cudf::strings::from_fixed_point(input);
     auto const expected =
       cudf::test::strings_column_wrapper({"11000", "17014118346046923173168730371588410572700"});
 

From a0d5d0cc91fa1c1a58c6b1866418304aa9957081 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 12 Nov 2021 13:44:16 -0500
Subject: [PATCH 105/112] use enable_if

---
 .../cudf/detail/aggregation/aggregation.cuh   | 99 ++++++++-----------
 1 file changed, 41 insertions(+), 58 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index ad3b1042c95..85bfdd9c00d 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -132,7 +132,8 @@ struct update_target_element<
   aggregation::MIN,
   target_has_nulls,
   source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && !is_fixed_point<Source>()>> {
+  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                   !is_fixed_point<Source>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -140,24 +141,19 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (cudf::has_atomic_support<Source>()) {
-      using Target = target_type_t<Source, aggregation::MIN>;
-      atomicMin(&target.element<Target>(target_index),
-                static_cast<Target>(source.element<Source>(source_index)));
-    } else {
-      cudf_assert(false and "Source has no atomic support.");
-    }
+    using Target = target_type_t<Source, aggregation::MIN>;
+    atomicMin(&target.element<Target>(target_index),
+              static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
 
 template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::MIN,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_fixed_point<Source>()>> {
+  struct update_target_element < Source,
+  aggregation::MIN, target_has_nulls, source_has_nulls,
+  std::enable_if_t<is_fixed_point<Source>() &&
+                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -169,12 +165,8 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
-      atomicMin(&target.element<DeviceTarget>(target_index),
-                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-    } else {
-      cudf_assert(false and "DeviceSource has no atomic support.");
-    }
+    atomicMin(&target.element<DeviceTarget>(target_index),
+              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -186,7 +178,8 @@ struct update_target_element<
   aggregation::MAX,
   target_has_nulls,
   source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && !is_fixed_point<Source>()>> {
+  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                   !is_fixed_point<Source>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -194,24 +187,22 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (cudf::has_atomic_support<Source>()) {
-      using Target = target_type_t<Source, aggregation::MAX>;
-      atomicMax(&target.element<Target>(target_index),
-                static_cast<Target>(source.element<Source>(source_index)));
-    } else {
-      cudf_assert(false and "Source has no atomic support.");
-    }
+    using Target = target_type_t<Source, aggregation::MAX>;
+    atomicMax(&target.element<Target>(target_index),
+              static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
 
 template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::MAX,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_fixed_point<Source>()>> {
+struct update_target_element<
+  Source,
+  aggregation::MAX,
+  target_has_nulls,
+  source_has_nulls,
+  std::enable_if_t<is_fixed_point<Source>() &&
+                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -223,12 +214,8 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
-      atomicMax(&target.element<DeviceTarget>(target_index),
-                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-    } else {
-      cudf_assert(false and "DeviceSource has no atomic support.");
-    }
+    atomicMax(&target.element<DeviceTarget>(target_index),
+              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -240,7 +227,8 @@ struct update_target_element<
   aggregation::SUM,
   target_has_nulls,
   source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && !is_fixed_point<Source>()>> {
+  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                   !is_fixed_point<Source>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -248,24 +236,22 @@ struct update_target_element<
   {
     if (source_has_nulls and source.is_null(source_index)) { return; }
 
-    if constexpr (cudf::has_atomic_support<Source>()) {
-      using Target = target_type_t<Source, aggregation::SUM>;
-      atomicAdd(&target.element<Target>(target_index),
-                static_cast<Target>(source.element<Source>(source_index)));
-    } else {
-      cudf_assert(false and "Source has no atomic support.");
-    }
+    using Target = target_type_t<Source, aggregation::SUM>;
+    atomicAdd(&target.element<Target>(target_index),
+              static_cast<Target>(source.element<Source>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
 };
 
 template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::SUM,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_fixed_point<Source>()>> {
+struct update_target_element<
+  Source,
+  aggregation::SUM,
+  target_has_nulls,
+  source_has_nulls,
+  std::enable_if_t<is_fixed_point<Source>() &&
+                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
@@ -277,12 +263,8 @@ struct update_target_element<Source,
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
 
-    if constexpr (cudf::has_atomic_support<DeviceSource>()) {
-      atomicAdd(&target.element<DeviceTarget>(target_index),
-                static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-    } else {
-      cudf_assert(false and "DeviceSource has no atomic support.");
-    }
+    atomicAdd(&target.element<DeviceTarget>(target_index),
+              static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
 
     if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
   }
@@ -292,7 +274,8 @@ struct update_target_element<Source,
  * @brief Function object to update a single element in a target column using
  * the dictionary key addressed by the specific index.
  *
- * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a dictionary.
+ * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a
+ * dictionary.
  *
  */
 template <bool target_has_nulls = true>

From dd379506bd7dcbcfa0b0b11e1f2c135d9bc12a3e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 12 Nov 2021 13:49:06 -0500
Subject: [PATCH 106/112] clang-format

---
 cpp/include/cudf/detail/aggregation/aggregation.cuh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 85bfdd9c00d..47aa7d18489 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -150,8 +150,11 @@ struct update_target_element<
 };
 
 template <typename Source, bool target_has_nulls, bool source_has_nulls>
-  struct update_target_element < Source,
-  aggregation::MIN, target_has_nulls, source_has_nulls,
+struct update_target_element<
+  Source,
+  aggregation::MIN,
+  target_has_nulls,
+  source_has_nulls,
   std::enable_if_t<is_fixed_point<Source>() &&
                    cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
   __device__ void operator()(mutable_column_device_view target,

From fc4c1d1cbb5a97053122a102bc6bf93668b961b2 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Fri, 12 Nov 2021 15:36:56 -0500
Subject: [PATCH 107/112] Fix fix

---
 cpp/include/cudf/fixed_point/temporary.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 90c98130fdc..360794c8ade 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/types.hpp>
-#include <cudf/utilities/error.hpp>
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
 //       Jitify is needed for several algorithms (binaryop, rolling, etc)
@@ -76,7 +75,6 @@ CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs)
 template <typename BaseType>
 constexpr auto exp10(int32_t exponent)
 {
-  CUDF_EXPECTS(exponent >= 0, "Exponent must be greater than 0.");
   BaseType value = 1;
   while (exponent > 0)
     value *= 10, --exponent;

From 08da157ee591fca0aaf036ce3a76958078f9025a Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Sun, 14 Nov 2021 20:53:20 -0500
Subject: [PATCH 108/112] Cleanup

---
 cpp/include/cudf/fixed_point/temporary.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 360794c8ade..2b50e273517 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -72,6 +72,7 @@ CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs)
 {
   return lhs > rhs ? lhs : rhs;
 }
+
 template <typename BaseType>
 constexpr auto exp10(int32_t exponent)
 {

From 201a091f64e06d58472020dad4760e76d204f900 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 16 Nov 2021 11:35:45 -0500
Subject: [PATCH 109/112] is_chrono min/max identity

---
 cpp/include/cudf/detail/utilities/device_operators.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 0c85f448134..95605dc8a71 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -129,7 +129,7 @@ struct DeviceMin {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (cudf::is_chrono<T>()) return std::numeric_limits<T>::max();
+    if constexpr (cudf::is_chrono<T>()) return T::max();
     return cuda::std::numeric_limits<T>::max();
   }
 
@@ -171,7 +171,7 @@ struct DeviceMax {
                               !cudf::is_fixed_point<T>()>* = nullptr>
   static constexpr T identity()
   {
-    if constexpr (cudf::is_chrono<T>()) return std::numeric_limits<T>::lowest();
+    if constexpr (cudf::is_chrono<T>()) return T::min();
     return cuda::std::numeric_limits<T>::lowest();
   }
 

From f0afd8dcbf788ff5b13b8cbc367584182bb046a7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 16 Nov 2021 12:57:18 -0500
Subject: [PATCH 110/112] Use exp10

---
 cpp/include/cudf/fixed_point/fixed_point.hpp |  2 +-
 cpp/tests/strings/fixed_point_tests.cpp      | 24 ++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index b356d857f32..e8223b53997 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -555,7 +555,7 @@ class fixed_point {
   {
     if (_scale < 0) {
       auto const av = detail::abs(_value);
-      Rep const n   = std::pow(10, -_scale);  // does this work for all values of __int128
+      Rep const n   = detail::exp10<Rep>(-_scale);
       Rep const f   = av % n;
       auto const num_zeros =
         std::max(0, (-_scale - static_cast<int32_t>(detail::to_string(f).size())));
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index b96706c5eb2..d5ad57e7958 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
+#include <limits>
 
 #include <tests/strings/utilities.h>
 
@@ -301,3 +302,26 @@ TEST_F(StringsConvertTest, IsFixedPoint)
     {true, true, true, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled);
 }
+
+TEST_F(StringsConvertTest, FixedPointStringConversionOperator)  // std::string conversion of decimal128
+{
+  auto const max = cuda::std::numeric_limits<__int128_t>::max();  // largest __int128_t value
+
+  auto const x = numeric::decimal128{max, numeric::scale_type{-10}};  // built directly from (value, scale)
+  EXPECT_EQ(static_cast<std::string>(x), "17014118346046923173168730371.5884105727");
+
+  auto const y = numeric::decimal128{max, numeric::scale_type{10}};  // same constructor, positive scale
+  EXPECT_EQ(static_cast<std::string>(y), "170141183460469231731687303710000000000");
+
+  auto const z = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{10}}};  // built via scaled_integer
+  EXPECT_EQ(static_cast<std::string>(z), "1701411834604692317316873037158841057270000000000");
+
+  auto const a = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{40}}};
+  EXPECT_EQ(static_cast<std::string>(a), "1701411834604692317316873037158841057270000000000000000000000000000000000000000");
+
+  auto const b = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-20}}};
+  EXPECT_EQ(static_cast<std::string>(b), "1701411834604692317.31687303715884105727");
+
+  auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}};  // scale -38
+  EXPECT_EQ(static_cast<std::string>(c), "1.70141183460469231731687303715884105727");  // 38 fractional digits expected
+}
\ No newline at end of file

From 95ee95c633d49670c15bb4f63142ad710292f6c5 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 16 Nov 2021 14:00:06 -0500
Subject: [PATCH 111/112] clang-format

---
 cpp/tests/strings/fixed_point_tests.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index d5ad57e7958..7c188d39f6f 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -317,7 +317,8 @@ TEST_F(StringsConvertTest, FixedPointStringConversionOperator)
   EXPECT_EQ(static_cast<std::string>(z), "1701411834604692317316873037158841057270000000000");
 
   auto const a = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{40}}};
-  EXPECT_EQ(static_cast<std::string>(a), "1701411834604692317316873037158841057270000000000000000000000000000000000000000");
+  EXPECT_EQ(static_cast<std::string>(a),
+            "1701411834604692317316873037158841057270000000000000000000000000000000000000000");
 
   auto const b = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-20}}};
   EXPECT_EQ(static_cast<std::string>(b), "1701411834604692317.31687303715884105727");

From 0b7c32e1821460bba38410cc44f2be0e430eb7ad Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Wed, 17 Nov 2021 04:11:22 +0530
Subject: [PATCH 112/112] Writer changes

---
 cpp/src/io/parquet/chunk_dict.cu  |  1 +
 cpp/src/io/parquet/page_enc.cu    | 29 +++++++++++++++--
 cpp/src/io/parquet/writer_impl.cu |  9 ++++--
 cpp/tests/io/parquet_test.cpp     | 52 +++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index 64b3dd69c0d..9617fca0af3 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -188,6 +188,7 @@ __global__ void __launch_bounds__(block_size, 1)
                 return 4 + data_col.element<string_view>(val_idx).size_bytes();
               }
             case Type::FIXED_LEN_BYTE_ARRAY:
+              if (data_col.type().id() == type_id::DECIMAL128) { return 16; }
             default: cudf_assert(false && "Unsupported type for dictionary encoding"); return 0;
           }
         }();
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 48490426db7..1af62d998be 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -176,7 +176,8 @@ __global__ void __launch_bounds__(block_size)
     }
   }
   dtype     = s->col.physical_type;
-  dtype_len = (dtype == INT96)                      ? 12
+  dtype_len = (dtype == FIXED_LEN_BYTE_ARRAY)       ? 16
+              : (dtype == INT96)                    ? 12
               : (dtype == INT64 || dtype == DOUBLE) ? 8
               : (dtype == BOOLEAN)                  ? 1
                                                     : 4;
@@ -878,7 +879,8 @@ __global__ void __launch_bounds__(128, 8)
   // Encode data values
   __syncthreads();
   dtype         = s->col.physical_type;
-  dtype_len_out = (dtype == INT96)                      ? 12
+  dtype_len_out = (dtype == FIXED_LEN_BYTE_ARRAY)       ? 16
+                  : (dtype == INT96)                    ? 12
                   : (dtype == INT64 || dtype == DOUBLE) ? 8
                   : (dtype == BOOLEAN)                  ? 1
                                                         : 4;
@@ -1087,6 +1089,29 @@ __global__ void __launch_bounds__(128, 8)
             dst[pos + 3] = v >> 24;
             if (v != 0) memcpy(dst + pos + 4, str.data(), v);
           } break;
+          case FIXED_LEN_BYTE_ARRAY: {
+            if (s->col.leaf_column->type().id() == type_id::DECIMAL128) {
+              // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian
+              auto v        = s->col.leaf_column->element<numeric::decimal128>(val_idx).value();
+              auto v_       = reinterpret_cast<char*>(&v);
+              dst[pos + 0]  = v_[15];
+              dst[pos + 1]  = v_[14];
+              dst[pos + 2]  = v_[13];
+              dst[pos + 3]  = v_[12];
+              dst[pos + 4]  = v_[11];
+              dst[pos + 5]  = v_[10];
+              dst[pos + 6]  = v_[9];
+              dst[pos + 7]  = v_[8];
+              dst[pos + 8]  = v_[7];
+              dst[pos + 9]  = v_[6];
+              dst[pos + 10] = v_[5];
+              dst[pos + 11] = v_[4];
+              dst[pos + 12] = v_[3];
+              dst[pos + 13] = v_[2];
+              dst[pos + 14] = v_[1];
+              dst[pos + 15] = v_[0];
+            }
+          } break;
         }
       }
       __syncthreads();
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 2c7d745bb4c..c01a4f2f9d1 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -343,7 +343,9 @@ struct leaf_schema_fn {
       col_schema.type        = Type::INT64;
       col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
     } else if (std::is_same_v<T, numeric::decimal128>) {
-      CUDF_FAIL("decimal128 currently not supported for parquet writer");
+      col_schema.type        = Type::FIXED_LEN_BYTE_ARRAY;
+      col_schema.type_length = 16;
+      col_schema.stats_dtype = statistics_dtype::dtype_decimal128;
     } else {
       CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }
@@ -1208,8 +1210,9 @@ void writer::impl::write(table_view const& table)
   hostdevice_2dvector<gpu::EncColumnChunk> chunks(num_rowgroups, num_columns, stream);
   for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups;
        r++, global_r++) {
-    uint32_t fragments_in_chunk = (uint32_t)(
-      (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size);
+    uint32_t fragments_in_chunk =
+      (uint32_t)((md.row_groups[global_r].num_rows + max_page_fragment_size - 1) /
+                 max_page_fragment_size);
     md.row_groups[global_r].total_byte_size = 0;
     md.row_groups[global_r].columns.resize(num_columns);
     for (int i = 0; i < num_columns; i++) {
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 3bae8d7ab1e..1438c3a28c0 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -463,6 +463,58 @@ TEST_F(ParquetWriterTest, MultiColumn)
   cudf::test::expect_metadata_equal(expected_metadata, result.metadata);
 }
 
+// Round-trips decimal32, decimal64 and decimal128 columns through the Parquet writer and reader.
+TEST_F(ParquetWriterTest, DecimalColumns)
+{
+  constexpr auto num_rows = 5;
+
+  auto col6_vals = random_values<int32_t>(num_rows);
+  auto col7_vals = random_values<int64_t>(num_rows);
+  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) {
+    return numeric::decimal32{col6_vals[i], numeric::scale_type{5}};
+  });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) {
+    return numeric::decimal64{col7_vals[i], numeric::scale_type{5}};
+  });
+  auto col8_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    return numeric::decimal128{i * 10000, numeric::scale_type{2}};
+  });
+  auto validity  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+
+  // The validity iterator always yields true, so no column contains nulls.
+  // Each wrapper consumes the first num_rows elements of its data iterator.
+  column_wrapper<numeric::decimal32> col6{col6_data, col6_data + num_rows, validity};
+  column_wrapper<numeric::decimal64> col7{col7_data, col7_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> col8{col8_data, col8_data + num_rows, validity};
+
+  std::vector<std::unique_ptr<column>> cols;
+  // Push order fixes the column indices used for the metadata below.
+  cols.push_back(col6.release());
+  cols.push_back(col7.release());
+  cols.push_back(col8.release());
+  auto expected = std::make_unique<table>(std::move(cols));
+  EXPECT_EQ(3, expected->num_columns());
+
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  // Name each column and give it a decimal precision of 10 before writing.
+  expected_metadata.column_metadata[0].set_name("decimal32s").set_decimal_precision(10);
+  expected_metadata.column_metadata[1].set_name("decimal64s").set_decimal_precision(10);
+  expected_metadata.column_metadata[2].set_name("decimal128s").set_decimal_precision(10);
+
+  auto filepath = ("DecimalColumns.parquet");
+  cudf_io::parquet_writer_options out_opts =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
+      .metadata(&expected_metadata);
+  cudf_io::write_parquet(out_opts);
+
+  cudf_io::parquet_reader_options in_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
+  cudf::test::expect_metadata_equal(expected_metadata, result.metadata);
+}
+
 TEST_F(ParquetWriterTest, MultiColumnWithNulls)
 {
   constexpr auto num_rows = 100;