From c1f20c7be6bc7cab8af1fb20032f05c186d41cfe Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Tue, 16 Nov 2021 18:18:04 -0500 Subject: [PATCH 01/72] Add support for `decimal128` (#9483) Fixes https://github.com/rapidsai/cudf/issues/9597 Fixes https://github.com/rapidsai/cudf/issues/9565 Previously, `fixed_point` along with `decimal32` and `decimal64` were added to support DecimalType (see https://github.com/rapidsai/cudf/issues/3556 for a list of major and minor PRs). With [support for `__int128_t` now in CUDA 11.5](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-general-new-features), we can support `decimal128`. This PR enables `decimal128`. Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Mark Harris (https://github.com/harrism) - AJ Schmidt (https://github.com/ajschmidt8) - Jake Hemstad (https://github.com/jrhemstad) --- CONTRIBUTING.md | 4 +- conda/recipes/libcudf/meta.yaml | 1 + .../cudf/column/column_device_view.cuh | 27 +- .../cudf/detail/aggregation/aggregation.cuh | 70 +++-- .../cudf/detail/aggregation/aggregation.hpp | 15 +- cpp/include/cudf/detail/copy_if.cuh | 7 +- cpp/include/cudf/detail/iterator.cuh | 3 +- .../detail/utilities/device_operators.cuh | 15 +- .../cudf/detail/utilities/hash_functions.cuh | 14 + .../cudf/detail/utilities/integer_utils.hpp | 12 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 25 +- cpp/include/cudf/fixed_point/temporary.hpp | 86 ++++++ cpp/include/cudf/io/orc.hpp | 34 ++- cpp/include/cudf/scalar/scalar.hpp | 2 +- .../strings/detail/convert/fixed_point.cuh | 14 +- cpp/include/cudf/types.hpp | 3 +- cpp/include/cudf/utilities/traits.cuh | 67 +++++ cpp/include/cudf/utilities/traits.hpp | 5 +- .../cudf/utilities/type_dispatcher.hpp | 54 ++-- cpp/include/cudf_test/column_wrapper.hpp | 18 +- cpp/include/cudf_test/type_lists.hpp | 3 +- cpp/src/aggregation/aggregation.cpp | 6 +- cpp/src/aggregation/aggregation.cu | 5 +- cpp/src/binaryop/binaryop.cpp | 266 ++---------------- cpp/src/filling/fill.cu | 14 +- cpp/src/groupby/hash/groupby.cu | 10 +- cpp/src/io/json/json_gpu.cu | 8 + cpp/src/io/orc/aggregate_orc_metadata.hpp | 4 +- cpp/src/io/orc/orc.cpp | 16 +- cpp/src/io/orc/orc_gpu.h | 6 +- cpp/src/io/orc/reader_impl.cu | 72 +++-- cpp/src/io/orc/reader_impl.hpp | 1 + cpp/src/io/orc/stripe_data.cu | 122 ++++---- cpp/src/io/orc/stripe_enc.cu | 18 +- cpp/src/io/orc/writer_impl.cu | 27 +- cpp/src/io/parquet/reader_impl.cu | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 + cpp/src/jit/type.cpp | 1 + cpp/src/quantiles/quantiles_util.hpp | 1 + cpp/src/reductions/scan/scan.cuh | 2 +- cpp/src/reductions/scan/scan_exclusive.cu | 5 +- cpp/src/reductions/simple.cuh | 4 +- cpp/src/round/round.cu | 23 +- cpp/src/scalar/scalar.cpp | 3 + .../strings/convert/convert_fixed_point.cu | 17 +- cpp/src/strings/convert/utilities.cuh | 47 ++-- cpp/src/transform/row_bit_count.cu | 2 +- cpp/src/unary/cast_ops.cu | 10 +- cpp/src/unary/math_ops.cu | 2 +- cpp/tests/binaryop/binop-compiled-test.cpp | 136 +++++---- cpp/tests/binaryop/binop-integration-test.cpp | 196 +++++++------ cpp/tests/copying/concatenate_tests.cu | 6 +- cpp/tests/copying/scatter_tests.cpp | 6 +- cpp/tests/filling/fill_tests.cpp | 43 +++ cpp/tests/fixed_point/fixed_point_tests.cpp | 50 ++-- cpp/tests/fixed_point/fixed_point_tests.cu | 8 +- cpp/tests/groupby/count_scan_tests.cpp | 6 +-
cpp/tests/groupby/count_tests.cpp | 6 +- cpp/tests/groupby/max_scan_tests.cpp | 6 +- cpp/tests/groupby/max_tests.cpp | 8 +- cpp/tests/groupby/min_scan_tests.cpp | 6 +- cpp/tests/groupby/min_tests.cpp | 8 +- cpp/tests/groupby/sum_scan_tests.cpp | 17 +- cpp/tests/groupby/sum_tests.cpp | 30 +- cpp/tests/io/orc_test.cpp | 114 +++++++- cpp/tests/merge/merge_test.cpp | 6 +- cpp/tests/reductions/reduction_tests.cpp | 98 +++++-- cpp/tests/reductions/scan_tests.hpp | 5 + cpp/tests/replace/replace_tests.cpp | 6 +- .../reshape/interleave_columns_tests.cpp | 6 +- cpp/tests/rolling/rolling_test.cpp | 12 +- cpp/tests/round/round_tests.cpp | 62 ++++ cpp/tests/search/search_test.cpp | 8 +- cpp/tests/sort/sort_test.cpp | 6 +- cpp/tests/strings/fixed_point_tests.cpp | 133 +++++++-- cpp/tests/transform/row_bit_count_test.cu | 1 - cpp/tests/unary/cast_tests.cpp | 205 +++++++++++--- python/cudf/cudf/_lib/cpp/io/orc.pxd | 4 + 78 files changed, 1481 insertions(+), 892 deletions(-) create mode 100644 cpp/include/cudf/fixed_point/temporary.hpp create mode 100644 cpp/include/cudf/utilities/traits.cuh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f83d7c5b759..aae62fbd47c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -62,12 +62,12 @@ The following instructions are for developers and contributors to cuDF OSS devel Compilers: * `gcc` version 9.3+ -* `nvcc` version 11.0+ +* `nvcc` version 11.5+ * `cmake` version 3.20.1+ CUDA/GPU: -* CUDA 11.0+ +* CUDA 11.5+ * NVIDIA driver 450.80.02+ * Pascal architecture or better diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d39c7aaa39d..e78110f3233 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -117,6 +117,7 @@ test: - test -f $PREFIX/include/cudf/dictionary/update_keys.hpp - test -f $PREFIX/include/cudf/filling.hpp - test -f $PREFIX/include/cudf/fixed_point/fixed_point.hpp + - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp - test -f $PREFIX/include/cudf/groupby.hpp - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 0b7ca2096a3..6ecb0796283 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -421,39 +421,22 @@ class alignas(16) column_device_view : public detail::column_device_view_base { } /** - * @brief Returns a `numeric::decimal32` element at the specified index for a `fixed_point` + * @brief Returns a `numeric::fixed_point` element at the specified index for a `fixed_point` * column. * * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, * then any attempt to use the result will lead to undefined behavior. * * @param element_index Position of the desired element - * @return numeric::decimal32 representing the element at this index + * @return numeric::fixed_point representing the element at this index */ - template )> + template ())> __device__ T element(size_type element_index) const noexcept { using namespace numeric; + using rep = typename T::rep; auto const scale = scale_type{_type.scale()}; - return decimal32{scaled_integer{data()[element_index], scale}}; - } - - /** - * @brief Returns a `numeric::decimal64` element at the specified index for a `fixed_point` - * column. 
- * - * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, - * then any attempt to use the result will lead to undefined behavior. - * - * @param element_index Position of the desired element - * @return numeric::decimal64 representing the element at this index - */ - template )> - __device__ T element(size_type element_index) const noexcept - { - using namespace numeric; - auto const scale = scale_type{_type.scale()}; - return decimal64{scaled_integer{data()[element_index], scale}}; + return T{scaled_integer{data()[element_index], scale}}; } /** diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 53c1f47c201..47aa7d18489 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -131,7 +132,8 @@ struct update_target_element< aggregation::MIN, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -148,11 +150,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -177,7 +181,8 @@ struct update_target_element< aggregation::MAX, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -194,11 +199,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -223,7 +230,8 @@ struct update_target_element< aggregation::SUM, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -240,11 +248,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -267,7 +277,8 @@ struct update_target_element @@ -581,9 +592,7 @@ struct identity_initializer { template static constexpr bool is_supported() { - // Note: !is_fixed_point() means that aggregations for fixed_point should happen on the - // underlying type (see device_storage_type_t), not that fixed_point is not supported - return cudf::is_fixed_width() && !is_fixed_point() and + return cudf::is_fixed_width() and (k == aggregation::SUM or k == aggregation::MIN or k == 
aggregation::MAX or k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or k == aggregation::ARGMAX or k == aggregation::ARGMIN or @@ -596,7 +605,8 @@ struct identity_initializer { std::enable_if_t, void>::value, T> identity_from_operator() { - return corresponding_operator_t::template identity(); + using DeviceType = device_storage_type_t; + return corresponding_operator_t::template identity(); } template @@ -613,9 +623,11 @@ struct identity_initializer { if constexpr (cudf::is_timestamp()) return k == aggregation::ARGMAX ? T{typename T::duration(ARGMAX_SENTINEL)} : T{typename T::duration(ARGMIN_SENTINEL)}; - else - return k == aggregation::ARGMAX ? static_cast(ARGMAX_SENTINEL) - : static_cast(ARGMIN_SENTINEL); + else { + using DeviceType = device_storage_type_t; + return k == aggregation::ARGMAX ? static_cast(ARGMAX_SENTINEL) + : static_cast(ARGMIN_SENTINEL); + } } return identity_from_operator(); } @@ -625,7 +637,11 @@ struct identity_initializer { std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) { - thrust::fill(rmm::exec_policy(stream), col.begin(), col.end(), get_identity()); + using DeviceType = device_storage_type_t; + thrust::fill(rmm::exec_policy(stream), + col.begin(), + col.end(), + get_identity()); } template diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 69bde7f57fd..c2bd7a4893c 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1085,8 +1085,8 @@ template struct target_type_impl< Source, k, - std::enable_if_t() && not is_chrono() && - not is_fixed_point() && (k == aggregation::MEAN)>> { + std::enable_if_t() and not is_chrono() and + not is_fixed_point() and (k == aggregation::MEAN)>> { using type = double; }; @@ -1113,12 +1113,13 @@ struct target_type_impl< using type = int64_t; }; -// Summing fixed_point numbers, always use the decimal64 accumulator +// Summing fixed_point numbers template -struct target_type_impl() && (k == aggregation::SUM)>> { - using type = numeric::decimal64; +struct target_type_impl< + Source, + k, + std::enable_if_t() && (k == aggregation::SUM)>> { + using type = Source; }; // Summing/Multiplying float/doubles, use same type accumulator diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index bb5cfa5c6e0..fb4c636fcb0 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -217,12 +217,7 @@ struct DeviceType()>> { }; template -struct DeviceType>> { - using type = typename cudf::device_storage_type_t; -}; - -template -struct DeviceType>> { +struct DeviceType()>> { using type = typename cudf::device_storage_type_t; }; diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index deb161fd9c2..3e789299716 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -102,9 +102,8 @@ struct null_replaced_value_accessor { bool has_nulls = true) : col{col}, null_replacement{null_val}, has_nulls{has_nulls} { - CUDF_EXPECTS(type_to_id() == device_storage_type_id(col.type().id()), + CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); - // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); } diff --git 
a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 7524593e5ea..95605dc8a71 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -22,6 +22,7 @@ */ #include +#include #include #include #include @@ -119,7 +120,7 @@ struct DeviceMin { CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { - return cudf::detail::min(lhs, rhs); + return numeric::detail::min(lhs, rhs); } template < @@ -128,14 +129,15 @@ struct DeviceMin { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { - return std::numeric_limits::max(); + if constexpr (cudf::is_chrono()) return T::max(); + return cuda::std::numeric_limits::max(); } template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); - return std::numeric_limits::max(); + return cuda::std::numeric_limits::max(); } // @brief identity specialized for string_view @@ -160,7 +162,7 @@ struct DeviceMax { CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { - return cudf::detail::max(lhs, rhs); + return numeric::detail::max(lhs, rhs); } template < @@ -169,14 +171,15 @@ struct DeviceMax { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { - return std::numeric_limits::lowest(); + if constexpr (cudf::is_chrono()) return T::min(); + return cuda::std::numeric_limits::lowest(); } template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); - return std::numeric_limits::lowest(); + return cuda::std::numeric_limits::lowest(); } template >* = nullptr> diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index ebb21492be9..f3390d9387b 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -274,6 +274,13 @@ MurmurHash3_32::operator()(numeric::decimal64 const& key) co return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute(key.value()); +} + template <> hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(cudf::list_view const& key) const @@ -419,6 +426,13 @@ SparkMurmurHash3_32::operator()(numeric::decimal64 const& ke return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute<__int128_t>(key.value()); +} + template <> hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(cudf::list_view const& key) const diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index dc919433da7..ddedab3944c 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -22,6 +22,8 @@ * @file Utility code involving integer arithmetic */ +#include + #include #include #include @@ -151,17 +153,11 @@ constexpr inline bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) -{ - return std::abs(value); -} -// Unsigned type just returns itself. 
-template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) +constexpr inline auto absolute_value(T value) -> T { + if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; } } // namespace util - } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index af2715d1290..e8223b53997 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include // Note: The versions are used in order for Jitify to work with our fixed_point type. @@ -48,13 +49,15 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; template constexpr inline auto is_supported_representation_type() { - return cuda::std::is_same_v || cuda::std::is_same_v; + return cuda::std::is_same_v || // + cuda::std::is_same_v || // + cuda::std::is_same_v; } template constexpr inline auto is_supported_construction_value_type() { - return cuda::std::is_integral::value || cuda::std::is_floating_point::value; + return cuda::std::is_integral() || cuda::std::is_floating_point::value; } // Helper functions for `fixed_point` type @@ -551,17 +554,18 @@ class fixed_point { explicit operator std::string() const { if (_scale < 0) { - auto const av = std::abs(_value); - int64_t const n = std::pow(10, -_scale); - int64_t const f = av % n; + auto const av = detail::abs(_value); + Rep const n = detail::exp10(-_scale); + Rep const f = av % n; auto const num_zeros = - std::max(0, (-_scale - static_cast(std::to_string(f).size()))); + std::max(0, (-_scale - static_cast(detail::to_string(f).size()))); auto const zeros = std::string(num_zeros, '0'); auto const sign = _value < 0 ? std::string("-") : std::string(); - return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n); + return sign + detail::to_string(av / n) + std::string(".") + zeros + + detail::to_string(av % n); } else { auto const zeros = std::string(_scale, '0'); - return std::to_string(_value) + zeros; + return detail::to_string(_value) + zeros; } } }; @@ -752,8 +756,9 @@ CUDA_HOST_DEVICE_CALLABLE bool operator>(fixed_point const& lhs, return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } -using decimal32 = fixed_point; -using decimal64 = fixed_point; +using decimal32 = fixed_point; +using decimal64 = fixed_point; +using decimal128 = fixed_point<__int128_t, Radix::BASE_10>; /** @} */ // end of group } // namespace numeric diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp new file mode 100644 index 00000000000..2b50e273517 --- /dev/null +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// Note: The versions are used in order for Jitify to work with our fixed_point type. 
+// Jitify is needed for several algorithms (binaryop, rolling, etc) +#include <cuda/std/limits> +#include <cuda/std/type_traits> + +#include <algorithm> +#include <string> + +namespace numeric { +namespace detail { + +template <typename T> +auto to_string(T value) -> std::string +{ + if constexpr (cuda::std::is_same_v<T, __int128_t>) { + auto s = std::string{}; + auto const sign = value < 0; + if (sign) { + value += 1; // avoid overflowing if value == __int128_t lowest + value *= -1; + if (value == cuda::std::numeric_limits<__int128_t>::max()) + return "-170141183460469231731687303715884105728"; + value += 1; // can add back the one, no need to avoid overflow anymore + } + while (value) { + s.push_back("0123456789"[value % 10]); + value /= 10; + } + if (sign) s.push_back('-'); + std::reverse(s.begin(), s.end()); + return s; + } else { + return std::to_string(value); + } + return std::string{}; // won't ever hit here, need to suppress warning though +} + +template <typename T> +constexpr auto abs(T value) +{ + return value >= 0 ? value : -value; +} + +template <typename T> +CUDA_HOST_DEVICE_CALLABLE auto min(T lhs, T rhs) +{ + return lhs < rhs ? lhs : rhs; +} + +template <typename T> +CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs) +{ + return lhs > rhs ? lhs : rhs; +} + +template <typename BaseType> +constexpr auto exp10(int32_t exponent) +{ + BaseType value = 1; + while (exponent > 0) + value *= 10, --exponent; + return value; +} + +} // namespace detail +} // namespace numeric diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 2a95b85465b..fb1199fc166 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -70,6 +70,9 @@ class orc_reader_options { // Columns that should be converted from Decimal to Float64 std::vector<std::string> _decimal_cols_as_float; + // Columns that should be read as Decimal128 + std::vector<std::string> _decimal128_columns; + friend orc_reader_options_builder; /** @@ -136,13 +139,18 @@ class orc_reader_options { data_type get_timestamp_type() const { return _timestamp_type; } /** - * @brief Columns that should be converted from Decimal to Float64. + * @brief Fully qualified names of columns that should be converted from Decimal to Float64. */ std::vector<std::string> const& get_decimal_cols_as_float() const { return _decimal_cols_as_float; } + /** + * @brief Fully qualified names of columns that should be read as 128-bit Decimal. + */ + std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; } + // Setters /** @@ -210,12 +218,22 @@ class orc_reader_options { /** * @brief Set columns that should be converted from Decimal to Float64 * - * @param val Vector of column names. + * @param val Vector of fully qualified column names. */ void set_decimal_cols_as_float(std::vector<std::string> val) { _decimal_cols_as_float = std::move(val); } + + /** + * @brief Set columns that should be read as 128-bit Decimal + * + * @param val Vector of fully qualified column names. + */ + void set_decimal128_columns(std::vector<std::string> val) + { + _decimal128_columns = std::move(val); + } }; class orc_reader_options_builder { @@ -332,6 +350,18 @@ class orc_reader_options_builder { return *this; } + /** + * @brief Columns that should be read as 128-bit Decimal + * + * @param val Vector of column names. + * @return this for chaining. + */ + orc_reader_options_builder& decimal128_columns(std::vector<std::string> val) + { + options._decimal128_columns = std::move(val); + return *this; + } + /** * @brief move orc_reader_options member once it's built.
*/ diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index aa98c2e6404..dc2df368bae 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -359,7 +359,7 @@ class fixed_point_scalar : public scalar { rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** - * @brief Get the decimal32 or decimal64. + * @brief Get the decimal32, decimal64 or decimal128. * * @param stream CUDA stream used for device memory operations. */ diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 56205c161b1..aa3f544202f 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -17,7 +17,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -46,7 +46,7 @@ __device__ inline thrust::pair parse_integer( constexpr UnsignedDecimalType decimal_max = (std::numeric_limits::max() - 9L) / 10L; - uint64_t value = 0; // for checking overflow + __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; bool decimal_found = false; @@ -137,7 +137,7 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int // if string begins with a sign, continue with next character if (sign != 0) ++iter; - using UnsignedDecimalType = std::make_unsigned_t; + using UnsignedDecimalType = cuda::std::make_unsigned_t; auto [value, exp_offset] = parse_integer(iter, iter_end); if (value == 0) { return DecimalType{0}; } @@ -150,11 +150,9 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int exp_ten += exp_offset; // shift the output value based on the exp_ten and the scale values - if (exp_ten < scale) { - value = value / static_cast(exp10(static_cast(scale - exp_ten))); - } else { - value = value * static_cast(exp10(static_cast(exp_ten - scale))); - } + value = exp_ten < scale + ? value / static_cast(exp10(static_cast(scale - exp_ten))) + : value * static_cast(exp10(static_cast(exp_ten - scale))); return static_cast(value) * (sign == 0 ? 1 : sign); } diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index e026ae9ac0f..13d5f8e06bc 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -236,6 +236,7 @@ enum class type_id : int32_t { LIST, ///< List elements DECIMAL32, ///< Fixed-point type with int32_t DECIMAL64, ///< Fixed-point type with int64_t + DECIMAL128, ///< Fixed-point type with __int128_t STRUCT, ///< Struct elements // `NUM_TYPE_IDS` must be last! NUM_TYPE_IDS ///< Total number of type ids @@ -271,7 +272,7 @@ class data_type { */ explicit data_type(type_id id, int32_t scale) : _id{id}, _fixed_point_scale{scale} { - assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64); + assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128); } /** diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh new file mode 100644 index 00000000000..43587ffa583 --- /dev/null +++ b/cpp/include/cudf/utilities/traits.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <cudf/types.hpp> +#include <cudf/utilities/type_dispatcher.hpp> + +#include <cuda/atomic> + +namespace cudf { + +/** + * @addtogroup utility_types + * @{ + * @file + */ + +/** + * @brief Indicates whether the type `T` has support for atomics + * + * @tparam T The type to verify + * @return true `T` has support for atomics + * @return false `T` has no support for atomics + */ +template <typename T> +constexpr inline bool has_atomic_support() +{ + return cuda::std::atomic<T>::is_always_lock_free; +} + +struct has_atomic_support_impl { + template <typename T> + constexpr bool operator()() + { + return has_atomic_support<T>(); + } +}; + +/** + * @brief Indicates whether `type` has support for atomics + * + * @param type The `data_type` to verify + * @return true `type` has support for atomics + * @return false `type` has no support for atomics + */ +constexpr inline bool has_atomic_support(data_type type) +{ + return cudf::type_dispatcher(type, has_atomic_support_impl{}); +} + +/** @} */ + +} // namespace cudf diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 40a833112e1..d1bd3049ba3 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -177,7 +177,7 @@ inline bool is_equality_comparable(data_type type) template <typename T> constexpr inline bool is_numeric() { - return std::is_integral<T>::value or std::is_floating_point<T>::value; + return cuda::std::is_integral<T>() or std::is_floating_point<T>::value; } struct is_numeric_impl { @@ -404,7 +404,8 @@ constexpr inline bool is_timestamp(data_type type) template <typename T> constexpr inline bool is_fixed_point() { - return std::is_same_v<numeric::decimal32, T> || std::is_same_v<numeric::decimal64, T>; + return std::is_same_v<numeric::decimal32, T> || std::is_same_v<numeric::decimal64, T> || + std::is_same_v<numeric::decimal128, T>; } struct is_fixed_point_impl { diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 857ddafa82c..a04b8309142 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -85,8 +85,9 @@ using id_to_type = typename id_to_type_impl<Id>::type; /** * @brief "Returns" the corresponding type that is stored on the device when using `cudf::column` * - * For `decimal32`, the storage type is an `int32_t`. - * For `decimal64`, the storage type is an `int64_t`. + * For `decimal32`, the storage type is an `int32_t`. + * For `decimal64`, the storage type is an `int64_t`. + * For `decimal128`, the storage type is an `__int128_t`.
* * Use this "type function" with the `using` type alias: * @code @@ -98,25 +99,11 @@ using id_to_type = typename id_to_type_impl::type; // clang-format off template using device_storage_type_t = - std::conditional_t, int32_t, - std::conditional_t, int64_t, T>>; + std::conditional_t, int32_t, + std::conditional_t, int64_t, + std::conditional_t, __int128_t, T>>>; // clang-format on -/** - * @brief Returns the corresponding `type_id` of type stored on device for a given `type_id` - * - * @param id The given `type_id` - * @return Corresponding `type_id` of type stored on device - */ -inline type_id device_storage_type_id(type_id id) -{ - switch (id) { - case type_id::DECIMAL32: return type_id::INT32; - case type_id::DECIMAL64: return type_id::INT64; - default: return id; - } -} - /** * @brief Checks if `fixed_point`-like types have template type `T` matching the column's * stored type id @@ -127,10 +114,24 @@ inline type_id device_storage_type_id(type_id id) * @return `false` If T does not match the stored column `type_id` */ template -bool type_id_matches_device_storage_type(type_id id) +constexpr bool type_id_matches_device_storage_type(type_id id) { return (id == type_id::DECIMAL32 && std::is_same_v) || - (id == type_id::DECIMAL64 && std::is_same_v) || id == type_to_id(); + (id == type_id::DECIMAL64 && std::is_same_v) || + (id == type_id::DECIMAL128 && std::is_same_v) || id == type_to_id(); +} + +/** + * @brief Checks if `id` is fixed_point (DECIMAL32/64/128) + * + * @return `true` if `id` is `DECIMAL32`, `DECIMAL64` or `DECIMAL128` + * @return `false` otherwise + */ +constexpr bool is_fixed_point(cudf::type_id id) +{ + return id == type_id::DECIMAL32 or // + id == type_id::DECIMAL64 or // + id == type_id::DECIMAL128; } /** @@ -188,6 +189,7 @@ CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32); CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST); CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32); CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64); +CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128); CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT); /** @@ -221,6 +223,7 @@ MAP_NUMERIC_SCALAR(int8_t) MAP_NUMERIC_SCALAR(int16_t) MAP_NUMERIC_SCALAR(int32_t) MAP_NUMERIC_SCALAR(int64_t) +MAP_NUMERIC_SCALAR(__int128_t) MAP_NUMERIC_SCALAR(uint8_t) MAP_NUMERIC_SCALAR(uint16_t) MAP_NUMERIC_SCALAR(uint32_t) @@ -253,6 +256,12 @@ struct type_to_scalar_type_impl { using ScalarDeviceType = cudf::fixed_point_scalar_device_view; }; +template <> +struct type_to_scalar_type_impl { + using ScalarType = cudf::fixed_point_scalar; + using ScalarDeviceType = cudf::fixed_point_scalar_device_view; +}; + template <> // TODO: this is a temporary solution for make_pair_iterator struct type_to_scalar_type_impl { using ScalarType = cudf::numeric_scalar; @@ -492,6 +501,9 @@ CUDF_HDFI constexpr decltype(auto) type_dispatcher(cudf::data_type dtype, Functo case type_id::DECIMAL64: return f.template operator()::type>( std::forward(args)...); + case type_id::DECIMAL128: + return f.template operator()::type>( + std::forward(args)...); case type_id::STRUCT: return f.template operator()::type>( std::forward(args)...); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 4bc48769592..f291b04776a 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -509,11 +509,10 @@ class fixed_point_column_wrapper : public detail::column_wrapper { { CUDF_EXPECTS(numeric::is_supported_representation_type(), "not 
valid representation type"); - auto const size = cudf::distance(begin, end); - auto const elements = thrust::host_vector(begin, end); - auto const is_decimal32 = std::is_same_v; - auto const id = is_decimal32 ? type_id::DECIMAL32 : type_id::DECIMAL64; - auto const data_type = cudf::data_type{id, static_cast(scale)}; + auto const size = cudf::distance(begin, end); + auto const elements = thrust::host_vector(begin, end); + auto const id = type_to_id>(); + auto const data_type = cudf::data_type{id, static_cast(scale)}; wrapped.reset(new cudf::column{ data_type, @@ -574,11 +573,10 @@ class fixed_point_column_wrapper : public detail::column_wrapper { { CUDF_EXPECTS(numeric::is_supported_representation_type(), "not valid representation type"); - auto const size = cudf::distance(begin, end); - auto const elements = thrust::host_vector(begin, end); - auto const is_decimal32 = std::is_same_v; - auto const id = is_decimal32 ? type_id::DECIMAL32 : type_id::DECIMAL64; - auto const data_type = cudf::data_type{id, static_cast(scale)}; + auto const size = cudf::distance(begin, end); + auto const elements = thrust::host_vector(begin, end); + auto const id = type_to_id>(); + auto const data_type = cudf::data_type{id, static_cast(scale)}; wrapped.reset(new cudf::column{ data_type, diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 2a02caa0326..097d072a5b4 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -264,7 +264,8 @@ using ListTypes = cudf::test::Types; * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::FixedPointTypes); * ``` */ -using FixedPointTypes = cudf::test::Types; +using FixedPointTypes = + cudf::test::Types; /** * @brief Provides a list of all fixed-width element types for use in GTest diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 31bf9d65d56..c00b3d6db85 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -759,9 +759,9 @@ struct target_type_functor { template constexpr data_type operator()() const noexcept { - auto const id = type_to_id>(); - return id == type_id::DECIMAL32 || id == type_id::DECIMAL64 ? data_type{id, type.scale()} - : data_type{id}; + using Type = target_type_t; + auto const id = type_to_id(); + return cudf::is_fixed_point() ? 
data_type{id, type.scale()} : data_type{id}; } }; diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index b9193345c94..02998b84ffd 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -27,9 +27,8 @@ void initialize_with_identity(mutable_table_view& table, // TODO: Initialize all the columns in a single kernel instead of invoking one // kernel per column for (size_type i = 0; i < table.num_columns(); ++i) { - auto col = table.column(i); - auto const type = data_type{device_storage_type_id(col.type().id())}; - dispatch_type_and_aggregation(type, aggs[i], identity_initializer{}, col, stream); + auto col = table.column(i); + dispatch_type_and_aggregation(col.type(), aggs[i], identity_initializer{}, col, stream); } } diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index b9ed95daf1b..e84e175eaca 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -266,6 +266,22 @@ void binary_operation(mutable_column_view& out, // Compiled Binary operation namespace compiled { + +template +void fixed_point_binary_operation_validation(binary_operator op, + Lhs lhs, + Rhs rhs, + thrust::optional output_type = {}) +{ + CUDF_EXPECTS((is_fixed_point(lhs) or is_fixed_point(rhs)), + "One of the inputs must have fixed_point data_type."); + CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op), + "Unsupported fixed_point binary operation"); + if (output_type.has_value() and binops::is_comparison_binop(op)) + CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8}, + "Comparison operations require boolean output type."); +} + /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, * binary_operator, data_type, rmm::mr::device_memory_resource*) @@ -291,6 +307,11 @@ std::unique_ptr binary_operation(LhsType const& lhs, if (not cudf::binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op)) CUDF_FAIL("Unsupported operator for these types"); + if (cudf::is_fixed_point(lhs.type()) or cudf::is_fixed_point(rhs.type())) { + cudf::binops::compiled::fixed_point_binary_operation_validation( + op, lhs.type(), rhs.type(), output_type); + } + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if constexpr (std::is_same_v) @@ -398,235 +419,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh } }; -template -void fixed_point_binary_operation_validation(binary_operator op, - Lhs lhs, - Rhs rhs, - thrust::optional output_type = {}) -{ - CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type."); - CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type."); - CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op), - "Unsupported fixed_point binary operation"); - CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch"); - if (output_type.has_value()) { - if (binops::is_comparison_binop(op)) - CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8}, - "Comparison operations require boolean output type."); - else - CUDF_EXPECTS(is_fixed_point(output_type.value()), - "fixed_point binary operations require fixed_point output type."); - } -} - namespace jit { -/** - * @brief Function to compute binary operation of one `column_view` and one `scalar` - * - * @param lhs Left-hand side `scalar` used in the binary operation - * @param rhs Right-hand side `column_view` used in the binary operation - * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * 
@param stream CUDA stream used for device memory operations - * @param mr Device memory resource to use for device memory allocation - * @return std::unique_ptr Resulting output column from the binary operation - */ -std::unique_ptr fixed_point_binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - cudf::data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using namespace numeric; - - fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type); - - if (rhs.is_empty()) - return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8} - : cudf::data_type{rhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); - - if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { - // Adjust scalar/column so they have they same scale - if (rhs.type().scale() < lhs.type().scale()) { - auto const diff = lhs.type().scale() - rhs.type().scale(); - if (lhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(lhs).value(stream); - auto const scale = scale_type{rhs.type().scale()}; - auto const scalar = make_fixed_point_scalar(val * factor, scale); - binops::jit::binary_operation(out_view, *scalar, rhs, op, stream); - } else { - CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(lhs).value(stream); - auto const scale = scale_type{rhs.type().scale()}; - auto const scalar = make_fixed_point_scalar(val * factor, scale); - binops::jit::binary_operation(out_view, *scalar, rhs, op, stream); - } - } else { - auto const diff = rhs.type().scale() - lhs.type().scale(); - auto const result = [&] { - if (lhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); - } else { - CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); - } - }(); - binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); - } - } else { - binops::jit::binary_operation(out_view, lhs, rhs, op, stream); - } - return output_type.scale() != scale ? 
cudf::cast(out_view, output_type) : std::move(out); -} - -/** - * @brief Function to compute binary operation of one `column_view` and one `scalar` - * - * @param lhs Left-hand side `column_view` used in the binary operation - * @param rhs Right-hand side `scalar` used in the binary operation - * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param stream CUDA stream used for device memory operations - * @param mr Device memory resource to use for device memory allocation - * @return std::unique_ptr Resulting output column from the binary operation - */ -std::unique_ptr fixed_point_binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - cudf::data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using namespace numeric; - - fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type); - - if (lhs.is_empty()) - return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8} - : cudf::data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); - - if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { - // Adjust scalar/column so they have they same scale - if (rhs.type().scale() > lhs.type().scale()) { - auto const diff = rhs.type().scale() - lhs.type().scale(); - if (rhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(rhs).value(stream); - auto const scale = scale_type{lhs.type().scale()}; - auto const scalar = make_fixed_point_scalar(val * factor, scale); - binops::jit::binary_operation(out_view, lhs, *scalar, op, stream); - } else { - CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(rhs).value(stream); - auto const scale = scale_type{rhs.type().scale()}; - auto const scalar = make_fixed_point_scalar(val * factor, scale); - binops::jit::binary_operation(out_view, lhs, *scalar, op, stream); - } - } else { - auto const diff = lhs.type().scale() - rhs.type().scale(); - auto const result = [&] { - if (rhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); - } else { - CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); - } - }(); - binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); - } - } else { - binops::jit::binary_operation(out_view, lhs, rhs, op, stream); - } - return output_type.scale() != scale ? 
cudf::cast(out_view, output_type) : std::move(out); -} - -/** - * @brief Function to compute binary operation of two `column_view`s - * - * @param lhs Left-hand side `column_view` used in the binary operation - * @param rhs Right-hand side `column_view` used in the binary operation - * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param stream CUDA stream used for device memory operations - * @param mr Device memory resource to use for device memory allocation - * @return std::unique_ptr Resulting output column from the binary operation - */ -std::unique_ptr fixed_point_binary_operation(column_view const& lhs, - column_view const& rhs, - binary_operator op, - cudf::data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - using namespace numeric; - - fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type); - - if (lhs.is_empty() or rhs.is_empty()) - return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8} - : cudf::data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); - - if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { - if (rhs.type().scale() < lhs.type().scale()) { - auto const diff = lhs.type().scale() - rhs.type().scale(); - auto const result = [&] { - if (lhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); - } else { - CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); - } - }(); - binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); - } else { - auto const diff = rhs.type().scale() - lhs.type().scale(); - auto const result = [&] { - if (lhs.type().id() == type_id::DECIMAL32) { - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); - } else { - CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); - auto const factor = numeric::detail::ipow(diff); - auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); - } - }(); - binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); - } - } else { - binops::jit::binary_operation(out_view, lhs, rhs, op, stream); - } - return output_type.scale() != scale ? 
cudf::cast(out_view, output_type) : std::move(out); -} std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, @@ -639,11 +432,10 @@ std::unique_ptr binary_operation(scalar const& lhs, if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); - // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); + CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); + CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); @@ -667,11 +459,10 @@ std::unique_ptr binary_operation(column_view const& lhs, if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); - // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); + CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); + CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); @@ -697,11 +488,10 @@ std::unique_ptr binary_operation(column_view const& lhs, if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); - // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); + CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); + CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); @@ -827,7 +617,7 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, cudf::data_type const& lhs, cudf::data_type const& rhs) { - cudf::detail::fixed_point_binary_operation_validation(op, lhs, rhs); + cudf::binops::compiled::fixed_point_binary_operation_validation(op, lhs, rhs); auto const scale = binary_operation_fixed_point_scale(op, lhs.scale(), rhs.scale()); return cudf::data_type{lhs.id(), scale}; diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 749a4d7940c..d17c698f91a 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -77,8 +77,7 @@ struct in_place_fill_range_dispatch { auto unscaled = static_cast const&>(value).value(stream); using RepType = typename T::rep; auto s = cudf::numeric_scalar(unscaled, value.is_valid(stream)); - auto view = cudf::bit_cast(destination, s.type()); - in_place_fill(view, begin, end, s, stream); + in_place_fill(destination, begin, end, s, stream); } template @@ -93,13 +92,15 @@ struct 
out_of_place_fill_range_dispatch { cudf::column_view const& input; template - std::enable_if_t(), std::unique_ptr> + std::enable_if_t() and not cudf::is_fixed_point(), + std::unique_ptr> operator()(Args...) { CUDF_FAIL("Unsupported type in fill."); } - template ())> + template () or cudf::is_fixed_point())> std::unique_ptr operator()( cudf::size_type begin, cudf::size_type end, @@ -116,8 +117,9 @@ struct out_of_place_fill_range_dispatch { 0); } - auto ret_view = p_ret->mutable_view(); - in_place_fill(ret_view, begin, end, value, stream); + auto ret_view = p_ret->mutable_view(); + using DeviceType = cudf::device_storage_type_t; + in_place_fill(ret_view, begin, end, value, stream); } return p_ret; diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 6a9eaf0af90..e35fa36a289 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,8 @@ #include #include +#include + namespace cudf { namespace groupby { namespace detail { @@ -634,9 +637,10 @@ bool can_use_hash_groupby(table_view const& keys, host_spankind); - }); + return cudf::has_atomic_support(r.values.type()) and + std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { + return is_hash_aggregation(a->kind); + }); }); // Currently, structs are not supported in any of hash-based aggregations. diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index b5ed43558d2..c655d18a4df 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -196,6 +196,14 @@ __inline__ __device__ numeric::decimal64 decode_value(const char*, return numeric::decimal64{}; } +template <> +__inline__ __device__ numeric::decimal128 decode_value(const char*, + const char*, + parse_options_view const&) +{ + return numeric::decimal128{}; +} + /** * @brief Functor for converting plain text data to cuDF data type value. */ diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 356d20843e8..5132906a5fc 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -86,7 +86,7 @@ class aggregate_orc_metadata { /** * @brief Returns the name of the given column from the given source. */ - auto column_name(const int source_idx, const int column_id) const + std::string const& column_name(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); @@ -98,7 +98,7 @@ class aggregate_orc_metadata { * * Full name includes ancestor columns' names. 
*/ - auto column_path(const int source_idx, const int column_id) const + std::string const& column_path(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 89eac0c9901..44cea6169e4 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -18,6 +18,8 @@ #include "orc_field_reader.hpp" #include "orc_field_writer.hpp" +#include + #include #include @@ -472,10 +474,16 @@ void metadata::init_column_names() thrust::tabulate(column_names.begin(), column_names.end(), [&](auto col_id) { if (not column_has_parent(col_id)) return std::string{}; auto const& parent_field_names = ff.types[parent_id(col_id)].fieldNames; - // Child columns of lists don't have a name in ORC files, generate placeholder in that case - return field_index(col_id) < static_cast(parent_field_names.size()) - ? parent_field_names[field_index(col_id)] - : std::to_string(col_id); + if (field_index(col_id) < static_cast(parent_field_names.size())) { + return parent_field_names[field_index(col_id)]; + } + + // Generate names for list and map child columns + if (ff.types[parent_id(col_id)].subtypes.size() == 1) { + return std::to_string(lists_column_view::child_column_index); + } else { + return std::to_string(field_index(col_id)); + } }); column_paths.resize(get_num_columns()); diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index f6a7c3f5f03..ad4450bc6a7 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -84,11 +84,6 @@ struct DictionaryEntry { uint32_t len; // Length in data stream }; -/** - * @brief Mask to indicate conversion from decimals to float64 - */ -constexpr int orc_decimal2float64_scale = 0x80; - /** * @brief Struct to describe per stripe's column information */ @@ -111,6 +106,7 @@ struct ColumnDesc { ColumnEncodingKind encoding_kind; // column encoding kind TypeKind type_kind; // column data type uint8_t dtype_len; // data type length (for types that can be mapped to different sizes) + type_id dtype_id; // TODO int32_t decimal_scale; // number of fractional decimal digits for decimal type type_id timestamp_type_id; // output timestamp type id (type_id::EMPTY by default) column_validity_info parent_validity_info; // consists of parent column valid_map and null count diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 31d4184993f..798cdca178a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -57,7 +57,7 @@ namespace { constexpr type_id to_type_id(const orc::SchemaType& schema, bool use_np_dtypes, type_id timestamp_type_id, - bool decimals_as_float64) + type_id decimal_type_id) { switch (schema.kind) { case orc::BOOLEAN: return type_id::BOOL8; @@ -79,7 +79,7 @@ constexpr type_id to_type_id(const orc::SchemaType& schema, case orc::DATE: // There isn't a (DAYS -> np.dtype) mapping return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL64; + case orc::DECIMAL: return decimal_type_id; // Need to update once cuDF plans to support map type case orc::MAP: case orc::LIST: return type_id::LIST; @@ -228,15 +228,26 @@ size_t gather_stream_info(const size_t stripe_index, } /** - * @brief Determines if a column should be converted from decimal to float + * @brief Determines cuDF type of an ORC Decimal column. 
*/ -bool should_convert_decimal_column_to_float(const std::vector& columns_to_convert, - cudf::io::orc::metadata& metadata, - int column_index) +auto decimal_column_type(const std::vector& float64_columns, + const std::vector& decimal128_columns, + cudf::io::orc::metadata& metadata, + int column_index) { - return (std::find(columns_to_convert.begin(), - columns_to_convert.end(), - metadata.column_name(column_index)) != columns_to_convert.end()); + auto const& column_path = metadata.column_path(column_index); + auto is_column_in = [&](const std::vector& cols) { + return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); + }; + + auto const user_selected_float64 = is_column_in(float64_columns); + auto const user_selected_decimal128 = is_column_in(decimal128_columns); + CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, + "Both decimal128 and float64 types selected for column " + column_path); + + if (user_selected_float64) return type_id::FLOAT64; + if (user_selected_decimal128) return type_id::DECIMAL128; + return type_id::DECIMAL64; } } // namespace @@ -728,12 +739,12 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co rmm::cuda_stream_view stream) { schema_info.name = _metadata.column_name(0, orc_col_id); - // If the column type is orc::DECIMAL see if the user - // desires it to be converted to float64 or not - auto const decimal_as_float64 = should_convert_decimal_column_to_float( - _decimal_cols_as_float, _metadata.per_file_metadata[0], orc_col_id); - auto const type = to_type_id( - _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); + auto const type = to_type_id( + _metadata.get_schema(orc_col_id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type( + _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -784,7 +795,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co break; case orc::DECIMAL: - if (type == type_id::DECIMAL64) { + if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(_metadata.get_types()[orc_col_id].scale.value_or(0)); } out_col = make_empty_column(data_type(type, scale)); @@ -875,8 +886,9 @@ reader::impl::impl(std::vector>&& sources, // Enable or disable the conversion to numpy-compatible dtypes _use_np_dtypes = options.is_enabled_use_np_dtypes(); - // Control decimals conversion (float64 or int64 with optional scale) + // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); + decimal128_columns = options.get_decimal128_columns(); } timezone_table reader::impl::compute_timezone_table( @@ -936,18 +948,18 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Get a list of column data types std::vector column_types; for (auto& col : columns_level) { - // If the column type is orc::DECIMAL see if the user - // desires it to be converted to float64 or not - auto const decimal_as_float64 = should_convert_decimal_column_to_float( - _decimal_cols_as_float, _metadata.per_file_metadata[0], col.id); auto col_type = to_type_id( - _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); + _metadata.get_col_type(col.id), + _use_np_dtypes, + _timestamp_type.id(), + decimal_column_type( + _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, 
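// Minimal host-side sketch, illustrative and not part of the patch, of the
// precedence implemented by decimal_column_type() above: an explicit float64
// request wins, then an explicit decimal128 request, and DECIMAL64 remains the
// default; requesting both for the same column is rejected.
cudf::type_id pick_decimal_type(bool as_float64, bool as_decimal128)
{
  CUDF_EXPECTS(not(as_float64 and as_decimal128),
               "Both decimal128 and float64 types selected for column");
  if (as_float64) return cudf::type_id::FLOAT64;
  if (as_decimal128) return cudf::type_id::DECIMAL128;
  return cudf::type_id::DECIMAL64;
}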
"Unknown type"); - // Remove this once we support Decimal128 data type CUDF_EXPECTS( (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18), - "Decimal data has precision > 18, Decimal64 data type doesn't support it."); - if (col_type == type_id::DECIMAL64) { + "Precision of column " + std::string{_metadata.column_name(0, col.id)} + + " is over 18, use 128-bit Decimal."); + if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries // follow positive scaling. @@ -1114,13 +1126,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, .kind; // num_child_rows for a struct column will be same, for other nested types it will be // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - auto const decimal_as_float64 = should_convert_decimal_column_to_float( - _decimal_cols_as_float, _metadata.per_file_metadata[0], columns_level[col_idx].id); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] .ff.types[columns_level[col_idx].id] - .scale.value_or(0) | - (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0); + .scale.value_or(0); chunk.rowgroup_id = rowgroup_id; chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index c9de2211d48..64e7cbc74e5 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -222,6 +222,7 @@ class reader::impl { bool _use_index = true; bool _use_np_dtypes = true; std::vector _decimal_cols_as_float; + std::vector decimal128_columns; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta; }; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index bcbe77d9df8..44f106c4f5c 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -45,11 +45,6 @@ inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding static __device__ __constant__ int64_t kORCTimeToUTC = 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 -struct int128_s { - uint64_t lo; - int64_t hi; -}; - struct orc_bytestream_s { const uint8_t* base; uint32_t pos; @@ -127,12 +122,14 @@ struct orcdec_state_s { orc_rowdec_state_s rowdec; } u; union values { - uint8_t u8[block_size * 8]; - uint32_t u32[block_size * 2]; - int32_t i32[block_size * 2]; - uint64_t u64[block_size]; - int64_t i64[block_size]; - double f64[block_size]; + uint8_t u8[block_size * 16]; + uint32_t u32[block_size * 4]; + int32_t i32[block_size * 4]; + uint64_t u64[block_size * 2]; + int64_t i64[block_size * 2]; + double f64[block_size * 2]; + __int128_t i128[block_size]; + __uint128_t u128[block_size]; } vals; }; @@ -451,29 +448,18 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ int128_s decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) { - uint32_t b = bytestream_readbyte(bs, pos++); - int64_t sign_mask = -(int32_t)(b & 1); - 
uint64_t v = (b >> 1) & 0x3f; - uint32_t bitpos = 6; - uint64_t lo = v; - uint64_t hi = 0; - while (b > 0x7f && bitpos < 128) { - b = bytestream_readbyte(bs, pos++); - v |= ((uint64_t)(b & 0x7f)) << (bitpos & 0x3f); - if (bitpos == 62) { // 6 + 7 * 8 = 62 - lo = v; - v = (b & 0x7f) >> 2; // 64 - 62 - } + auto byte = bytestream_readbyte(bs, pos++); + __int128_t const sign_mask = -(int32_t)(byte & 1); + __int128_t value = (byte >> 1) & 0x3f; + uint32_t bitpos = 6; + while (byte & 0x80 && bitpos < 128) { + byte = bytestream_readbyte(bs, pos++); + value |= ((__uint128_t)(byte & 0x7f)) << bitpos; bitpos += 7; } - if (bitpos >= 64) { - hi = v; - } else { - lo = v; - } - return {(uint64_t)(lo ^ sign_mask), (int64_t)(hi ^ sign_mask)}; + return value ^ sign_mask; } /** @@ -1031,6 +1017,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, volatile orcdec_state_s::values& vals, int val_scale, int numvals, + type_id dtype_id, int col_scale, int t) { @@ -1046,8 +1033,8 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, uint32_t pos = lastpos; pos += varint_length(bs, pos); if (pos > maxpos) break; - vals.i64[n] = lastpos; - lastpos = pos; + vals.i64[2 * n] = lastpos; + lastpos = pos; } scratch->num_vals = n; bytestream_flush_bytes(bs, lastpos - bs->pos); @@ -1055,43 +1042,36 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, __syncthreads(); uint32_t num_vals_to_read = scratch->num_vals; if (t >= num_vals_read and t < num_vals_to_read) { - auto const pos = static_cast(vals.i64[t]); - int128_s v = decode_varint128(bs, pos); + auto const pos = static_cast(vals.i64[2 * t]); + __int128_t v = decode_varint128(bs, pos); - if (col_scale & orc_decimal2float64_scale) { - double f = Int128ToDouble_rn(v.lo, v.hi); + if (dtype_id == type_id::FLOAT64) { + double f = v; int32_t scale = (t < numvals) ? val_scale : 0; if (scale >= 0) vals.f64[t] = f / kPow10[min(scale, 39)]; else vals.f64[t] = f * kPow10[min(-scale, 39)]; } else { - // Since cuDF column stores just one scale, value needs to - // be adjusted to col_scale from val_scale. So the difference - // of them will be used to add 0s or remove digits. - int32_t scale = (t < numvals) ? col_scale - val_scale : 0; - if (scale >= 0) { - scale = min(scale, 27); - vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale; - } else // if (scale < 0) - { - bool is_negative = (v.hi < 0); - uint64_t hi = v.hi, lo = v.lo; - scale = min(-scale, 27); - if (is_negative) { - hi = (~hi) + (lo == 0); - lo = (~lo) + 1; + auto const scaled_value = [&]() { + // Since cuDF column stores just one scale, value needs to be adjusted to col_scale from + // val_scale. So the difference of them will be used to add 0s or remove digits. + int32_t scale = (t < numvals) ? col_scale - val_scale : 0; + if (scale >= 0) { + scale = min(scale, 27); + return (v * kPow5i[scale]) << scale; + } else // if (scale < 0) + { + scale = min(-scale, 27); + return (v / kPow5i[scale]) >> scale; } - lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale)); - hi >>= (int32_t)scale; - if (hi != 0) { - // Use intermediate float - lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale])); - hi = 0; - } else { - lo /= kPow5i[scale]; + }(); + if (dtype_id == type_id::DECIMAL64) { + vals.i64[t] = scaled_value; + } else { + { + vals.i128[t] = scaled_value; } - vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo; } } } @@ -1653,8 +1633,14 @@ __global__ void __launch_bounds__(block_size) } val_scale = (t < numvals) ? 
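// Arithmetic behind the scaled_value lambda above (values illustrative):
// multiplying or dividing by 10^n is factored into 5^n (kPow5i[n]) and 2^n (a
// shift). With col_scale - val_scale == 2, the decoded value 123 becomes
// (123 * 5 * 5) << 2 == 12300; with a difference of -2, 12300 becomes
// (12300 / 25) >> 2 == 123.
__int128_t const v  = 123;
__int128_t const up = (v * 25) << 2;   // v * 10^2  == 12300
__int128_t const dn = (up / 25) >> 2;  // up / 10^2 == 123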
(int)s->vals.i64[skip + t] : 0; __syncthreads(); - numvals = Decode_Decimals( - &s->bs, &s->u.rle8, s->vals, val_scale, numvals, s->chunk.decimal_scale, t); + numvals = Decode_Decimals(&s->bs, + &s->u.rle8, + s->vals, + val_scale, + numvals, + s->chunk.dtype_id, + s->chunk.decimal_scale, + t); } __syncthreads(); } else if (s->chunk.type_kind == FLOAT) { @@ -1720,9 +1706,15 @@ __global__ void __launch_bounds__(block_size) case FLOAT: case INT: static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; break; case DOUBLE: - case LONG: + case LONG: static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; case DECIMAL: - static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; + if (s->chunk.dtype_id == type_id::FLOAT64 or + s->chunk.dtype_id == type_id::DECIMAL64) { + static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; + } else { + // decimal128 + static_cast<__uint128_t*>(data_out)[row] = s->vals.u128[t + vals_skipped]; + } break; case MAP: case LIST: { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ff7b642be0e..217aee8756e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -117,6 +117,12 @@ static inline __device__ uint64_t zigzag(int64_t v) return ((v ^ -s) * 2) + s; } +static inline __device__ __uint128_t zigzag(__int128_t v) +{ + int64_t s = (v < 0) ? 1 : 0; + return ((v ^ -s) * 2) + s; +} + static inline __device__ uint32_t CountLeadingBytes32(uint32_t v) { return __clz(v) >> 3; } static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clzll(v) >> 3; } @@ -279,11 +285,11 @@ static const __device__ __constant__ uint8_t kByteLengthToRLEv2_W[9] = { /** * @brief Encode a varint value, return the number of bytes written */ -static inline __device__ uint32_t StoreVarint(uint8_t* dst, uint64_t v) +static inline __device__ uint32_t StoreVarint(uint8_t* dst, __uint128_t v) { uint32_t bytecnt = 0; for (;;) { - uint32_t c = (uint32_t)(v & 0x7f); + auto c = static_cast(v & 0x7f); v >>= 7u; if (v == 0) { dst[bytecnt++] = c; @@ -938,9 +944,11 @@ __global__ void __launch_bounds__(block_size) break; case DECIMAL: { if (is_value_valid) { - uint64_t const zz_val = (column.type().id() == type_id::DECIMAL32) - ? zigzag(column.element(row)) - : zigzag(column.element(row)); + auto const id = column.type().id(); + __uint128_t const zz_val = + id == type_id::DECIMAL32 ? zigzag(column.element(row)) + : id == type_id::DECIMAL64 ? zigzag(column.element(row)) + : zigzag(column.element<__int128_t>(row)); auto const offset = (row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1]; StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1e580e360ca..2bf020d08a2 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -43,6 +43,8 @@ #include #include +#include + namespace cudf { namespace io { namespace detail { @@ -97,7 +99,8 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) case cudf::type_id::TIMESTAMP_NANOSECONDS: return TypeKind::TIMESTAMP; case cudf::type_id::STRING: return TypeKind::STRING; case cudf::type_id::DECIMAL32: - case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL; + case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: return TypeKind::DECIMAL; case cudf::type_id::LIST: return list_column_as_map ? 
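// The zigzag mapping used above for the DECIMAL DATA stream, now widened to 128
// bits: non-negative n encodes as 2n and negative n as -2n - 1, so small
// magnitudes of either sign stay small and the varint stays short. Standalone
// restatement of the same formula, for illustration:
__uint128_t zigzag128(__int128_t v)
{
  __int128_t const s = (v < 0) ? 1 : 0;
  return ((v ^ -s) * 2) + s;
}
// zigzag128(0) == 0, zigzag128(-1) == 1, zigzag128(1) == 2, zigzag128(-2) == 3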
TypeKind::MAP : TypeKind::LIST; case cudf::type_id::STRUCT: return TypeKind::STRUCT; default: return TypeKind::INVALID_TYPE_KIND; @@ -123,9 +126,11 @@ constexpr int32_t to_clockscale(cudf::type_id timestamp_id) */ constexpr auto orc_precision(cudf::type_id decimal_id) { + using namespace numeric; switch (decimal_id) { - case cudf::type_id::DECIMAL32: return 9; - case cudf::type_id::DECIMAL64: return 18; + case cudf::type_id::DECIMAL32: return cuda::std::numeric_limits::digits10; + case cudf::type_id::DECIMAL64: return cuda::std::numeric_limits::digits10; + case cudf::type_id::DECIMAL128: return cuda::std::numeric_limits::digits10; default: return 0; } } @@ -1637,13 +1642,17 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, if (!parent_index.has_value()) return nullptr; return d_cols[parent_index.value()].pushdown_mask; }(); + if (col.is_null(idx) or not bit_value_or(pushdown_mask, idx, true)) return 0u; - int64_t const element = (col.type().id() == type_id::DECIMAL32) - ? col.element(idx) - : col.element(idx); - int64_t const sign = (element < 0) ? 1 : 0; - uint64_t zigzaged_value = ((element ^ -sign) * 2) + sign; + + __int128_t const element = + col.type().id() == type_id::DECIMAL32 ? col.element(idx) + : col.type().id() == type_id::DECIMAL64 ? col.element(idx) + : col.element<__int128_t>(idx); + + __int128_t const sign = (element < 0) ? 1 : 0; + __uint128_t zigzaged_value = ((element ^ -sign) * 2) + sign; uint32_t encoded_length = 1; while (zigzaged_value > 127) { @@ -1767,7 +1776,7 @@ void writer::impl::write(table_view const& table) [&](column_in_metadata& col_meta, std::string default_name) { if (col_meta.get_name().empty()) col_meta.set_name(default_name); for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "." 
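// The precisions returned by orc_precision() above are not hand-coded constants:
// digits10 of the storage type is the number of decimal digits that are always
// representable, which is exactly what ORC precision means here.
static_assert(cuda::std::numeric_limits<int32_t>::digits10 == 9);      // decimal32
static_assert(cuda::std::numeric_limits<int64_t>::digits10 == 18);     // decimal64
static_assert(cuda::std::numeric_limits<__int128_t>::digits10 == 38);  // decimal128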
+ std::to_string(i)); + add_default_name(col_meta.child(i), std::to_string(i)); } }; for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index f144a02bc89..28144276066 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -216,7 +216,7 @@ std::tuple conversion_info(type_id column_type_id, int8_t converted_type = converted; if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - column_type_id != type_id::DECIMAL32 && column_type_id != type_id::DECIMAL64) { + not cudf::is_fixed_point(column_type_id)) { converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2ab5d7d696b..2c7d745bb4c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -342,6 +342,8 @@ struct leaf_schema_fn { } else if (std::is_same_v) { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + } else if (std::is_same_v) { + CUDF_FAIL("decimal128 currently not supported for parquet writer"); } else { CUDF_FAIL("Unsupported fixed point type for parquet writer"); } diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp index 16894168b31..cf91932ca19 100644 --- a/cpp/src/jit/type.cpp +++ b/cpp/src/jit/type.cpp @@ -76,6 +76,7 @@ std::string get_type_name(data_type type) case type_id::STRUCT: return CUDF_STRINGIFY(Struct); case type_id::DECIMAL32: return CUDF_STRINGIFY(int32_t); case type_id::DECIMAL64: return CUDF_STRINGIFY(int64_t); + case type_id::DECIMAL128: return CUDF_STRINGIFY(__int128_t); default: break; } diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index def4a400488..a0554833def 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace cudf { namespace detail { diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh index 5eeb6a1deb5..84387aba914 100644 --- a/cpp/src/reductions/scan/scan.cuh +++ b/cpp/src/reductions/scan/scan.cuh @@ -53,7 +53,7 @@ std::unique_ptr scan_agg_dispatch(const column_view& input, case aggregation::PRODUCT: // a product scan on a decimal type with non-zero scale would result in each element having // a different scale, and because scale is stored once per column, this is not possible - if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64 cannot support product scan"); + if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64/128 cannot support product scan"); return type_dispatcher( input.type(), DispatchFn(), input, null_handling, stream, mr); default: CUDF_FAIL("Unsupported aggregation operator for scan"); diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu index 383b64d45a1..3cae782d18f 100644 --- a/cpp/src/reductions/scan/scan_exclusive.cu +++ b/cpp/src/reductions/scan/scan_exclusive.cu @@ -50,7 +50,7 @@ struct scan_dispatcher { * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column with scan results */ - template ::value>* = nullptr> + template ::value>* = nullptr> std::unique_ptr operator()(column_view const& input, null_policy, rmm::cuda_stream_view stream, @@ -72,7 +72,8 @@ struct scan_dispatcher { } template - 
std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) + std::enable_if_t::value, std::unique_ptr> operator()( + Args&&...) { CUDF_FAIL("Non-arithmetic types not supported for exclusive scan"); } diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index d83ad91d89b..13dfe5cb26c 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -74,7 +74,7 @@ std::unique_ptr simple_reduction(column_view const& col, /** * @brief Reduction for `sum`, `product`, `min` and `max` for decimal types * - * @tparam DecimalXX The `decimal32` or `decimal64` type + * @tparam DecimalXX The `decimal32`, `decimal64` or `decimal128` type * @tparam Op The operator of cudf::reduction::op:: * * @param col Input column of data to reduce @@ -395,7 +395,7 @@ struct element_type_dispatcher { } /** - * @brief Specialization for reducing integer column types to any output type. + * @brief Specialization for reducing fixed_point column types to fixed_point number */ template ()>* = nullptr> diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 24f55d67a72..36dd2dabd72 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -46,26 +47,26 @@ inline double __device__ generic_round_half_even(double d) { return rint(d); } inline float __device__ generic_modf(float a, float* b) { return modff(a, b); } inline double __device__ generic_modf(double a, double* b) { return modf(a, b); } -template ::value>* = nullptr> +template ::value>* = nullptr> T __device__ generic_abs(T value) { - return abs(value); + return numeric::detail::abs(value); } -template ::value>* = nullptr> +template ::value>* = nullptr> T __device__ generic_abs(T value) { return value; } -template ::value>* = nullptr> +template ::value>* = nullptr> int16_t __device__ generic_sign(T value) { return value < 0 ? -1 : 1; } // this is needed to suppress warning: pointless comparison of unsigned integer with zero -template ::value>* = nullptr> +template ::value>* = nullptr> int16_t __device__ generic_sign(T) { return 1; @@ -86,7 +87,7 @@ struct half_up_zero { return generic_round(e); } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -105,7 +106,7 @@ struct half_up_positive { return integer_part + generic_round(fractional_part * n) / n; } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -122,7 +123,7 @@ struct half_up_negative { return generic_round(e / n) * n; } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U e) { auto const down = (e / n) * n; // result from rounding down @@ -139,7 +140,7 @@ struct half_even_zero { return generic_round_half_even(e); } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -158,7 +159,7 @@ struct half_even_positive { return integer_part + generic_round_half_even(fractional_part * n) / n; } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. 
Just for compilation @@ -175,7 +176,7 @@ struct half_even_negative { return generic_round_half_even(e / n) * n; } - template ::value>* = nullptr> + template ::value>* = nullptr> __device__ U operator()(U e) { auto const down_over_n = e / n; // use this to determine HALF_EVEN case diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 5b7abdfcaf0..4f6774be184 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -208,6 +208,7 @@ typename fixed_point_scalar::rep_type const* fixed_point_scalar::data() co */ template class fixed_point_scalar; template class fixed_point_scalar; +template class fixed_point_scalar; namespace detail { @@ -281,6 +282,7 @@ template class fixed_width_scalar; template class fixed_width_scalar; template class fixed_width_scalar; template class fixed_width_scalar; +template class fixed_width_scalar<__int128_t>; template class fixed_width_scalar; template class fixed_width_scalar; template class fixed_width_scalar; @@ -339,6 +341,7 @@ template class numeric_scalar; template class numeric_scalar; template class numeric_scalar; template class numeric_scalar; +template class numeric_scalar<__int128_t>; template class numeric_scalar; template class numeric_scalar; template class numeric_scalar; diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index c29aa6560e9..6944a8eb097 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -37,6 +37,9 @@ #include #include +#include +#include + namespace cudf { namespace strings { namespace detail { @@ -83,7 +86,7 @@ struct string_to_decimal_check_fn { int32_t const scale; string_to_decimal_check_fn(column_device_view const& d_strings, int32_t scale) - : d_strings(d_strings), scale(scale) + : d_strings{d_strings}, scale{scale} { } @@ -97,7 +100,7 @@ struct string_to_decimal_check_fn { auto const iter_end = d_str.data() + d_str.size_bytes(); - using UnsignedDecimalType = std::make_unsigned_t; + using UnsignedDecimalType = cuda::std::make_unsigned_t; auto [value, exp_offset] = parse_integer(iter, iter_end); // only exponent notation is expected here @@ -115,7 +118,7 @@ struct string_to_decimal_check_fn { // finally, check for overflow based on the exp_ten and scale values return (exp_ten < scale) or value <= static_cast( - std::numeric_limits::max() / + cuda::std::numeric_limits::max() / static_cast(exp10(static_cast(exp_ten - scale)))); } }; @@ -206,8 +209,8 @@ struct decimal_to_string_size_fn { if (scale >= 0) return count_digits(value) + scale; - auto const abs_value = std::abs(value); - auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10(-scale); auto const fraction = count_digits(abs_value % exp_ten); auto const num_zeros = std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative @@ -247,9 +250,9 @@ struct decimal_to_string_fn { // write format: [-]integer.fraction // where integer = abs(value) / (10^abs(scale)) // fraction = abs(value) % (10^abs(scale)) - auto const abs_value = std::abs(value); if (value < 0) *d_buffer++ = '-'; // add sign - auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10(-scale); auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); d_buffer += integer_to_string(abs_value / exp_ten, 
d_buffer); // add the integer part diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 746923526a1..234ecf48f2e 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -64,8 +64,8 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) *d_buffer = '0'; return 1; } - bool const is_negative = std::is_signed::value ? (value < 0) : false; - // + bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; + constexpr IntegerType base = 10; constexpr int MAX_DIGITS = 20; // largest 64-bit integer is 20 digits char digits[MAX_DIGITS]; // place-holder for digit chars @@ -97,36 +97,25 @@ template constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; - bool is_negative = std::is_signed::value ? (value < 0) : false; + bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; // abs(std::numeric_limits::min()) is negative; // for all integer types, the max() and min() values have the same number of digits - value = (value == std::numeric_limits::min()) - ? std::numeric_limits::max() + value = (value == cuda::std::numeric_limits::min()) + ? cuda::std::numeric_limits::max() : cudf::util::absolute_value(value); - // largest 8-byte unsigned value is 18446744073709551615 (20 digits) - // clang-format off - size_type digits = - (value < 10 ? 1 : - (value < 100 ? 2 : - (value < 1000 ? 3 : - (value < 10000 ? 4 : - (value < 100000 ? 5 : - (value < 1000000 ? 6 : - (value < 10000000 ? 7 : - (value < 100000000 ? 8 : - (value < 1000000000 ? 9 : - (value < 10000000000 ? 10 : - (value < 100000000000 ? 11 : - (value < 1000000000000 ? 12 : - (value < 10000000000000 ? 13 : - (value < 100000000000000 ? 14 : - (value < 1000000000000000 ? 15 : - (value < 10000000000000000 ? 16 : - (value < 100000000000000000 ? 17 : - (value < 1000000000000000000 ? 18 : - (value < 10000000000000000000 ? 19 : - 20))))))))))))))))))); - // clang-format on + + auto const digits = [value] { + // largest 8-byte unsigned value is 18446744073709551615 (20 digits) + // largest 16-byte unsigned value is 340282366920938463463374607431768211455 (39 digits) + auto constexpr max_digits = std::is_same_v ? 
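// Shape of the digit-count loop above, as a host-side illustrative sketch: compare
// against successive powers of ten, capping at 39 digits for __int128_t (the
// largest unsigned 128-bit value has 39 decimal digits) and 20 otherwise.
int count_digits_sketch(__int128_t value)  // assumes value >= 0
{
  int digits = 1;
  for (__int128_t pow10 = 10; digits < 39; ++digits, pow10 *= 10)
    if (value < pow10) break;
  return digits;
}
// count_digits_sketch(0) == 1, count_digits_sketch(12345) == 5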
39 : 20; + + size_type digits = 1; + __int128_t pow10 = 10; + for (; digits < max_digits; ++digits, pow10 *= 10) + if (value < pow10) break; + return digits; + }(); + return digits + static_cast(is_negative); } diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index efa011ea4a6..06b03a6b36f 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -205,7 +205,7 @@ struct flatten_functor { thrust::optional parent_index) { // track branch depth as we reach this list and after we pass it - size_type const branch_depth_start = cur_branch_depth; + auto const branch_depth_start = cur_branch_depth; auto const is_list_inside_struct = parent_index && out[parent_index.value()].type().id() == type_id::STRUCT; if (is_list_inside_struct) { diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index c316b2c6f4e..e852b00796a 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -160,7 +160,7 @@ struct device_cast { * @brief Takes a `fixed_point` column_view as @p input and returns a `fixed_point` column with new * @p scale * - * @tparam T Type of the `fixed_point` column_view (`decimal32` or `decimal64`) + * @tparam T Type of the `fixed_point` column_view (`decimal32`, `decimal64` or `decimal128`) * @param input Input `column_view` * @param scale `scale` of the returned `column` * @param stream CUDA stream used for device memory operations and kernel launches @@ -176,7 +176,7 @@ std::unique_ptr rescale(column_view input, { using namespace numeric; - if (input.type().scale() > scale) { + if (input.type().scale() >= scale) { auto const scalar = make_fixed_point_scalar(0, scale_type{scale}); auto const type = cudf::data_type{cudf::type_to_id(), scale}; return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr); @@ -338,9 +338,9 @@ struct dispatch_unary_cast_to { { if (!cudf::is_fixed_width()) - CUDF_FAIL("Column type must be numeric or chrono or decimal32/64"); + CUDF_FAIL("Column type must be numeric or chrono or decimal32/64/128"); else if (cudf::is_fixed_point()) - CUDF_FAIL("Currently only decimal32/64 to floating point/integral is supported"); + CUDF_FAIL("Currently only decimal32/64/128 to floating point/integral is supported"); else if (cudf::is_timestamp() && is_numeric()) CUDF_FAIL("Timestamps can be created only from duration"); else @@ -364,7 +364,7 @@ struct dispatch_unary_cast_from { template std::enable_if_t(), std::unique_ptr> operator()(Args&&...) 
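// Worked example for rescale() above (values illustrative). Padding digits goes
// through the ADD-with-a-zero-scalar path: rep 1234 at scale -1 (123.4) rescaled
// to scale -2 becomes rep 12340 (123.40). Dropping digits goes through the cast
// path: rep 12345 at scale -2 (123.45) rescaled to scale -1 becomes rep 1234
// (123.4). The '>' to '>=' change routes the equal-scale case through the ADD
// path as well, which simply reproduces the input at the same scale.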
{ - CUDF_FAIL("Column type must be numeric or chrono or decimal32/64"); + CUDF_FAIL("Column type must be numeric or chrono or decimal32/64/128"); } }; } // anonymous namespace diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index e5b20243810..a938fc4d650 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -271,7 +271,7 @@ struct fixed_point_floor { template struct fixed_point_abs { T n; - __device__ T operator()(T data) { return std::abs(data); } + __device__ T operator()(T data) { return numeric::detail::abs(data); } }; template typename FixedPointFunctor> diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index a177a8378b7..7a9f6135bcd 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -107,21 +107,27 @@ struct BinaryOperationCompiledTest : public BinaryOperationTest { // t t + d // d d + t d + d -using Add_types = - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - // cudf::test::Types, //valid - cudf::test::Types, - cudf::test::Types, - // Extras - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; +using namespace numeric; + +using Add_types = cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + // cudf::test::Types, //valid + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + // Extras + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; + template struct BinaryOperationCompiledTest_Add : public BinaryOperationCompiledTest { }; @@ -144,8 +150,13 @@ using Sub_types = cudf::test::Types, // t - d cudf::test::Types, // d - d cudf::test::Types, // d - d - cudf::test::Types, - cudf::test::Types>; + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; + template struct BinaryOperationCompiledTest_Sub : public BinaryOperationCompiledTest { }; @@ -161,14 +172,20 @@ TYPED_TEST(BinaryOperationCompiledTest_Sub, Vector_Vector) // n n * n n * d // t // d d * n -using Mul_types = - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; +using Mul_types = cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; + template struct BinaryOperationCompiledTest_Mul : public BinaryOperationCompiledTest { }; @@ -184,16 +201,20 @@ TYPED_TEST(BinaryOperationCompiledTest_Mul, Vector_Vector) // n n / n // t // d d / n d / d -using Div_types = - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; +using Div_types = cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; + template struct BinaryOperationCompiledTest_Div : public BinaryOperationCompiledTest { }; @@ 
-209,13 +230,11 @@ TYPED_TEST(BinaryOperationCompiledTest_Div, Vector_Vector) // n n / n // t // d -using TrueDiv_types = - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; +using TrueDiv_types = cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; + template struct BinaryOperationCompiledTest_TrueDiv : public BinaryOperationCompiledTest { }; @@ -458,16 +477,17 @@ TYPED_TEST(BinaryOperationCompiledTest_Logical, LogicalOr_Vector_Vector) // Comparison Operations ==, !=, <, >, <=, >= // nn, tt, dd, ss, dcdc -using Comparison_types = - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; +using Comparison_types = cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; template struct BinaryOperationCompiledTest_Comparison : public BinaryOperationCompiledTest { @@ -519,9 +539,15 @@ using Null_types = cudf::test::Types, cudf::test::Types, // cudf::test::Types, // only fixed-width - cudf::test::Types, - cudf::test::Types, - cudf::test::Types>; + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types, + cudf::test::Types>; template struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest { diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp index 9cdd03fdd62..427a21512a3 100644 --- a/cpp/tests/binaryop/binop-integration-test.cpp +++ b/cpp/tests/binaryop/binop-integration-test.cpp @@ -2018,14 +2018,14 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd) { using namespace numeric; using decimalXX = TypeParam; @@ -2053,12 +2053,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply) { using namespace numeric; using decimalXX = TypeParam; @@ -2086,7 +2086,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, 
type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } @@ -2094,7 +2094,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply) template using fp_wrapper = cudf::test::fixed_point_column_wrapper; -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply2) { using namespace numeric; using decimalXX = TypeParam; @@ -2108,12 +2108,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv) { using namespace numeric; using decimalXX = TypeParam; @@ -2127,12 +2127,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv2) { using namespace numeric; using decimalXX = TypeParam; @@ -2146,12 +2146,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv3) { using namespace numeric; using decimalXX = TypeParam; @@ -2163,12 +2163,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv4) { using namespace numeric; using decimalXX = TypeParam; @@ -2183,12 +2183,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd2) { using namespace numeric; using 
decimalXX = TypeParam; @@ -2202,12 +2202,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd3) { using namespace numeric; using decimalXX = TypeParam; @@ -2221,12 +2221,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd4) { using namespace numeric; using decimalXX = TypeParam; @@ -2238,12 +2238,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd5) { using namespace numeric; using decimalXX = TypeParam; @@ -2255,12 +2255,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd6) { using namespace numeric; using decimalXX = TypeParam; @@ -2272,14 +2272,14 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6) auto const expected2 = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; - auto const result1 = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type1); - auto const result2 = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type2); + auto const result1 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1); + auto const result2 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointCast) +TYPED_TEST(FixedPointTestAllReps, FixedPointCast) { using namespace numeric; using decimalXX = TypeParam; @@ -2293,7 +2293,7 @@ TYPED_TEST(FixedPointTestBothReps, 
FixedPointCast) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiplyScalar) { using namespace numeric; using decimalXX = TypeParam; @@ -2305,12 +2305,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpSimplePlus) { using namespace numeric; using decimalXX = TypeParam; @@ -2324,12 +2324,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimple) { using namespace numeric; using decimalXX = TypeParam; @@ -2340,13 +2340,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple) auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; auto const expected = wrapper(trues.begin(), trues.end()); - auto const result = cudf::jit::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0) { using namespace numeric; using decimalXX = TypeParam; @@ -2357,12 +2357,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0) auto const expected = wrapper(trues.begin(), trues.end()); auto const result = - cudf::jit::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0Null) { using namespace numeric; using decimalXX = TypeParam; @@ -2372,13 +2372,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null) auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = cudf::jit::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null) 
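// Why the EqualSimple tests above pass: fixed_point comparison rescales both sides
// to a common scale before comparing representations, so rep 1 at scale 0 and
// rep 100 at scale -2 both denote the value 1 and compare equal. Illustrative:
numeric::decimal32 const x{numeric::scaled_integer<int32_t>{1, numeric::scale_type{0}}};
numeric::decimal32 const y{numeric::scaled_integer<int32_t>{100, numeric::scale_type{-2}}};
// x == y evaluates to true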
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale2Null) { using namespace numeric; using decimalXX = TypeParam; @@ -2388,13 +2388,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null) auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = cudf::jit::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater) { using namespace numeric; using decimalXX = TypeParam; @@ -2415,8 +2415,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(iota_3).type(), static_cast(zeros_3).type()); - auto const iota_3_after_add = - cudf::jit::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); + auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view()); @@ -2427,19 +2426,19 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater) auto const btype = cudf::data_type{type_id::BOOL8}; auto const equal_result = - cudf::jit::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); + cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view()); auto const less_result = - cudf::jit::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); + cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view()); auto const greater_result = - cudf::jit::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); + cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMaxSimple) { using namespace numeric; using decimalXX = TypeParam; @@ -2454,12 +2453,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX, static_cast(col1).type(), static_cast(col2).type()); - auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MAX, type); + auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMinSimple) { using namespace numeric; using decimalXX = TypeParam; @@ -2474,12 +2473,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN, static_cast(col1).type(), static_cast(col2).type()); - auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MIN, type); + auto const result = 
cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullEqualsSimple) { using namespace numeric; using decimalXX = TypeParam; @@ -2490,13 +2489,13 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple) auto const col2 = fp_wrapper{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}}; auto const expected = wrapper{{1, 0, 0, 1}, {1, 1, 1, 1}}; - auto const result = cudf::jit::binary_operation( + auto const result = cudf::binary_operation( col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div) { using namespace numeric; using decimalXX = TypeParam; @@ -2507,12 +2506,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div) auto const expected = fp_wrapper{{25, 75, 125, 175}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div2) { using namespace numeric; using decimalXX = TypeParam; @@ -2523,12 +2522,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2) auto const expected = fp_wrapper{{5000, 15000, 25000, 35000}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div3) { using namespace numeric; using decimalXX = TypeParam; @@ -2539,12 +2538,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3) auto const expected = fp_wrapper{{3333, 3333, 16666, 23333}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div4) { using namespace numeric; using decimalXX = TypeParam; @@ -2555,12 +2554,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4) auto const expected = fp_wrapper{{3, 10, 16, 23}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div6) { using namespace numeric; using decimalXX = TypeParam; @@ -2572,12 +2571,12 @@ TYPED_TEST(FixedPointTestBothReps, 
FixedPointBinaryOp_Div6) auto const expected = fp_wrapper{{300, 100, 60, 42}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div7) { using namespace numeric; using decimalXX = TypeParam; @@ -2589,12 +2588,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7) auto const expected = fp_wrapper{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}}; auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div8) { using namespace numeric; using decimalXX = TypeParam; @@ -2605,12 +2604,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8) auto const expected = fp_wrapper{{0, 1, 16}, scale_type{2}}; auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div9) { using namespace numeric; using decimalXX = TypeParam; @@ -2621,12 +2620,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9) auto const expected = fp_wrapper{{1, 2, 4}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div10) { using namespace numeric; using decimalXX = TypeParam; @@ -2637,12 +2636,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10) auto const expected = fp_wrapper{{14, 28, 42}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11) +TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div11) { using namespace numeric; using decimalXX = TypeParam; @@ -2653,12 +2652,12 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11) auto const expected = fp_wrapper{{142, 285, 428}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows) 
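// Scale arithmetic behind the Div tests above, a sketch with illustrative numbers
// (assuming fixed_point division divides the representations and subtracts the
// scales, with the tests then requesting a specific output scale via the
// data_type passed to binary_operation):
numeric::decimal64 const a{numeric::scaled_integer<int64_t>{150, numeric::scale_type{-2}}};  // 1.50
numeric::decimal64 const b{numeric::scaled_integer<int64_t>{3, numeric::scale_type{0}}};     // 3
auto const q = a / b;  // rep 50 at scale -2 - 0 == -2, i.e. 0.50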
+TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2666,13 +2665,50 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows)
   auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
-  auto const float_type    = data_type{type_id::FLOAT32};
-  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
-               cudf::logic_error);
-  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
                cudf::logic_error);
 }
 
+template <typename T>
+struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture {
+};
+
+using Decimal64And128Types = cudf::test::Types<numeric::decimal64, numeric::decimal128>;
+TYPED_TEST_SUITE(FixedPointTest_64_128_Reps, Decimal64And128Types);
+
+TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  for (auto const rhs_value : {10000000000000000, 100000000000000000}) {
+    auto const lhs       = fp_wrapper<RepType>{{33041, 97290, 36438, 25379, 48473}, scale_type{2}};
+    auto const rhs       = make_fixed_point_scalar<decimalXX>(rhs_value, scale_type{0});
+    auto const trues     = wrapper<bool>{{1, 1, 1, 1, 1}};
+    auto const falses    = wrapper<bool>{{0, 0, 0, 0, 0}};
+    auto const bool_type = cudf::data_type{type_id::BOOL8};
+
+    auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type);
+    auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
+    auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
+    auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type);
+    auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
+    auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
+  }
+}
+
 }  // namespace binop
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/copying/concatenate_tests.cu
index d2279580c58..306037e6473 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -1549,15 +1549,15 @@ TEST_F(ListsColumnTest, ListOfStructs)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 struct FixedPointTest : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointConcatentate)
+TYPED_TEST(FixedPointTestAllReps, FixedPointConcatentate)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
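A quick standalone sketch of the semantics the FixedPoint_64_128_ComparisonTests above rely on: fixed_point comparison operators rescale the two operands to a common scale and then compare the underlying integer reps, which is why every lhs value stored at scale 2 sorts below a scalar of 10^16. This is illustrative only, not part of the patch (the function name comparison_sketch is invented):

#include <cudf/fixed_point/fixed_point.hpp>
#include <cassert>

void comparison_sketch()
{
  using namespace numeric;
  // rep 33041 at scale 2 represents 3'304'100; the scalar is 10^16 at scale 0.
  auto const lhs = decimal128{scaled_integer<__int128_t>{33041, scale_type{2}}};
  auto const rhs = decimal128{scaled_integer<__int128_t>{10000000000000000, scale_type{0}}};
  assert(lhs < rhs);  // operands are brought to a common scale before the integer compare
  assert(rhs >= lhs && !(rhs < lhs));
}

diff --git a/cpp/tests/copying/scatter_tests.cpp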
b/cpp/tests/copying/scatter_tests.cpp index e4846d4b2c6..28ebb6cbcb6 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -899,14 +899,14 @@ TEST_F(BooleanMaskScatterScalarFails, NumberOfColumnAndScalarMismatch) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, FixedPointScatter) +TYPED_TEST(FixedPointTestAllReps, FixedPointScatter) { using namespace numeric; using decimalXX = TypeParam; diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 6ced6e545d5..f305d4a06c7 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -363,4 +363,47 @@ TEST_F(FillErrorTestFixture, DTypeMismatch) EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::logic_error); } +template +class FixedPointAllReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(FixedPointAllReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointAllReps, OutOfPlaceFill) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + for (auto const i : {0, -1, -2, -3, -4}) { + auto const scale = scale_type{i}; + auto const column = fp_wrapper{{4104, 42, 1729, 55}, scale}; + auto const expected = fp_wrapper{{42, 42, 42, 42}, scale}; + auto const scalar = cudf::make_fixed_point_scalar(42, scale); + + auto const result = cudf::fill(column, 0, 4, *scalar); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + } +} + +TYPED_TEST(FixedPointAllReps, InPlaceFill) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + for (auto const i : {0, -1, -2, -3, -4}) { + auto const scale = scale_type{i}; + auto column = fp_wrapper{{4104, 42, 1729, 55}, scale}; + auto const expected = fp_wrapper{{42, 42, 42, 42}, scale}; + auto const scalar = cudf::make_fixed_point_scalar(42, scale); + + auto mut_column = cudf::mutable_column_view{column}; + cudf::fill_in_place(mut_column, 0, 4, *scalar); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(column, expected); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 339585756c9..d01fec3e173 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -35,14 +35,14 @@ struct FixedPointTest : public cudf::test::BaseFixture { }; template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; using RepresentationTypes = ::testing::Types; -TYPED_TEST_SUITE(FixedPointTestBothReps, RepresentationTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes); -TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXConstruction) +TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) { using decimalXX = fixed_point; @@ -63,7 +63,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXConstruction) EXPECT_EQ(1.234567, static_cast(num6)); } -TYPED_TEST(FixedPointTestBothReps, SimpleNegativeDecimalXXConstruction) 
+TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) { using decimalXX = fixed_point; @@ -84,7 +84,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleNegativeDecimalXXConstruction) EXPECT_EQ(-1.234567, static_cast(num6)); } -TYPED_TEST(FixedPointTestBothReps, PaddedDecimalXXConstruction) +TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) { using decimalXX = fixed_point; @@ -109,7 +109,7 @@ TYPED_TEST(FixedPointTestBothReps, PaddedDecimalXXConstruction) EXPECT_EQ(0.000123, static_cast(y)); } -TYPED_TEST(FixedPointTestBothReps, SimpleBinaryFPConstruction) +TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction) { using binary_fp = fixed_point; @@ -138,7 +138,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleBinaryFPConstruction) EXPECT_EQ(1.4375, static_cast(num9)); } -TYPED_TEST(FixedPointTestBothReps, MoreSimpleBinaryFPConstruction) +TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction) { using binary_fp = fixed_point; @@ -149,7 +149,7 @@ TYPED_TEST(FixedPointTestBothReps, MoreSimpleBinaryFPConstruction) EXPECT_EQ(2.0625, static_cast(num1)); } -TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXMath) +TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) { using decimalXX = fixed_point; @@ -174,7 +174,7 @@ TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXMath) EXPECT_EQ(a - b, a); } -TYPED_TEST(FixedPointTestBothReps, ComparisonOperators) +TYPED_TEST(FixedPointTestAllReps, ComparisonOperators) { using decimalXX = fixed_point; @@ -193,7 +193,7 @@ TYPED_TEST(FixedPointTestBothReps, ComparisonOperators) EXPECT_TRUE(SIX / TWO >= ONE); } -TYPED_TEST(FixedPointTestBothReps, DecimalXXTrickyDivision) +TYPED_TEST(FixedPointTestAllReps, DecimalXXTrickyDivision) { using decimalXX = fixed_point; @@ -223,7 +223,7 @@ TYPED_TEST(FixedPointTestBothReps, DecimalXXTrickyDivision) EXPECT_EQ(static_cast(n), 20); } -TYPED_TEST(FixedPointTestBothReps, DecimalXXRounding) +TYPED_TEST(FixedPointTestAllReps, DecimalXXRounding) { using decimalXX = fixed_point; @@ -251,7 +251,7 @@ TYPED_TEST(FixedPointTestBothReps, DecimalXXRounding) EXPECT_TRUE(FIVE_0 * THREE_0 != TEN_1); } -TYPED_TEST(FixedPointTestBothReps, ArithmeticWithDifferentScales) +TYPED_TEST(FixedPointTestAllReps, ArithmeticWithDifferentScales) { using decimalXX = fixed_point; @@ -276,7 +276,7 @@ TYPED_TEST(FixedPointTestBothReps, ArithmeticWithDifferentScales) EXPECT_EQ(c - d, zz); } -TYPED_TEST(FixedPointTestBothReps, RescaledTest) +TYPED_TEST(FixedPointTestAllReps, RescaledTest) { using decimalXX = fixed_point; @@ -296,7 +296,7 @@ TYPED_TEST(FixedPointTestBothReps, RescaledTest) EXPECT_EQ(num5, num6.rescaled(scale_type{-5})); } -TYPED_TEST(FixedPointTestBothReps, RescaledRounding) +TYPED_TEST(FixedPointTestAllReps, RescaledRounding) { using decimalXX = fixed_point; @@ -311,7 +311,7 @@ TYPED_TEST(FixedPointTestBothReps, RescaledRounding) EXPECT_EQ(-1000, static_cast(num3.rescaled(scale_type{3}))); } -TYPED_TEST(FixedPointTestBothReps, BoolConversion) +TYPED_TEST(FixedPointTestAllReps, BoolConversion) { using decimalXX = fixed_point; @@ -468,7 +468,7 @@ struct cast_to_int32_fn { int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; -TYPED_TEST(FixedPointTestBothReps, FixedPointColumnWrapper) +TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) { using namespace numeric; using decimalXX = fixed_point; @@ -489,7 +489,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointColumnWrapper) CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, w); } -TYPED_TEST(FixedPointTestBothReps, NoScaleOrWrongTypeID) 
+TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID) { auto null_mask = cudf::create_null_mask(0, cudf::mask_state::ALL_NULL); @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestBothReps, NoScaleOrWrongTypeID) cudf::logic_error); } -TYPED_TEST(FixedPointTestBothReps, SimpleFixedPointColumnWrapper) +TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper) { using RepType = cudf::device_storage_type_t; @@ -524,8 +524,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32) auto const expected2 = fp_wrapper{{50000000}, scale_type{6}}; auto const type = cudf::data_type{cudf::type_id::DECIMAL32, 6}; - auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type); - auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type); + auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type); + auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); @@ -547,14 +547,14 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64) auto const expected2 = fp_wrapper{{50000000}, scale_type{100}}; auto const type = cudf::data_type{cudf::type_id::DECIMAL64, 100}; - auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type); - auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type); + auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type); + auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); } -TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale) +TYPED_TEST(FixedPointTestAllReps, ExtremelyLargeNegativeScale) { // This is testing fixed_point values with an extremely large negative scale. 
The fixed_point
  // implementation should be able to handle any scale representable by an int32_t
@@ -570,10 +570,10 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
   auto const expected2 = fp_wrapper<RepType>{{5}, scale_type{-201}};
 
   auto const type1   = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
-  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type1);
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);
 
   auto const type2   = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
-  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type2);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu
index c650a7191be..51610949f6f 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -40,14 +40,14 @@ struct FixedPointTest : public cudf::test::BaseFixture {
 };
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-using RepresentationTypes = ::testing::Types<int32_t, int64_t>;
+using RepresentationTypes = ::testing::Types<int32_t, int64_t, __int128_t>;
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, RepresentationTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes);
 
-TYPED_TEST(FixedPointTestBothReps, DecimalXXThrust)
+TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
diff --git a/cpp/tests/groupby/count_scan_tests.cpp
index 062efe6094e..164e967e28e 100644
--- a/cpp/tests/groupby/count_scan_tests.cpp
+++ b/cpp/tests/groupby/count_scan_tests.cpp
@@ -156,12 +156,12 @@ TEST_F(groupby_count_scan_string_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupByCountScan)
+TYPED_TEST(FixedPointTestAllReps, GroupByCountScan)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/groupby/count_tests.cpp
index c70f3fd942a..c1cabe3fb08 100644
--- a/cpp/tests/groupby/count_tests.cpp
+++ b/cpp/tests/groupby/count_tests.cpp
@@ -169,12 +169,12 @@ TEST_F(groupby_count_string_test, basic)
 // clang-format on
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, GroupByCount)
+TYPED_TEST(FixedPointTestAllReps, GroupByCount)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/groupby/max_scan_tests.cpp
index bb2f87fd424..196aeed0430 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -145,12 +145,12 @@ TEST_F(groupby_max_scan_string_test, basic)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
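Every rename in these groupby files follows the same mechanical pattern: the fixtures were called FixedPointTestBothReps when cudf::test::FixedPointTypes covered only decimal32 and decimal64; with decimal128 added to that list, each typed suite now instantiates once per representation, hence "AllReps". A minimal sketch of the mechanism (the suite name ExampleDecimalSuite is illustrative, not from the patch):

template <typename T>
struct ExampleDecimalSuite : public cudf::test::BaseFixture {
};
// cudf::test::FixedPointTypes now lists decimal32, decimal64, and decimal128,
// so every TYPED_TEST below runs three times, once per rep (int32_t/int64_t/__int128_t).
TYPED_TEST_SUITE(ExampleDecimalSuite, cudf::test::FixedPointTypes);

TYPED_TEST(ExampleDecimalSuite, ScaleRoundTrip)
{
  using RepType = cudf::device_storage_type_t<TypeParam>;
  auto const x  = TypeParam{numeric::scaled_integer<RepType>{42, numeric::scale_type{-1}}};  // 4.2
  EXPECT_EQ(x, x.rescaled(numeric::scale_type{-1}));  // rescaling to the same scale is a no-op
}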
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxScanDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index 8d15401aa09..983802cb9a2 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -252,12 +252,12 @@ TEST_F(groupby_dictionary_max_test, fixed_width) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortMaxDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; @@ -281,7 +281,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue) } } -TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupByHashMaxDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp index 06c0f5ceb3b..e4c018a9ce1 100644 --- a/cpp/tests/groupby/min_scan_tests.cpp +++ b/cpp/tests/groupby/min_scan_tests.cpp @@ -143,12 +143,12 @@ TEST_F(groupby_min_scan_string_test, basic) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortMinScanDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index c2cfca83b29..aca3384768c 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -252,12 +252,12 @@ TEST_F(groupby_dictionary_min_test, fixed_width) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortMinDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; @@ -280,7 +280,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue) } } -TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupByHashMinDecimalAsValue) { using namespace numeric; using decimalXX = TypeParam; diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp index 2de32b70d14..3117f8b1557 100644 --- a/cpp/tests/groupby/sum_scan_tests.cpp +++ b/cpp/tests/groupby/sum_scan_tests.cpp @@ -133,18 +133,17 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public 
cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortSumScanDecimalAsValue) { using namespace numeric; - using decimalXX = TypeParam; - using RepType = cudf::device_storage_type_t; - using fp_wrapper = fixed_point_column_wrapper; - using out_fp_wrapper = fixed_point_column_wrapper; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; for (auto const i : {2, 1, 0, -1, -2}) { auto const scale = scale_type{i}; @@ -152,8 +151,8 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; - auto const expect_keys = key_wrapper {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; - auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_sum = fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; // clang-format on auto agg2 = cudf::make_sum_aggregation(); diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 1aa6358b7b2..5947e309bec 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -157,19 +157,18 @@ TYPED_TEST(groupby_sum_test, dictionary) } template -struct FixedPointTestBothReps : public cudf::test::BaseFixture { +struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); -TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupBySortSumDecimalAsValue) { using namespace numeric; - using decimalXX = TypeParam; - using RepType = cudf::device_storage_type_t; - using fp_wrapper = cudf::test::fixed_point_column_wrapper; - using fp64_wrapper = cudf::test::fixed_point_column_wrapper; - using K = int32_t; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using K = int32_t; for (auto const i : {2, 1, 0, -1, -2}) { auto const scale = scale_type{i}; @@ -179,7 +178,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) // clang-format on auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; - auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale}; + auto const expect_vals_sum = fp_wrapper{{9, 19, 17}, scale}; auto agg1 = cudf::make_sum_aggregation(); test_single_agg( @@ -192,14 +191,13 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) } } -TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue) +TYPED_TEST(FixedPointTestAllReps, GroupByHashSumDecimalAsValue) { using namespace numeric; - using decimalXX = TypeParam; - using RepType = cudf::device_storage_type_t; - using fp_wrapper = cudf::test::fixed_point_column_wrapper; - using fp64_wrapper = cudf::test::fixed_point_column_wrapper; - using K = int32_t; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using K = int32_t; for (auto const i : {2, 1, 0, -1, -2}) { auto const scale = scale_type{i}; @@ -209,7 +207,7 @@ 
TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue)
   // clang-format on
 
   auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
-  auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale};
+  auto const expect_vals_sum = fp_wrapper{{9, 19, 17}, scale};
 
   auto agg5 = cudf::make_sum_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals_sum, std::move(agg5));
diff --git a/cpp/tests/io/orc_test.cpp
index 0633dfbf791..da44c91eec3 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -341,9 +341,12 @@ TEST_F(OrcWriterTest, MultiColumn)
   auto col3_data = random_values<int32_t>(num_rows);
   auto col4_data = random_values<float>(num_rows);
   auto col5_data = random_values<double>(num_rows);
-  auto col6_vals = random_values<int64_t>(num_rows);
+  auto col6_vals = random_values<__int128_t>(num_rows);
   auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{col6_vals[i], numeric::scale_type{2}};
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{12}};
+  });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}};
   });
   auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
@@ -353,16 +356,17 @@ TEST_F(OrcWriterTest, MultiColumn)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), validity};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), validity};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), validity};
-  column_wrapper<numeric::decimal64> col6{col6_data, col6_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> col6{col6_data, col6_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> col7{col7_data, col7_data + num_rows, validity};
 
-  cudf::test::lists_column_wrapper<int64_t> col7{
+  cudf::test::lists_column_wrapper<int64_t> col8{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}};
 
   auto child_col =
     cudf::test::fixed_width_column_wrapper<int32_t>{48, 27, 25, 31, 351, 351, 29, 15, -1, -99};
-  auto col8 = cudf::test::structs_column_wrapper{child_col};
+  auto col9 = cudf::test::structs_column_wrapper{child_col};
 
-  table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8});
+  table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8, col9});
 
   cudf_io::table_input_metadata expected_metadata(expected);
   expected_metadata.column_metadata[0].set_name("bools");
   expected_metadata.column_metadata[1].set_name("int8s");
   expected_metadata.column_metadata[2].set_name("int16s");
   expected_metadata.column_metadata[3].set_name("int32s");
   expected_metadata.column_metadata[4].set_name("floats");
   expected_metadata.column_metadata[5].set_name("doubles");
-  expected_metadata.column_metadata[6].set_name("decimal");
-  expected_metadata.column_metadata[7].set_name("lists");
-  expected_metadata.column_metadata[8].set_name("structs");
+  expected_metadata.column_metadata[6].set_name("decimal_pos_scale");
+  expected_metadata.column_metadata[7].set_name("decimal_neg_scale");
+  expected_metadata.column_metadata[8].set_name("lists");
+  expected_metadata.column_metadata[9].set_name("structs");
 
   auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc");
   cudf_io::orc_writer_options out_opts =
     cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
       .metadata(&expected_metadata);
   cudf_io::write_orc(out_opts);
 
   cudf_io::orc_reader_options in_opts =
-    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false);
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .use_index(false)
+      .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"});
   auto result = cudf_io::read_orc(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
@@ -1185,7 +1192,6 @@ TEST_F(OrcWriterTest, Decimal32)
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath});
   auto result = cudf_io::read_orc(in_opts);
 
-  // Need a 64bit decimal column for comparison since the reader always creates DECIMAL64 columns
   auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
     return numeric::decimal64{vals[i], numeric::scale_type{2}};
   });
@@ -1407,4 +1413,90 @@ TEST_F(OrcReaderTest, NestedColumnSelection)
   ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name);
 }
 
+TEST_F(OrcReaderTest, DecimalOptions)
+{
+  constexpr auto num_rows = 10;
+  auto col_vals = random_values<int64_t>(num_rows);
+  auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{col_vals[i], numeric::scale_type{2}};
+  });
+  auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; });
+
+  column_wrapper<numeric::decimal128> col{col_data, col_data + num_rows, mask};
+  table_view expected({col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("dec");
+
+  auto filepath = temp_env->get_temp_filepath("OrcDecimalOptions.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options valid_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .decimal128_columns({"dec", "fake_name"})
+      .decimal_cols_as_float({"decc", "fake_name"});
+  // Should not throw
+  EXPECT_NO_THROW(cudf_io::read_orc(valid_opts));
+
+  cudf_io::orc_reader_options invalid_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .decimal128_columns({"dec", "fake_name"})
+      .decimal_cols_as_float({"dec", "fake_name"});
+  // Should throw, options overlap
+  EXPECT_THROW(cudf_io::read_orc(invalid_opts), cudf::logic_error);
+}
+
+TEST_F(OrcWriterTest, DecimalOptionsNested)
+{
+  auto const num_rows = 100;
+
+  auto dec_vals  = random_values<int64_t>(num_rows);
+  auto keys_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal64{dec_vals[i], numeric::scale_type{2}};
+  });
+  auto vals_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{dec_vals[i], numeric::scale_type{2}};
+  });
+  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+
+  column_wrapper<numeric::decimal64> keys_col{keys_data, keys_data + num_rows, validity};
+  column_wrapper<numeric::decimal128> vals_col{vals_data, vals_data + num_rows, validity};
+
+  auto struct_col = cudf::test::structs_column_wrapper({keys_col, vals_col}).release();
+
+  std::vector<cudf::size_type> row_offsets(num_rows + 1);
+  std::iota(row_offsets.begin(), row_offsets.end(), 0);
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets(row_offsets.begin(),
+                                                                  row_offsets.end());
+
+  auto list_col =
+    cudf::make_lists_column(num_rows,
+                            offsets.release(),
+                            std::move(struct_col),
+                            cudf::UNKNOWN_NULL_COUNT,
+                            cudf::test::detail::make_null_mask(validity, validity + num_rows));
+
+  table_view expected({*list_col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("lists");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("dec64");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("dec128");
+
+  auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options in_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .use_index(false)
+      .decimal128_columns({"lists.1.dec128"});
+  auto result = cudf_io::read_orc(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/merge/merge_test.cpp
index 0bc5d047612..c2cd6202dff 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -874,15 +874,15 @@ TEST_F(MergeTest, StructsNestedWithNulls)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointMerge)
+TYPED_TEST(FixedPointTestAllReps, FixedPointMerge)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/reductions/reduction_tests.cpp
index e3a7a378d35..376f5ce5dd2 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1057,12 +1057,12 @@ TYPED_TEST(ReductionTest, UniqueCount)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductZeroScale)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProductZeroScale)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1086,7 +1086,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductZeroScale)
   EXPECT_EQ(result_fp, _24);
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProduct)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProduct)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1106,7 +1106,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProduct)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductWithNulls)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionProductWithNulls)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1126,7 +1126,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductWithNulls)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSum)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSum)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1147,7 +1147,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSum)
   }
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumAlternate)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumAlternate)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1171,7 +1171,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumAlternate)
   EXPECT_EQ(result_scalar->fixed_point_value(), TEN);
 }
 
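The fixed-point reduction tests in this file all share one calling pattern: build a fixed_point column, hand cudf::reduce an output cudf::data_type that carries the desired decimal type id and scale, and read the answer back through the scalar's fixed_point_value(). A minimal sketch of that pattern in isolation (column contents illustrative; the downcast spelling below is one plausible form, not quoted from the patch):

  // Sum of 1.1 + 2.2 + 3.3 at scale -1, requested back at the same scale -> 6.6
  auto const column   = cudf::test::fixed_point_column_wrapper<int64_t>{{11, 22, 33}, numeric::scale_type{-1}};
  auto const out_type = cudf::data_type{cudf::type_id::DECIMAL64, -1};
  auto const result   = cudf::reduce(column, cudf::make_sum_aggregation(), out_type);
  auto const value =
    static_cast<cudf::fixed_point_scalar<numeric::decimal64>*>(result.get())->fixed_point_value();
  // value holds rep 66 at scale -1, i.e. 6.6

-TYPED_TEST(FixedPointTestBothReps,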
FixedPointReductionSumFractional) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumFractional) { using namespace numeric; using decimalXX = TypeParam; @@ -1191,7 +1191,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumFractional) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumLarge) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumLarge) { using namespace numeric; using decimalXX = TypeParam; @@ -1214,7 +1214,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumLarge) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMin) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMin) { using namespace numeric; using decimalXX = TypeParam; @@ -1234,7 +1234,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMin) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMinLarge) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMinLarge) { using namespace numeric; using decimalXX = TypeParam; @@ -1255,7 +1255,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMinLarge) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMax) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMax) { using namespace numeric; using decimalXX = TypeParam; @@ -1275,7 +1275,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMax) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMaxLarge) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMaxLarge) { using namespace numeric; using decimalXX = TypeParam; @@ -1296,7 +1296,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMaxLarge) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNUnique) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionNUnique) { using namespace numeric; using decimalXX = TypeParam; @@ -1315,7 +1315,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNUnique) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumOfSquares) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionSumOfSquares) { using namespace numeric; using decimalXX = TypeParam; @@ -1335,7 +1335,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionSumOfSquares) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianOddNumberOfElements) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMedianOddNumberOfElements) { using namespace numeric; using decimalXX = TypeParam; @@ -1355,7 +1355,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianOddNumberOfElements) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianEvenNumberOfElements) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionMedianEvenNumberOfElements) { using namespace numeric; using decimalXX = TypeParam; @@ -1375,7 +1375,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionMedianEvenNumberOfElements } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionQuantile) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionQuantile) { using namespace numeric; using decimalXX = TypeParam; @@ -1397,7 +1397,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionQuantile) } } -TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNthElement) +TYPED_TEST(FixedPointTestAllReps, FixedPointReductionNthElement) { using namespace numeric; using decimalXX = TypeParam; @@ -1420,6 +1420,66 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointReductionNthElement) } } +struct Decimal128Only : public cudf::test::BaseFixture { +}; + +TEST_F(Decimal128Only, Decimal128ProductReduction) +{ + using namespace numeric; + using RepType = 
cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3}) {
+    auto const scale    = scale_type{i};
+    auto const column   = fp_wrapper{{2, 2, 2, 2, 2, 2, 2, 2, 2}, scale};
+    auto const expected = decimal128{scaled_integer<RepType>{512, scale_type{i * 9}}};
+
+    auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+    auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+    auto const result_scalar = static_cast<cudf::fixed_point_scalar<decimal128>*>(result.get());
+
+    EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+  }
+}
+
+TEST_F(Decimal128Only, Decimal128ProductReduction2)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {0, -1, -2, -3, -4, -5, -6}) {
+    auto const scale    = scale_type{i};
+    auto const column   = fp_wrapper{{1, 2, 3, 4, 5, 6}, scale};
+    auto const expected = decimal128{scaled_integer<RepType>{720, scale_type{i * 6}}};
+
+    auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+    auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+    auto const result_scalar = static_cast<cudf::fixed_point_scalar<decimal128>*>(result.get());
+
+    EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+  }
+}
+
+TEST_F(Decimal128Only, Decimal128ProductReduction3)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const values   = std::vector<RepType>(127, -2);
+  auto const scale    = scale_type{0};
+  auto const column   = fp_wrapper{values.cbegin(), values.cend(), scale};
+  auto const lowest   = cuda::std::numeric_limits<RepType>::lowest();
+  auto const expected = decimal128{scaled_integer<RepType>{lowest, scale}};
+
+  auto const out_type      = cudf::data_type{cudf::type_id::DECIMAL128, scale};
+  auto const result        = cudf::reduce(column, cudf::make_product_aggregation(), out_type);
+  auto const result_scalar = static_cast<cudf::fixed_point_scalar<decimal128>*>(result.get());
+
+  EXPECT_EQ(result_scalar->fixed_point_value(), expected);
+}
+
 TYPED_TEST(ReductionTest, NthElement)
 {
   using T = TypeParam;
diff --git a/cpp/tests/reductions/scan_tests.hpp
index b2c53cf5915..346103de85b 100644
--- a/cpp/tests/reductions/scan_tests.hpp
+++ b/cpp/tests/reductions/scan_tests.hpp
@@ -52,6 +52,11 @@ struct TypeParam_to_host_type<numeric::decimal64> {
   using type = numeric::decimal64::rep;
 };
 
+template <>
+struct TypeParam_to_host_type<numeric::decimal128> {
+  using type = numeric::decimal128::rep;
+};
+
 template typename std::enable_if, thrust::host_vector>::type
diff --git a/cpp/tests/replace/replace_tests.cpp
index cfafbf26dac..92ccbbfbfd8 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -539,14 +539,14 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointReplace)
+TYPED_TEST(FixedPointTestAllReps, FixedPointReplace)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
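The three Decimal128Only product reductions above all lean on one identity: multiplying fixed_point values multiplies the reps and adds the scales, so nine factors of 2 at scale i produce rep 2^9 = 512 at scale 9 * i (and 127 factors of -2 land exactly on the lowest __int128_t, which is -2^127). A short sketch of that identity by itself (the function name and values are illustrative, not from the patch):

#include <cudf/fixed_point/fixed_point.hpp>
#include <cassert>

void product_scale_sketch()
{
  using namespace numeric;
  auto const a = decimal128{scaled_integer<__int128_t>{2, scale_type{-1}}};  // 0.2
  auto const b = a * a * a;  // reps multiply: 2*2*2 = 8; scales add: -1 + -1 + -1 = -3
  assert(b == decimal128{scaled_integer<__int128_t>{8, scale_type{-3}}});    // 0.008
}

diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp
index 2c0dd58e91e..fb8e5bdd01a 100644
--- a/cpp/tests/reshape/interleave_columns_tests.cpp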
+++ b/cpp/tests/reshape/interleave_columns_tests.cpp
@@ -357,12 +357,12 @@ TEST_F(InterleaveStringsColumnsTest, MultiColumnStringMixNullableMix)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointInterleave)
+TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/rolling/rolling_test.cpp
index 7d1645faba9..b335bf20f95 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -38,6 +38,7 @@
 #include
 #include
+#include
 #include
 
 using cudf::bitmask_type;
@@ -1199,15 +1200,14 @@ TYPED_TEST(FixedPointTests, MinMaxCountLagLeadNulls)
 {
   using namespace numeric;
   using namespace cudf;
-  using decimalXX    = TypeParam;
-  using RepType      = cudf::device_storage_type_t<decimalXX>;
-  using fp_wrapper   = cudf::test::fixed_point_column_wrapper<RepType>;
-  using fp64_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
-  using fw_wrapper   = cudf::test::fixed_width_column_wrapper<size_type>;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<size_type>;
 
   auto const scale        = scale_type{-1};
   auto const input        = fp_wrapper{{42, 1729, 55, 343, 1, 2}, {1, 0, 1, 0, 1, 1}, scale};
-  auto const expected_sum = fp64_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
+  auto const expected_sum = fp_wrapper{{42, 97, 55, 56, 3, 3}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_min = fp_wrapper{{42, 42, 55, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_max = fp_wrapper{{42, 55, 55, 55, 2, 2}, {1, 1, 1, 1, 1, 1}, scale};
   auto const expected_lag = fp_wrapper{{0, 42, 1729, 55, 343, 1}, {0, 1, 0, 1, 0, 1}, scale};
diff --git a/cpp/tests/round/round_tests.cpp
index 1a9302a3e7e..6b2febb9b5c 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -284,6 +284,20 @@ TYPED_TEST(RoundTestsFixedPointTypes, SimpleFixedPointTestHalfNegEven3)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(RoundTestsFixedPointTypes, TestForBlog)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const input    = fp_wrapper{{25649999}, scale_type{-5}};
+  auto const expected = fp_wrapper{{256}, scale_type{0}};
+  auto const result   = cudf::round(input);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(RoundTestsFloatingPointTypes, SimpleFloatingPointTestHalfUp0)
 {
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<float>;
@@ -587,6 +601,54 @@ TEST_F(RoundTests, Int64AtBoundaryHalfUp)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected5, result5->view());
 }
 
+TEST_F(RoundTests, FixedPoint128HalfUp)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    auto const input    = fp_wrapper{{-160714515306}, scale_type{-13}};
+    auto const expected = fp_wrapper{{-16071451531}, scale_type{-12}};
+    auto const result   = cudf::round(input, 12, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, FixedPointAtBoundaryTestHalfUp)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const m = std::numeric_limits<RepType>::max();  // 170141183460469231731687303715884105727
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 100000}, scale_type{5}};
+    auto const result   = cudf::round(input, -5, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 100000000000}, scale_type{11}};
+    auto const result   = cudf::round(input, -11, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const input    = fp_wrapper{{m}, scale_type{0}};
+    auto const expected = fp_wrapper{{m / 1000000000000000}, scale_type{15}};
+    auto const result   = cudf::round(input, -15, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
 TEST_F(RoundTests, BoolTestHalfUp)
 {
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<bool>;
diff --git a/cpp/tests/search/search_test.cpp
index 79d992005d8..41bc0af20d9 100644
--- a/cpp/tests/search/search_test.cpp
+++ b/cpp/tests/search/search_test.cpp
@@ -1817,12 +1817,12 @@ TEST_F(SearchTest, multi_contains_empty_input_set_string)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointLowerBound)
+TYPED_TEST(FixedPointTestAllReps, FixedPointLowerBound)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -1846,7 +1846,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointLowerBound)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect);
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointUpperBound)
+TYPED_TEST(FixedPointTestAllReps, FixedPointUpperBound)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/sort/sort_test.cpp
index 54cd97301a8..af13c35acfb 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -673,14 +673,14 @@ TEST_F(SortByKey, ValueKeysSizeMismatch)
 }
 
 template <typename T>
-struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };
 
 template <typename T>
 using wrapper = cudf::test::fixed_width_column_wrapper<T>;
 
-TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointSortedOrderGather)
+TYPED_TEST(FixedPointTestAllReps, FixedPointSortedOrderGather)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
diff --git a/cpp/tests/strings/fixed_point_tests.cpp
index b614b3b49fe..7c188d39f6f 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -84,6 +85,76 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVeryLarge)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
+TEST_F(StringsConvertTest, ToFixedPointDecimal128)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const strings = cudf::test::strings_column_wrapper(
+    {"1234000000000000000000",
+     "-876000000000000000000",
+     "5432e+17",
+     "-12E016",
+     "250000000000000000",
+     "-2800000000000000",
+     "",
+     "-0.0",
+     "170141183460469231731687303715884105727",
+     "17014118346046923173168730371588410572700000000000000000000"});
+
+  auto const scale    = scale_type{20};
+  auto const type     = cudf::data_type{cudf::type_to_id<decimal128>(), scale};
+  auto const results  = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type);
+  auto const max      = cuda::std::numeric_limits<__int128_t>::max();
+  auto const expected = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0, 1701411834604692317, max}, scale};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+}
+
+TEST_F(StringsConvertTest, FromFixedPointDecimal128)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto constexpr max = cuda::std::numeric_limits<__int128_t>::max();
+
+  {
+    auto const input    = fp_wrapper{{110, max}, numeric::scale_type{-2}};
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"1.10", "1701411834604692317316873037158841057.27"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input    = fp_wrapper{{max}, numeric::scale_type{-38}};
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"1.70141183460469231731687303715884105727"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input    = fp_wrapper({110, max}, numeric::scale_type{2});
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected =
+      cudf::test::strings_column_wrapper({"11000", "17014118346046923173168730371588410572700"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+
+  {
+    auto const input    = fp_wrapper({-222}, numeric::scale_type{0});
+    auto results        = cudf::strings::from_fixed_point(input);
+    auto const expected = cudf::test::strings_column_wrapper({"-222"});
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  }
+}
+
 TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVerySmall)
 {
   using DecimalType = TypeParam;
@@ -182,46 +253,76 @@ TEST_F(StringsConvertTest, IsFixedPoint)
                                     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{1}});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  cudf::test::strings_column_wrapper big_numbers({
-    "2147483647",
-    "-2147483647",
-    "2147483648",
-    "9223372036854775807",
-    "-9223372036854775807",
-    "9223372036854775808",
-    "9223372036854775808000",
-    "100E2147483648",
-  });
+  cudf::test::strings_column_wrapper big_numbers({"2147483647",
+                                                  "-2147483647",
+                                                  "2147483648",
+                                                  "9223372036854775807",
+                                                  "-9223372036854775807",
+                                                  "9223372036854775808",
+                                                  "9223372036854775808000",
+                                                  "100E2147483648",
+                                                  "170141183460469231731687303715884105727",
+                                                  "170141183460469231731687303715884105728"});
 
   results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                           cudf::data_type{cudf::type_id::DECIMAL32});
   auto const expected32 = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, false, false, false, false, false, false});
+    {true, true, false, false, false, false, false, false, false, false});
  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32);
 
   results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                           cudf::data_type{cudf::type_id::DECIMAL64});
   auto const expected64 = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, true, true, false, false, false});
+    {true, true, true, true, true, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
+  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+                                          cudf::data_type{cudf::type_id::DECIMAL128});
+  auto const expected128 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, true, false, true, false});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected128);
+
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}});
   auto const expected32_scaled = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, true, true, true, false, false});
+    {true, true, true, true, true, true, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{10}});
-  auto const expected64_scaled_positive =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, true, false});
+  auto const expected64_scaled_positive = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, true, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled_positive);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}});
   auto const expected64_scaled = cudf::test::fixed_width_column_wrapper<bool>(
-    {true, true, true, false, false, false, false, false});
+    {true, true, true, false, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled);
 }
+
+TEST_F(StringsConvertTest, FixedPointStringConversionOperator)
+{
+  auto const max = cuda::std::numeric_limits<__int128_t>::max();
+
+  auto const x = numeric::decimal128{max, numeric::scale_type{-10}};
+  EXPECT_EQ(static_cast<std::string>(x), "17014118346046923173168730371.5884105727");
+
+  auto const y = numeric::decimal128{max, numeric::scale_type{10}};
+  EXPECT_EQ(static_cast<std::string>(y), "170141183460469231731687303710000000000");
+
+  auto const z = numeric::decimal128{numeric::scaled_integer<__int128_t>{max, numeric::scale_type{10}}};
+  EXPECT_EQ(static_cast<std::string>(z), "1701411834604692317316873037158841057270000000000");
+
+  auto const a = numeric::decimal128{numeric::scaled_integer<__int128_t>{max, numeric::scale_type{40}}};
+  EXPECT_EQ(static_cast<std::string>(a),
+            "1701411834604692317316873037158841057270000000000000000000000000000000000000000");
+
+  auto const b = numeric::decimal128{numeric::scaled_integer<__int128_t>{max, numeric::scale_type{-20}}};
+  EXPECT_EQ(static_cast<std::string>(b), "1701411834604692317.31687303715884105727");
+
+  auto const c = numeric::decimal128{numeric::scaled_integer<__int128_t>{max, numeric::scale_type{-38}}};
+  EXPECT_EQ(static_cast<std::string>(c), "1.70141183460469231731687303715884105727");
+}
\ No newline at end of file
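The conversion-operator test above is the quickest way to read a decimal128: the string form is just the __int128_t rep with the decimal point placed (or zeros appended) according to the scale. A standalone sketch with small illustrative values, not part of the patch:

#include <cudf/fixed_point/fixed_point.hpp>
#include <cassert>
#include <string>

void to_string_sketch()
{
  using namespace numeric;
  auto const x = decimal128{scaled_integer<__int128_t>{12345, scale_type{-2}}};
  assert(static_cast<std::string>(x) == "123.45");   // negative scale places the point
  auto const y = decimal128{scaled_integer<__int128_t>{12345, scale_type{2}}};
  assert(static_cast<std::string>(y) == "1234500");  // positive scale appends zeros
}

diff --git a/cpp/tests/transform/row_bit_count_test.cu
index f718fbfc57b..4645ff9be5f 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -164,7 +164,6 @@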
TYPED_TEST(RowBitCountTyped, ListsWithNulls) ((4 + 0) * CHAR_BIT) + (type_size * 0), ((4 + 4) * CHAR_BIT) + (type_size * 1) + 2, ((4 + 8) * CHAR_BIT) + (type_size * 3) + 5}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); } diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index b57ccdd816a..4d0009ab20a 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -784,67 +784,130 @@ TYPED_TEST(FixedPointTests, FixedPointToFixedPointSameTypeidDownPositive) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeid) +TYPED_TEST(FixedPointTests, Decimal32ToDecimalXX) { using namespace numeric; - using decimalA = TypeParam; - using RepTypeA = cudf::device_storage_type_t; - using RepTypeB = std::conditional_t, int64_t, int32_t>; - using fp_wrapperA = cudf::test::fixed_point_column_wrapper; - using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + using decimalXX = TypeParam; + using RepTypeFrom = int32_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal64ToDecimalXX) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = int64_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal128ToDecimalXX) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = __int128_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithSmallerScale) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = int32_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-5)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithSmallerScale) +{ + using namespace 
numeric; + using decimalXX = TypeParam; + using RepTypeFrom = int64_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; - auto const input = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}}; - auto const expected = fp_wrapperA{{1729, 17290, 172900, 1729000}, scale_type{-3}}; - auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-5)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidDown) +TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithSmallerScale) { using namespace numeric; - using decimalA = TypeParam; - using RepTypeA = cudf::device_storage_type_t; - using RepTypeB = std::conditional_t, int64_t, int32_t>; - using fp_wrapperA = cudf::test::fixed_point_column_wrapper; - using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + using decimalXX = TypeParam; + using RepTypeFrom = __int128_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; - auto const input = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}}; - auto const expected = fp_wrapperA{{172900, 1729000, 17290000, 172900000}, scale_type{-5}}; - auto const result = cudf::cast(input, make_fixed_point_data_type(-5)); + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{172900, 1729000, 17290000, 172900000}, scale_type{-5}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-5)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidUp) +TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScale) { using namespace numeric; - using decimalA = TypeParam; - using RepTypeA = cudf::device_storage_type_t; - using RepTypeB = std::conditional_t, int64_t, int32_t>; - using fp_wrapperA = cudf::test::fixed_point_column_wrapper; - using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + using decimalXX = TypeParam; + using RepTypeFrom = int32_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; - auto const input = fp_wrapperB{{1729, 17290, 172900, 1729000}, scale_type{-3}}; - auto const expected = fp_wrapperA{{1, 17, 172, 1729}, scale_type{0}}; - auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } -TYPED_TEST(FixedPointTests, FixedPointToFixedPointDifferentTypeidUpNullMask) +TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithLargerScale) { using namespace numeric; - using decimalA = TypeParam; - using RepTypeA = cudf::device_storage_type_t; - using RepTypeB = std::conditional_t, int64_t, int32_t>; - using fp_wrapperA = 
cudf::test::fixed_point_column_wrapper; - using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + using decimalXX = TypeParam; + using RepTypeFrom = int64_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; - auto const vec = std::vector{1729, 17290, 172900, 1729000}; - auto const input = fp_wrapperB{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}}; - auto const expected = fp_wrapperA{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}}; - auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -858,7 +921,6 @@ TEST_F(FixedPointTestSingleType, AvoidOverflowDecimal32ToDecimal64) auto const input = fp_wrapper32{{9999999}, scale_type{3}}; auto const expected = fp_wrapper64{{9999999}, scale_type{3}}; auto const result = cudf::cast(input, make_fixed_point_data_type(3)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -874,3 +936,70 @@ TEST_F(FixedPointTestSingleType, AvoidOverflowDecimal32ToInt64) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } + +TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = __int128_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperFrom{{1729, 17290, 172900, 1729000}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = int32_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const vec = std::vector{1729, 17290, 172900, 1729000}; + auto const input = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithLargerScaleAndNullMask) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = int64_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const vec = std::vector{1729, 17290, 172900, 1729000}; + auto const input = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + 
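+// Rescaling note for the null-mask casts above and below: going from scale -3 to
+// scale 0 divides each unscaled value by 10^3 and truncates toward zero, so 1729
+// (1.729) becomes 1 and 172900 (172.900) becomes 172, while rows that are null in
+// the input stay null in the output.
+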
+TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScaleAndNullMask) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepTypeFrom = __int128_t; + using RepTypeTo = cudf::device_storage_type_t; + using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper; + using fp_wrapperTo = cudf::test::fixed_point_column_wrapper; + + auto const vec = std::vector{1729, 17290, 172900, 1729000}; + auto const input = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}}; + auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index c855f112692..f0450483345 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -36,6 +36,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ + void set_decimal128_columns(vector[string] val) except+ @staticmethod orc_reader_options_builder builder( @@ -57,6 +58,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ + orc_reader_options_builder& decimal128_columns( + vector[string] val + ) except+ orc_reader_options build() except+ From e08ae9cb15fe260015cf70a22181fa67123e779f Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 16 Nov 2021 18:03:14 -0800 Subject: [PATCH 02/72] Implement Series.datetime.floor (#9571) Fixes: #7102 Replaces: [#9488](https://github.com/rapidsai/cudf/pull/9488/files) Authors: - Sheilah Kirui (https://github.com/skirui-source) - Mayank Anand (https://github.com/mayankanand007) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9571 --- cpp/include/cudf/datetime.hpp | 93 ++++++++++- cpp/src/datetime/datetime_ops.cu | 199 ++++++++++++++++++----- cpp/tests/datetime/datetime_ops_test.cpp | 92 ++++++++++- docs/cudf/source/api_docs/series.rst | 2 + python/cudf/cudf/_lib/cpp/datetime.pxd | 17 +- python/cudf/cudf/_lib/datetime.pyx | 33 +++- python/cudf/cudf/core/column/datetime.py | 7 +- python/cudf/cudf/core/series.py | 73 ++++++++- python/cudf/cudf/tests/test_datetime.py | 39 ++++- 9 files changed, 502 insertions(+), 53 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index d67984daa7c..71e5968bf07 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
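The hunk below declares the new `cudf::datetime::floor_*` family, one entry point per resolution, mirroring the existing `ceil_*` functions. A minimal usage sketch (not taken from this patch; it assumes only the public header, a valid timestamp column, and the default memory resource):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <memory>

// Round every timestamp down to the start of its day. The output keeps the
// input column's resolution (e.g. TIMESTAMP_SECONDS in, TIMESTAMP_SECONDS out).
std::unique_ptr<cudf::column> day_start(cudf::column_view const& timestamps)
{
  return cudf::datetime::floor_day(timestamps);
}
```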
@@ -378,5 +378,96 @@ std::unique_ptr ceil_nanosecond( column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Round down to the nearest day + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest hour + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest minute + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest second + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest millisecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest microsecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ccfad56b4ea..717bd7ac0a8 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -54,6 +54,8 @@ enum class datetime_component { NANOSECOND }; +enum class rounding_kind { CEIL, FLOOR }; + template struct extract_component_operator { template @@ -88,32 +90,59 @@ struct extract_component_operator { } }; -template -struct ceil_timestamp { +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round +// function. +template +struct RoundFunctor { + template + CUDA_DEVICE_CALLABLE auto operator()(rounding_kind round_kind, Timestamp dt) + { + switch (round_kind) { + case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); + case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); + default: cudf_assert(false && "Unsupported rounding kind."); + } + __builtin_unreachable(); + } +}; + +struct RoundingDispatcher { + rounding_kind round_kind; + datetime_component component; + + RoundingDispatcher(rounding_kind round_kind, datetime_component component) + : round_kind(round_kind), component(component) + { + } + template CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const { - using namespace cuda::std::chrono; - // want to use this with D, H, T (minute), S, L (millisecond), U - switch (COMPONENT) { + switch (component) { case datetime_component::DAY: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::HOUR: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MINUTE: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::SECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MILLISECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MICROSECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::NANOSECOND: - return time_point_cast(ceil(ts)); - default: cudf_assert(false && "Unexpected resolution"); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + default: cudf_assert(false && "Unsupported datetime rounding resolution."); } - - return {}; + __builtin_unreachable(); } }; @@ -196,10 +225,11 @@ struct is_leap_year_op { }; // Specific function for applying ceil/floor date ops -template -struct dispatch_ceil { +struct dispatch_round { template std::enable_if_t(), std::unique_ptr> operator()( + rounding_kind round_kind, + datetime_component component, cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -221,7 +251,7 @@ struct dispatch_ceil { column.begin(), column.end(), output->mutable_view().begin(), - TransformFunctor{}); + RoundingDispatcher{round_kind, component}); return output; } @@ -384,13 +414,14 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu } } -template 
-std::unique_ptr ceil_general(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr round_general(rounding_kind round_kind, + datetime_component component, + column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - column.type(), dispatch_ceil>{}, column, stream, mr); + column.type(), dispatch_round{}, round_kind, component, column, stream, mr); } std::unique_ptr extract_year(column_view const& column, @@ -498,53 +529,147 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_millisecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_microsecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_nanosecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_minute(column_view const& column, 
rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c0d2d1cc447..b70ac29fd5d 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -357,9 +357,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto host_val = to_host(input); thrust::host_vector timestamps = host_val.first; @@ -403,6 +403,22 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) auto expected_millisecond = fixed_width_column_wrapper( ceiled_millisecond.begin(), ceiled_millisecond.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond); + + std::vector ceiled_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_microsecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + ceiled_microsecond.begin(), ceiled_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_microsecond(input), expected_microsecond); + + std::vector ceiled_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_nanosecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_nanosecond = fixed_width_column_wrapper( + ceiled_nanosecond.begin(), ceiled_nanosecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_nanosecond(input), expected_nanosecond); } TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate) @@ -827,4 +843,76 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_quarter(timestamps_s), quarter); } +TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) +{ + using T = TypeParam; + using 
namespace cudf::test; + using namespace cudf::datetime; + using namespace cuda::std::chrono; + + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + + auto host_val = to_host(input); + thrust::host_vector timestamps = host_val.first; + + std::vector floored_day(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_day.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_day = fixed_width_column_wrapper(floored_day.begin(), + floored_day.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_day(input), expected_day); + + std::vector floored_hour(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_hour.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_hour = fixed_width_column_wrapper( + floored_hour.begin(), floored_hour.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_hour(input), expected_hour); + + std::vector floored_minute(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_minute.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_minute = fixed_width_column_wrapper( + floored_minute.begin(), floored_minute.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_minute(input), expected_minute); + + std::vector floored_second(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_second.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_second = fixed_width_column_wrapper( + floored_second.begin(), floored_second.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_second(input), expected_second); + + std::vector floored_millisecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_millisecond.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_millisecond = fixed_width_column_wrapper( + floored_millisecond.begin(), floored_millisecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_millisecond); + + std::vector floored_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_microsecond.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + floored_microsecond.begin(), floored_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_microsecond(input), expected_microsecond); + + std::vector floored_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), floored_nanosecond.begin(), [](auto i) { + return time_point_cast(floor(i)); + }); + auto expected_nanosecond = fixed_width_column_wrapper( + floored_nanosecond.begin(), floored_nanosecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_nanosecond(input), expected_nanosecond); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 46a31a0dcf6..b90ee628332 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -300,6 +300,8 @@ Datetime methods strftime isocalendar + ceil + floor Timedelta properties diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 2af4dd648c5..38ed9fbd769 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ 
b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -23,7 +23,22 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] ceil_microsecond( const column_view& column ) except + - cdef unique_ptr[column] ceil_nanosecond(const column_view& column) except + + cdef unique_ptr[column] ceil_nanosecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_day(const column_view& column) except + + cdef unique_ptr[column] floor_hour(const column_view& column) except + + cdef unique_ptr[column] floor_minute(const column_view& column) except + + cdef unique_ptr[column] floor_second(const column_view& column) except + + cdef unique_ptr[column] floor_millisecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_microsecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_nanosecond( + const column_view& column + ) except + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 5cda06362b6..3215088c438 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -72,13 +72,13 @@ def ceil_datetime(Column col, object field): c_result = move(libcudf_datetime.ceil_day(col_view)) elif field == "H": c_result = move(libcudf_datetime.ceil_hour(col_view)) - elif field == "T": + elif field == "T" or field == "min": c_result = move(libcudf_datetime.ceil_minute(col_view)) elif field == "S": c_result = move(libcudf_datetime.ceil_second(col_view)) - elif field == "L": + elif field == "L" or field == "ms": c_result = move(libcudf_datetime.ceil_millisecond(col_view)) - elif field == "U": + elif field == "U" or field == "us": c_result = move(libcudf_datetime.ceil_microsecond(col_view)) elif field == "N": c_result = move(libcudf_datetime.ceil_nanosecond(col_view)) @@ -89,6 +89,33 @@ def ceil_datetime(Column col, object field): return result +def floor_datetime(Column col, object field): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html + if field == "D": + c_result = move(libcudf_datetime.floor_day(col_view)) + elif field == "H": + c_result = move(libcudf_datetime.floor_hour(col_view)) + elif field == "T" or field == "min": + c_result = move(libcudf_datetime.floor_minute(col_view)) + elif field == "S": + c_result = move(libcudf_datetime.floor_second(col_view)) + elif field == "L" or field == "ms": + c_result = move(libcudf_datetime.floor_millisecond(col_view)) + elif field == "U" or field == "us": + c_result = move(libcudf_datetime.floor_microsecond(col_view)) + elif field == "N": + c_result = move(libcudf_datetime.floor_nanosecond(col_view)) + else: + raise ValueError(f"Invalid resolution: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) + return result + + def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 68379002e6b..756e48edccb 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -222,8 +222,11 @@ def values(self): def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) - def ceil(self, field: str) -> ColumnBase: - return 
libcudf.datetime.ceil_datetime(self, field) + def ceil(self, freq: str) -> ColumnBase: + return libcudf.datetime.ceil_datetime(self, freq) + + def floor(self, freq: str) -> ColumnBase: + return libcudf.datetime.floor_datetime(self, freq) def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 00a8ebabe34..c804f2bca2c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4592,11 +4592,76 @@ def _get_dt_field(self, field): data=out_column, index=self.series._index, name=self.series.name ) - def ceil(self, field): - out_column = self.series._column.ceil(field) + def ceil(self, freq): + """ + Perform ceil operation on the data to the specified freq. - return Series( - data=out_column, index=self.series._index, name=self.series.name + Parameters + ---------- + freq : str + One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. + Must be a fixed frequency like 'S' (second) not 'ME' (month end). + See `frequency aliases `__ + for more details on these aliases. + + Returns + ------- + Series + Series with all timestamps rounded up to the specified frequency. + The index is preserved. + + Examples + -------- + >>> import cudf + >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58", + ... "2001-01-01 00:05:04"], dtype="datetime64[ns]") + >>> t.dt.ceil("T") + 0 2001-01-01 00:05:00 + 1 2001-01-01 00:05:00 + 2 2001-01-01 00:06:00 + dtype: datetime64[ns] + """ + out_column = self.series._column.ceil(freq) + + return Series._from_data( + data={self.series.name: out_column}, index=self.series._index + ) + + def floor(self, freq): + """ + Perform floor operation on the data to the specified freq. + + Parameters + ---------- + freq : str + One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. + Must be a fixed frequency like 'S' (second) not 'ME' (month end). + See `frequency aliases `__ + for more details on these aliases. + + Returns + ------- + Series + Series with all timestamps rounded down to the specified frequency. + The index is preserved. + + Examples + -------- + >>> import cudf + >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58", + ... 
"2001-01-01 00:05:04"], dtype="datetime64[ns]") + >>> t.dt.floor("T") + 0 2001-01-01 00:04:00 + 1 2001-01-01 00:04:00 + 2 2001-01-01 00:05:00 + dtype: datetime64[ns] + """ + out_column = self.series._column.floor(freq) + + return Series._from_data( + data={self.series.name: out_column}, index=self.series._index ) def strftime(self, date_format, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..bf75badc06f 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1777,12 +1777,45 @@ def test_error_values(): ], ) @pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize("resolution", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) def test_ceil(data, time_type, resolution): - ps = pd.Series(data, dtype=time_type) - gs = cudf.from_pandas(ps) + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() expect = ps.dt.ceil(resolution) got = gs.dt.ceil(resolution) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + ) + ], +) +@pytest.mark.parametrize("time_type", DATETIME_TYPES) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_floor(data, time_type, resolution): + + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() + + expect = ps.dt.floor(resolution) + got = gs.dt.floor(resolution) + assert_eq(expect, got) From 4d13d81bb04a51a1ad7f476184c2b1eb88038126 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 16 Nov 2021 21:11:28 -0800 Subject: [PATCH 03/72] Fixed build by adding more checks for int8, int16 (#9707) Add additional checks for int8, int16 fixes [#/rapidsai/cudf/4127](https://github.com/NVIDIA/spark-rapids/issues/4127) Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9707 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 329c251f72d..729444f460c 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -398,12 +398,13 @@ public final ColumnVector isInteger() { * for null entries. * * @param intType the data type that should be used for bounds checking. Note that only - * integer types are allowed. 
+ * cudf integer types are allowed including signed/unsigned int8 through int64 * @return Boolean vector */ public final ColumnVector isInteger(DType intType) { assert type.equals(DType.STRING); - assert intType.isBackedByInt() || intType.isBackedByLong(); + assert intType.isBackedByInt() || intType.isBackedByLong() || intType.isBackedByByte() + || intType.isBackedByShort(); return new ColumnVector(isIntegerWithType(getNativeView(), intType.getTypeId().getNativeId(), intType.getScale())); } From 9aefbc27508bff8b8a2b41fabb06594bc4c5832b Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 17 Nov 2021 07:52:48 -0600 Subject: [PATCH 04/72] Java Support for Decimal 128 (#9485) This depends on #9483 There may be a few more changes coming to this, but it should be fairly complete --- .../java/ai/rapids/cudf/BinaryOperable.java | 33 +- .../java/ai/rapids/cudf/ColumnVector.java | 13 + java/src/main/java/ai/rapids/cudf/DType.java | 17 +- .../java/ai/rapids/cudf/HostColumnVector.java | 40 +- .../ai/rapids/cudf/HostColumnVectorCore.java | 49 +- .../main/java/ai/rapids/cudf/ORCOptions.java | 31 +- java/src/main/java/ai/rapids/cudf/Scalar.java | 61 ++- java/src/main/java/ai/rapids/cudf/Table.java | 26 +- java/src/main/native/src/ScalarJni.cpp | 31 ++ java/src/main/native/src/TableJni.cpp | 5 +- java/src/main/native/src/dtype_utils.hpp | 3 +- .../java/ai/rapids/cudf/BinaryOpTest.java | 44 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 63 +++ .../rapids/cudf/DecimalColumnVectorTest.java | 17 +- .../test/java/ai/rapids/cudf/ScalarTest.java | 11 +- .../test/java/ai/rapids/cudf/TableTest.java | 489 ++++++++++++++---- 16 files changed, 784 insertions(+), 149 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java index 68213c21956..2b1afb92e36 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java @@ -80,25 +80,22 @@ static DType implicitConversion(BinaryOp op, BinaryOperable lhs, BinaryOperable return DType.BOOL8; } if (a.isDecimalType() && b.isDecimalType()) { - // Here scale is created with value 0 as `scale` is required to create DType of - // decimal type. Dtype is discarded for binary operations for decimal types in cudf as a new - // DType is created for output type with new scale. New scale for output depends upon operator. - int scale = 0; - if (a.typeId == DType.DTypeEnum.DECIMAL32) { - if (b.typeId == DType.DTypeEnum.DECIMAL32) { - return DType.create(DType.DTypeEnum.DECIMAL32, - ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); - } else { - throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); - } - } else if (a.typeId == DType.DTypeEnum.DECIMAL64) { - if (b.typeId == DType.DTypeEnum.DECIMAL64) { - return DType.create(DType.DTypeEnum.DECIMAL64, - ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); - } else { - throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); - } + if (a.typeId != b.typeId) { + throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); } + final int scale = ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType()); + // The output precision/size should be at least as large as the input. + // It may be larger if room is needed for it based off of the output scale. 
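+      // For example, DECIMAL32 inputs promote to DECIMAL64, and DECIMAL64 inputs to
+      // DECIMAL128, when the computed output scale no longer fits the narrower type;
+      // DECIMAL128 inputs always produce a DECIMAL128 result.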
+ final DType.DTypeEnum outputEnum; + if (scale <= DType.DECIMAL32_MAX_PRECISION && a.typeId == DType.DTypeEnum.DECIMAL32) { + outputEnum = DType.DTypeEnum.DECIMAL32; + } else if (scale <= DType.DECIMAL64_MAX_PRECISION && + (a.typeId == DType.DTypeEnum.DECIMAL32 || a.typeId == DType.DTypeEnum.DECIMAL64)) { + outputEnum = DType.DTypeEnum.DECIMAL64; + } else { + outputEnum = DType.DTypeEnum.DECIMAL128; + } + return DType.create(outputEnum, scale); } throw new IllegalArgumentException("Unsupported types " + a + " and " + b); } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 7eb51a52a7d..3fed6316215 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -1391,6 +1392,18 @@ public static ColumnVector decimalFromDoubles(DType type, RoundingMode mode, dou } } + + /** + * Create a new decimal vector from BigIntegers + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static ColumnVector decimalFromBigInt(int scale, BigInteger... values) { + try (HostColumnVector host = HostColumnVector.decimalFromBigIntegers(scale, values)) { + ColumnVector columnVector = host.copyToDevice(); + return columnVector; + } + } + /** * Create a new string vector from the given values. This API * supports inline nulls. This is really intended to be used only for testing as diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 2d851aa2ae3..742501be375 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -23,6 +23,7 @@ public final class DType { public static final int DECIMAL32_MAX_PRECISION = 9; public static final int DECIMAL64_MAX_PRECISION = 18; + public static final int DECIMAL128_MAX_PRECISION = 38; /* enum representing various types. 
Whenever a new non-decimal type is added please make sure below sections are updated as well: @@ -77,7 +78,8 @@ public enum DTypeEnum { LIST(0, 24), DECIMAL32(4, 25), DECIMAL64(8, 26), - STRUCT(0, 27); + DECIMAL128(16, 27), + STRUCT(0, 28); final int sizeInBytes; final int nativeId; @@ -167,6 +169,7 @@ private DType(DTypeEnum id, int decimalScale) { LIST, null, // DECIMAL32 null, // DECIMAL64 + null, // DECIMAL128 STRUCT }; @@ -276,6 +279,13 @@ public static DType fromNative(int nativeId, int scale) { } return new DType(DTypeEnum.DECIMAL64, scale); } + if (nativeId == DTypeEnum.DECIMAL128.nativeId) { + if (-scale > DECIMAL128_MAX_PRECISION) { + throw new IllegalArgumentException( + "Scale " + (-scale) + " exceeds DECIMAL128_MAX_PRECISION " + DECIMAL128_MAX_PRECISION); + } + return new DType(DTypeEnum.DECIMAL128, scale); + } } throw new IllegalArgumentException("Could not translate " + nativeId + " into a DType"); } @@ -293,6 +303,8 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL32, -dec.scale()); } else if (dec.precision() <= DECIMAL64_MAX_PRECISION) { return new DType(DTypeEnum.DECIMAL64, -dec.scale()); + } else if (dec.precision() <= DECIMAL128_MAX_PRECISION) { + return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); @@ -450,7 +462,8 @@ public boolean hasOffsets() { private static final EnumSet DECIMALS = EnumSet.of( DTypeEnum.DECIMAL32, - DTypeEnum.DECIMAL64 + DTypeEnum.DECIMAL64, + DTypeEnum.DECIMAL128 ); private static final EnumSet NESTED_TYPE = EnumSet.of( diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 46255428c1c..e21a4ac81c6 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -525,6 +525,23 @@ public static HostColumnVector decimalFromBoxedLongs(int scale, Long... values) }); } + /** + * Create a new decimal vector from unscaled values (BigInteger array) and scale. + * The created vector is of type DType.DECIMAL128. + * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning. + */ + public static HostColumnVector decimalFromBigIntegers(int scale, BigInteger... values) { + return build(DType.create(DType.DTypeEnum.DECIMAL128, scale), values.length, (b) -> { + for (BigInteger v : values) { + if (v == null) { + b.appendNull(); + } else { + b.appendUnscaledDecimal(v); + } + } + }); + } + + /** + * Create a new decimal vector from double floats with specific DecimalType and RoundingMode. + * All doubles will be rescaled if necessary, according to scale of input DecimalType and RoundingMode. 
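As the `decimalFromBigIntegers` documentation above notes, the cudf scale is the negation of `java.math.BigDecimal`'s scale. A minimal libcudf C++ sketch of the same convention, assuming only the public `fixed_point` header:

```cpp
#include <cudf/fixed_point/fixed_point.hpp>

#include <string>

int main()
{
  using namespace numeric;
  // Unscaled value 123456 at cudf scale -2 represents 123456 * 10^-2, i.e. 1234.56,
  // the value java.math.BigDecimal would describe with a (positive) scale of 2.
  auto const d = decimal128{scaled_integer<__int128_t>{123456, scale_type{-2}}};
  return static_cast<std::string>(d) == "1234.56" ? 0 : 1;
}
```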
@@ -1222,7 +1239,12 @@ public final ColumnBuilder append(BigDecimal value) { data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); - } else { + } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { + assert currentIndex < rows; + byte[] unscaledValueBytes = value.unscaledValue().toByteArray(); + byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); + data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } currentIndex++; @@ -1450,7 +1472,7 @@ public final Builder append(BigDecimal value) { */ public final Builder append(BigDecimal value, RoundingMode roundingMode) { assert type.isDecimalType(); - assert currentIndex < rows; + assert currentIndex < rows: "appended too many values " + currentIndex + " out of total rows " + rows; BigInteger unscaledValue = value.setScale(-type.getScale(), roundingMode).unscaledValue(); if (type.typeId == DType.DTypeEnum.DECIMAL32) { assert value.precision() <= DType.DECIMAL32_MAX_PRECISION : "value exceeds maximum precision for DECIMAL32"; @@ -1458,6 +1480,10 @@ public final Builder append(BigDecimal value, RoundingMode roundingMode) { } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { assert value.precision() <= DType.DECIMAL64_MAX_PRECISION : "value exceeds maximum precision for DECIMAL64 "; data.setLong(currentIndex * type.getSizeInBytes(), unscaledValue.longValueExact()); + } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { + assert value.precision() <= DType.DECIMAL128_MAX_PRECISION : "value exceeds maximum precision for DECIMAL128 "; + appendUnscaledDecimal(value.unscaledValue()); + return this; } else { throw new IllegalStateException(type + " is not a supported decimal type."); } @@ -1481,6 +1507,16 @@ public final Builder appendUnscaledDecimal(long value) { return this; } + public final Builder appendUnscaledDecimal(BigInteger value) { + assert type.typeId == DType.DTypeEnum.DECIMAL128; + assert currentIndex < rows; + byte[] unscaledValueBytes = value.toByteArray(); + byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); + data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); + currentIndex++; + return this; + } + public Builder append(String value) { assert value != null : "appendNull must be used to append null strings"; return appendUTF8String(value.getBytes(StandardCharsets.UTF_8)); diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index e4fb71033af..dd07df16553 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -22,6 +22,8 @@ import org.slf4j.LoggerFactory; import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -341,6 +343,13 @@ public final BigDecimal getBigDecimal(long index) { } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { long unscaledValue = offHeap.data.getLong(index * type.getSizeInBytes()); return BigDecimal.valueOf(unscaledValue, -type.getScale()); + } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { + int sizeInBytes = DType.DTypeEnum.DECIMAL128.sizeInBytes; + 
byte[] dst = new byte[sizeInBytes]; + // We need to switch the endianness for decimal128 byte arrays between java and native code. + offHeap.data.getBytes(dst, 0, (index * sizeInBytes), sizeInBytes); + convertInPlaceToBigEndian(dst); + return new BigDecimal(new BigInteger(dst), -type.getScale()); } else { throw new IllegalStateException(type + " is not a supported decimal type."); } @@ -534,6 +543,34 @@ public String toString() { '}'; } + protected static byte[] convertDecimal128FromJavaToCudf(byte[] bytes) { + byte[] finalBytes = new byte[DType.DTypeEnum.DECIMAL128.sizeInBytes]; + byte lastByte = bytes[0]; + //Convert to 2's complement representation and make sure the sign bit is extended correctly + byte setByte = (lastByte & 0x80) > 0 ? (byte)0xff : (byte)0x00; + for(int i = bytes.length; i < finalBytes.length; i++) { + finalBytes[i] = setByte; + } + // After setting the sign bits, reverse the rest of the bytes for endianness + for(int k = 0; k < bytes.length; k++) { + finalBytes[k] = bytes[bytes.length - k - 1]; + } + return finalBytes; + } + + private void convertInPlaceToBigEndian(byte[] dst) { + assert ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); + int i =0; + int j = dst.length -1; + while (j > i) { + byte tmp; + tmp = dst[j]; + dst[j] = dst[i]; + dst[i] = tmp; + j--; + i++; + } + } ///////////////////////////////////////////////////////////////////////////// // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// @@ -557,15 +594,9 @@ protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { boolean neededCleanup = false; if (data != null || valid != null || offsets != null) { try { - if (data != null) { - data.close(); - } - if (offsets != null) { - offsets.close(); - } - if (valid != null) { - valid.close(); - } + ColumnVector.closeBuffers(data); + ColumnVector.closeBuffers(offsets); + ColumnVector.closeBuffers(valid); } finally { // Always mark the resource as freed even if an exception is thrown. // We cannot know how far it progressed before the exception, and diff --git a/java/src/main/java/ai/rapids/cudf/ORCOptions.java b/java/src/main/java/ai/rapids/cudf/ORCOptions.java index 359a6b96628..2ff253060f0 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ORCOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
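The Java-side ORC reader options in the hunks below expose libcudf's `decimal128_columns` setting (declared for Cython in the `orc.pxd` hunk of the first patch). A hedged C++ sketch of the underlying API, where the file name `example.orc` and the column name `price` are illustrative assumptions:

```cpp
#include <cudf/io/orc.hpp>

#include <string>
#include <vector>

// Read an ORC file, decoding the named decimal column as DECIMAL128 instead of
// the DECIMAL64 default ("example.orc" and "price" are hypothetical names).
cudf::io::table_with_metadata read_with_decimal128()
{
  auto const source  = cudf::io::source_info{"example.orc"};
  auto const options = cudf::io::orc_reader_options::builder(source)
                         .decimal128_columns(std::vector<std::string>{"price"})
                         .build();
  return cudf::io::read_orc(options);
}
```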
@@ -18,6 +18,10 @@ package ai.rapids.cudf; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + /** * Options for reading a ORC file */ @@ -27,9 +31,11 @@ public class ORCOptions extends ColumnFilterOptions { private final boolean useNumPyTypes; private final DType unit; + private final String[] decimal128Columns; private ORCOptions(Builder builder) { super(builder); + decimal128Columns = builder.decimal128Columns.toArray(new String[0]); useNumPyTypes = builder.useNumPyTypes; unit = builder.unit; } @@ -42,6 +48,10 @@ DType timeUnit() { return unit; } + String[] getDecimal128Columns() { + return decimal128Columns; + } + public static Builder builder() { return new Builder(); } @@ -50,6 +60,8 @@ public static class Builder extends ColumnFilterOptions.Builder { private boolean useNumPyTypes = true; private DType unit = DType.EMPTY; + final List decimal128Columns = new ArrayList<>(); + /** * Specify whether the parser should implicitly promote TIMESTAMP_DAYS * columns to TIMESTAMP_MILLISECONDS for compatibility with NumPy. @@ -73,6 +85,23 @@ public ORCOptions.Builder withTimeUnit(DType unit) { return this; } + /** + * Specify decimal columns which shall be read as DECIMAL128. Otherwise, decimal columns + * will be read as DECIMAL64 by default in ORC. + * + * In terms of child columns of nested types, their parents need to be prepended as prefix + * of the column name, in case of ambiguity. For struct columns, the names of child columns + * are formatted as `{struct_col_name}.{child_col_name}`. For list columns, the data(child) + * columns are named as `{list_col_name}.1`. + * + * @param names names of columns which read as DECIMAL128 + * @return builder for chaining + */ + public Builder decimal128Column(String... names) { + decimal128Columns.addAll(Arrays.asList(names)); + return this; + } + public ORCOptions build() { return new ORCOptions(this); } } } diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 631f091005a..03e77573695 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -22,6 +22,8 @@ import org.slf4j.LoggerFactory; import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; @@ -86,6 +88,8 @@ public static Scalar fromNull(DType type) { return new Scalar(type, makeDecimal32Scalar(0, type.getScale(), false)); case DECIMAL64: return new Scalar(type, makeDecimal64Scalar(0L, type.getScale(), false)); + case DECIMAL128: + return new Scalar(type, makeDecimal128Scalar(BigInteger.ZERO.toByteArray(), type.getScale(), false)); case LIST: throw new IllegalArgumentException("Please call 'listFromNull' to create a null list scalar."); default: @@ -227,6 +231,13 @@ public static Scalar fromDecimal(int scale, long unscaledValue) { return new Scalar(DType.create(DType.DTypeEnum.DECIMAL64, scale), handle); } + public static Scalar fromDecimal(int scale, BigInteger unscaledValue) { + byte[] unscaledValueBytes = unscaledValue.toByteArray(); + byte[] finalBytes = convertDecimal128FromJavaToCudf(unscaledValueBytes); + long handle = makeDecimal128Scalar(finalBytes, scale, true); + return new Scalar(DType.create(DType.DTypeEnum.DECIMAL128, scale), handle); + } + public static Scalar fromFloat(Float value) { if (value == null) { return Scalar.fromNull(DType.FLOAT32); @@ -253,8 +264,12 @@ public static Scalar fromDecimal(BigDecimal value) { long 
handle; if (dt.typeId == DType.DTypeEnum.DECIMAL32) { handle = makeDecimal32Scalar(value.unscaledValue().intValueExact(), -value.scale(), true); - } else { + } else if (dt.typeId == DType.DTypeEnum.DECIMAL64) { handle = makeDecimal64Scalar(value.unscaledValue().longValueExact(), -value.scale(), true); + } else { + byte[] unscaledValueBytes = value.unscaledValue().toByteArray(); + byte[] finalBytes = convertDecimal128FromJavaToCudf(unscaledValueBytes); + handle = makeDecimal128Scalar(finalBytes, -value.scale(), true); } return new Scalar(dt, handle); } @@ -470,6 +485,7 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host private static native short getShort(long scalarHandle); private static native int getInt(long scalarHandle); private static native long getLong(long scalarHandle); + private static native byte[] getBigIntegerBytes(long scalarHandle); private static native float getFloat(long scalarHandle); private static native double getDouble(long scalarHandle); private static native byte[] getUTF8(long scalarHandle); @@ -493,6 +509,7 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host private static native long makeTimestampTimeScalar(int dtypeNativeId, long value, boolean isValid); private static native long makeDecimal32Scalar(int value, int scale, boolean isValid); private static native long makeDecimal64Scalar(long value, int scale, boolean isValid); + private static native long makeDecimal128Scalar(byte[] value, int scale, boolean isValid); private static native long makeListScalar(long viewHandle, boolean isValid); private static native long makeStructScalar(long[] viewHandles, boolean isValid); private static native long repeatString(long scalarHandle, int repeatTimes); @@ -579,6 +596,15 @@ public long getLong() { return getLong(getScalarHandle()); } + /** + * Returns the BigDecimal unscaled scalar value as a byte array. + */ + public byte[] getBigInteger() { + byte[] res = getBigIntegerBytes(getScalarHandle()); + convertInPlaceToBigEndian(res); + return res; + } + /** * Returns the scalar value as a float. */ @@ -601,6 +627,8 @@ public BigDecimal getBigDecimal() { return BigDecimal.valueOf(getInt(), -type.getScale()); } else if (this.type.typeId == DType.DTypeEnum.DECIMAL64) { return BigDecimal.valueOf(getLong(), -type.getScale()); + } else if (this.type.typeId == DType.DTypeEnum.DECIMAL128) { + return new BigDecimal(new BigInteger(getBigInteger()), -type.getScale()); } throw new IllegalArgumentException("Couldn't getBigDecimal from nonDecimal scalar"); } @@ -844,6 +872,8 @@ public String toString() { case DECIMAL32: // FALL THROUGH case DECIMAL64: + // FALL THROUGH + case DECIMAL128: sb.append(getBigDecimal()); break; case LIST: @@ -879,6 +909,35 @@ public Scalar repeatString(int repeatTimes) { return new Scalar(DType.STRING, repeatString(getScalarHandle(), repeatTimes)); } + private static byte[] convertDecimal128FromJavaToCudf(byte[] bytes) { + byte[] finalBytes = new byte[DType.DTypeEnum.DECIMAL128.sizeInBytes]; + byte lastByte = bytes[0]; + //Convert to 2's complement representation and make sure the sign bit is extended correctly + byte setByte = (lastByte & 0x80) > 0 ? 
   /**
    * Holds the off-heap state of the scalar so it can be cleaned up, even if it is leaked.
    */
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 68e7a21988a..b0791fb440f 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -27,6 +27,8 @@
 
 import java.io.File;
 import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.math.MathContext;
 import java.math.RoundingMode;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -329,10 +331,12 @@ private static native long writeParquetBufferBegin(String[] columnNames,
    * @param usingNumPyTypes whether the parser should implicitly promote TIMESTAMP
    *                        columns to TIMESTAMP_MILLISECONDS for compatibility with NumPy.
    * @param timeUnit return type of TimeStamp in units
+   * @param decimal128Columns names of the columns to be read as DECIMAL128 rather than DECIMAL64
    */
   private static native long[] readORC(String[] filterColumnNames,
                                        String filePath, long address, long length,
-                                       boolean usingNumPyTypes, int timeUnit) throws CudfException;
+                                       boolean usingNumPyTypes, int timeUnit,
+                                       String[] decimal128Columns) throws CudfException;
 
   /**
    * Setup everything to write ORC formatted data to a file.
@@ -881,7 +885,9 @@ public static Table readORC(File path) {
    */
   public static Table readORC(ORCOptions opts, File path) {
     return new Table(readORC(opts.getIncludeColumnNames(),
-        path.getAbsolutePath(), 0, 0, opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId()));
+        path.getAbsolutePath(), 0, 0,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns()));
   }
 
   /**
@@ -941,8 +947,9 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer,
     assert len <= buffer.getLength() - offset;
     assert offset >= 0 && offset < buffer.length;
     return new Table(readORC(opts.getIncludeColumnNames(),
-        null, buffer.getAddress() + offset, len, opts.usingNumPyTypes(),
-        opts.timeUnit().typeId.getNativeId()));
+        null, buffer.getAddress() + offset, len,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns()));
   }
 
   private static class ParquetTableWriter implements TableWriter {
@@ -3808,6 +3815,16 @@ public TestBuilder decimal64Column(int scale, RoundingMode mode, String... value
       return this;
     }
 
+    public TestBuilder decimal128Column(int scale, RoundingMode mode, BigInteger... values) {
+      types.add(new BasicType(true, DType.create(DType.DTypeEnum.DECIMAL128, scale)));
+      BigDecimal[] data = Arrays.stream(values).map((x) -> {
+        if (x == null) return null;
+        return new BigDecimal(x, scale, new MathContext(38, mode));
+      }).toArray(BigDecimal[]::new);
+      typeErasedData.add(data);
+      return this;
+    }
+
     private static ColumnVector from(DType type, Object dataArray) {
       ColumnVector ret = null;
       switch (type.typeId) {
@@ -3852,6 +3869,7 @@ private static ColumnVector from(DType type, Object dataArray) {
           break;
         case DECIMAL32:
         case DECIMAL64:
+        case DECIMAL128:
           int scale = type.getScale();
           if (dataArray instanceof Integer[]) {
             BigDecimal[] data = Arrays.stream(((Integer[]) dataArray))
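The read path wired through the new `decimal128Columns` native parameter above can then be exercised end to end; a sketch with a hypothetical file and column names:

    ORCOptions opts = ORCOptions.builder()
        .includeColumn("id", "amount")
        .decimal128Column("amount")
        .build();
    try (Table t = Table.readORC(opts, new File("data.orc"))) {
      // "amount" comes back as DECIMAL128 instead of the DECIMAL64 default
    }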
diff --git a/java/src/main/native/src/ScalarJni.cpp b/java/src/main/native/src/ScalarJni.cpp
index fb4f14fdb80..b00b066742a 100644
--- a/java/src/main/native/src/ScalarJni.cpp
+++ b/java/src/main/native/src/ScalarJni.cpp
@@ -109,6 +109,20 @@ JNIEXPORT jdouble JNICALL Java_ai_rapids_cudf_Scalar_getDouble(JNIEnv *env, jcla
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getBigIntegerBytes(JNIEnv *env, jclass,
+                                                                           jlong scalar_handle) {
+  try {
+    cudf::jni::auto_set_device(env);
+    using ScalarType = cudf::scalar_type_t<__int128_t>;
+    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto val = s->value();
+    jbyte const *ptr = reinterpret_cast<jbyte const *>(&val);
+    cudf::jni::native_jbyteArray jbytes{env, ptr, sizeof(__int128_t)};
+    return jbytes.get_jArray();
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getUTF8(JNIEnv *env, jclass,
                                                                 jlong scalar_handle) {
   try {
@@ -455,6 +469,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal64Scalar(JNIEnv *e
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal128Scalar(JNIEnv *env, jclass,
+                                                                        jbyteArray value,
+                                                                        jint scale,
+                                                                        jboolean is_valid) {
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
+    cudf::jni::native_jbyteArray jbytes{env, value};
+    auto const value_ = reinterpret_cast<__int128_t *>(jbytes.data());
+    std::unique_ptr<cudf::scalar> s =
+        cudf::make_fixed_point_scalar<numeric::decimal128>(*value_, scale_);
+    s->set_valid_async(is_valid);
+    return reinterpret_cast<jlong>(s.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(JNIEnv *env, jclass, jlong lhs_ptr,
                                                               jlong rhs_view, jint int_op,
                                                               jint out_dtype, jint scale) {
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c66cf13a5ae..a78d40a58f7 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1455,7 +1455,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
     JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer,
-    jlong buffer_length, jboolean usingNumPyTypes, jint unit) {
+    jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) {
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
@@ -1478,6 +1478,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
 
+    cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
+
    std::unique_ptr<cudf::io::source_info> source;
    if (read_buffer) {
      source.reset(new
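/*
 * Note on the JNI contract above: makeDecimal128Scalar receives exactly 16
 * little-endian two's-complement bytes (the in-memory layout of __int128_t on
 * the supported platforms), which is why the Java side sign-extends and
 * byte-reverses BigInteger.toByteArray() before calling down. A minimal sketch
 * of what the native side effectively does with the payload (variable names
 * here are illustrative):
 *
 *   jbyte bytes[16] = {0};           // 16 LE bytes from Java, here the value 0
 *   __int128_t v;
 *   memcpy(&v, bytes, sizeof(v));    // reinterpret the payload as __int128_t
 *   auto s = cudf::make_fixed_point_scalar<numeric::decimal128>(
 *       v, numeric::scale_type{-2}); // represents v * 10^-2
 */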
cudf::io::source_info(reinterpret_cast(buffer), buffer_length)); @@ -1491,6 +1493,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( .use_index(false) .use_np_dtypes(static_cast(usingNumPyTypes)) .timestamp_type(cudf::data_type(static_cast(unit))) + .decimal128_columns(n_dec128_col_names.as_cpp_vector()) .build(); cudf::io::table_with_metadata result = cudf::io::read_orc(opts); return cudf::jni::convert_table_for_return(env, result.tbl); diff --git a/java/src/main/native/src/dtype_utils.hpp b/java/src/main/native/src/dtype_utils.hpp index 9fae0c585e6..53108ee7268 100644 --- a/java/src/main/native/src/dtype_utils.hpp +++ b/java/src/main/native/src/dtype_utils.hpp @@ -45,7 +45,8 @@ inline cudf::data_type timestamp_to_duration(cudf::data_type dt) { } inline bool is_decimal_type(cudf::type_id n_type) { - return n_type == cudf::type_id::DECIMAL32 || n_type == cudf::type_id::DECIMAL64; + return n_type == cudf::type_id::DECIMAL32 || n_type == cudf::type_id::DECIMAL64 || + n_type == cudf::type_id::DECIMAL128; } // create data_type including scale for decimal type diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index df4afb5ff60..894861b8c44 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.stream.IntStream; @@ -54,7 +55,12 @@ public class BinaryOpTest extends CudfTestBase { private static final int[] DECIMAL32_1 = new int[]{1000, 2000, 3000, 4000, 5000}; private static final int[] DECIMAL32_2 = new int[]{100, 200, 300, 400, 50}; private static final long[] DECIMAL64_1 = new long[]{10L, 23L, 12L, 24L, 123456789L}; - private static final long[] DECIMAL64_2 = new long[]{20L, 13L, 22L, 14L, 132457689L}; + private static final long[] DECIMAL64_2 = new long[]{33041L, 97290L, 36438L, 25379L, 48473L}; + + private static final BigInteger[] DECIMAL128_1 = new BigInteger[]{new BigInteger("1234567891234567"), new BigInteger("1234567891234567"), + new BigInteger("1234567891234567"), new BigInteger("1234567891234567"), new BigInteger("1234567891234567")}; + private static final BigInteger[] DECIMAL128_2 = new BigInteger[]{new BigInteger("234567891234567"), new BigInteger("234567891234567"), + new BigInteger("234567891234567"), new BigInteger("234567891234567"), new BigInteger("234567891234567")}; private static final BigDecimal[] BIGDECIMAL32_1 = new BigDecimal[]{ BigDecimal.valueOf(12, dec32Scale_1), @@ -250,7 +256,9 @@ public void testAdd() { ColumnVector dec32cv1 = ColumnVector.fromDecimals(BIGDECIMAL32_1); ColumnVector dec32cv2 = ColumnVector.fromDecimals(BIGDECIMAL32_2); ColumnVector dec64cv1 = ColumnVector.decimalFromLongs(-dec64Scale_1, DECIMAL64_1); - ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2)) { + ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2); + ColumnVector dec128cv1 = ColumnVector.decimalFromBigInt(-dec64Scale_1, DECIMAL128_1); + ColumnVector dec128cv2 = ColumnVector.decimalFromBigInt(-dec64Scale_2, DECIMAL128_2)) { try (ColumnVector add = icv1.add(icv2); ColumnVector expected = forEach(DType.INT32, icv1, icv2, (b, l, r, i) -> b.append(l.getInt(i) + r.getInt(i)))) { @@ -331,6 +339,14 @@ public void testAdd() { } } + try (ColumnVector add = dec128cv1.add(dec128cv2)) { + try (ColumnVector 
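/*
 * A quick note on the encoding used by these decimal tests: a cudf DType scale
 * of s stores unscaled * 10^s, so, for example,
 *
 *   ColumnVector.decimalFromBigInt(-2, new BigInteger("123456"))
 *
 * holds a single DECIMAL128 value of 1234.56. The expected scale of -6 in the
 * dec128 add/sub blocks follows from the dec64Scale_1 and dec64Scale_2
 * constants defined earlier in this test file.
 */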
expected = forEach( + DType.create(DType.DTypeEnum.DECIMAL128, -6), dec128cv1, dec128cv2, + (b, l, r, i) -> b.append(l.getBigDecimal(i).add(r.getBigDecimal(i))))) { + assertColumnsAreEqual(expected, add, "dec128"); + } + } + try (Scalar s = Scalar.fromDecimal(2, 100); ColumnVector add = dec32cv1.add(s)) { try (ColumnVector expected = forEachS( @@ -381,7 +397,9 @@ public void testSub() { ColumnVector dec32cv1 = ColumnVector.fromDecimals(BIGDECIMAL32_1); ColumnVector dec32cv2 = ColumnVector.fromDecimals(BIGDECIMAL32_2); ColumnVector dec64cv1 = ColumnVector.decimalFromLongs(-dec64Scale_1, DECIMAL64_1); - ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2)) { + ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2); + ColumnVector dec128cv1 = ColumnVector.decimalFromBigInt(-dec64Scale_1, DECIMAL128_1); + ColumnVector dec128cv2 = ColumnVector.decimalFromBigInt(-dec64Scale_2, DECIMAL128_2)) { try (ColumnVector sub = icv1.sub(icv2); ColumnVector expected = forEach(DType.INT32, icv1, icv2, (b, l, r, i) -> b.append(l.getInt(i) - r.getInt(i)))) { @@ -473,6 +491,14 @@ public void testSub() { } } + try (ColumnVector sub = dec128cv1.sub(dec128cv2)) { + try (ColumnVector expected = forEach( + DType.create(DType.DTypeEnum.DECIMAL128, -6), dec128cv1, dec128cv2, + (b, l, r, i) -> b.append(l.getBigDecimal(i).subtract(r.getBigDecimal(i))))) { + assertColumnsAreEqual(expected, sub, "dec128"); + } + } + try (Scalar s = Scalar.fromFloat(1.1f); ColumnVector sub = lcv1.sub(s); ColumnVector expected = forEachS(DType.FLOAT32, lcv1, 1.1f, @@ -507,7 +533,9 @@ public void testMul() { ColumnVector dec32cv1 = ColumnVector.fromDecimals(BIGDECIMAL32_1); ColumnVector dec32cv2 = ColumnVector.fromDecimals(BIGDECIMAL32_2); ColumnVector dec64cv1 = ColumnVector.decimalFromLongs(-dec64Scale_1, DECIMAL64_1); - ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2)) { + ColumnVector dec64cv2 = ColumnVector.decimalFromLongs(-dec64Scale_2, DECIMAL64_2); + ColumnVector dec128cv1 = ColumnVector.decimalFromBigInt(-dec64Scale_1, DECIMAL128_1); + ColumnVector dec128cv2 = ColumnVector.decimalFromBigInt(-dec64Scale_2, DECIMAL128_2)) { try (ColumnVector answer = icv.mul(dcv); ColumnVector expected = forEach(DType.FLOAT64, icv, dcv, (b, l, r, i) -> b.append(l.getInt(i) * r.getDouble(i)))) { @@ -560,6 +588,14 @@ public void testMul() { (b, l, r, i) -> b.append(Short.toUnsignedInt(l) * r.getInt(i)))) { assertColumnsAreEqual(expected, answer, "scalar uint16 * uint32"); } + + try (ColumnVector mul = dec128cv1.mul(dec128cv2)) { + try (ColumnVector expected = forEach( + DType.create(DType.DTypeEnum.DECIMAL128, dec128cv1.type.getScale() + dec128cv2.type.getScale()), dec128cv1, dec128cv2, + (b, l, r, i) -> b.append(l.getBigDecimal(i).multiply(r.getBigDecimal(i))))) { + assertColumnsAreEqual(expected, mul, "dec128"); + } + } } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0d007aa0ed7..b7c276d4956 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -1063,6 +1064,21 @@ void roundDecimal() { } } + @Test + void decimal128Cv() { + final int dec32Scale1 = -2; + BigInteger bigInteger1 = new 
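/*
 * Scale bookkeeping for the multiply case above: (a * 10^s1) * (b * 10^s2)
 * = (a * b) * 10^(s1 + s2), which is why the expected column is built with
 * scale dec128cv1.type.getScale() + dec128cv2.type.getScale(). A worked
 * instance: 1.5 (15 * 10^-1) times 0.25 (25 * 10^-2) equals 375 * 10^-3,
 * i.e. 0.375.
 */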
BigInteger("-831457"); + BigInteger bigInteger2 = new BigInteger("14"); + BigInteger bigInteger3 = new BigInteger("152345742357340573405745"); + final BigInteger[] bigInts = new BigInteger[] {bigInteger1, bigInteger2, bigInteger3}; + try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts)) { + HostColumnVector hostColumnVector = v.copyToHost(); + assertEquals(bigInteger1, hostColumnVector.getBigDecimal(0).unscaledValue()); + assertEquals(bigInteger2, hostColumnVector.getBigDecimal(1).unscaledValue()); + assertEquals(bigInteger3, hostColumnVector.getBigDecimal(2).unscaledValue()); + } + } + @Test void testGetDeviceMemorySizeNonStrings() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6); @@ -1260,6 +1276,9 @@ void testFromScalarZeroRows() { case DECIMAL64: s = Scalar.fromDecimal(mockScale, 1234567890123456789L); break; + case DECIMAL128: + s = Scalar.fromDecimal(mockScale, new BigInteger("1234567890123456789")); + break; case TIMESTAMP_DAYS: s = Scalar.timestampDaysFromInt(12345); break; @@ -3558,6 +3577,30 @@ void testCastLongToDecimal() { ); } + @Test + void testCastDecimal64ToDecimal128() { + testCastDecimal128(DType.DTypeEnum.DECIMAL64, DType.DTypeEnum.DECIMAL128, 0, + () -> ColumnVector.fromBoxedLongs(1L, -21L, 345L, null, 8008L, Long.MIN_VALUE, Long.MAX_VALUE), + () -> ColumnVector.fromDecimals(new BigDecimal(1), new BigDecimal(-21), new BigDecimal(345), + null, new BigDecimal(8008), new BigDecimal(Long.MIN_VALUE), new BigDecimal(Long.MAX_VALUE)), + new BigInteger[]{new BigInteger("1"), new BigInteger("-21"), + new BigInteger("345"), null, new BigInteger("8008"), + new BigInteger(String.valueOf(Long.MIN_VALUE)), + new BigInteger(String.valueOf(Long.MAX_VALUE))} + ); + testCastDecimal128(DType.DTypeEnum.DECIMAL32, DType.DTypeEnum.DECIMAL128, 0, + () -> ColumnVector.fromBoxedInts(1, 21, 345, null, 8008, Integer.MIN_VALUE, Integer.MAX_VALUE), + () -> ColumnVector.decimalFromBigInt(0, new BigInteger("1"), new BigInteger("21"), + new BigInteger("345"), null, new BigInteger("8008"), + new BigInteger(String.valueOf(Integer.MIN_VALUE)), + new BigInteger(String.valueOf(Integer.MAX_VALUE))), + new BigInteger[]{new BigInteger("1"), new BigInteger("21"), + new BigInteger("345"), null, new BigInteger("8008"), + new BigInteger(String.valueOf(Integer.MIN_VALUE)), + new BigInteger(String.valueOf(Integer.MAX_VALUE))} + ); + } + @Test void testCastFloatToDecimal() { testCastNumericToDecimalsAndBack(DType.FLOAT32, true, 0, @@ -3651,6 +3694,26 @@ private static void testCastNumericToDecimalsAndBack(DType sourceType, boolean i } } + private static void testCastDecimal128(DType.DTypeEnum sourceType, DType.DTypeEnum targetType, int scale, + Supplier sourceData, + Supplier returnData, + Object[] unscaledDecimal) { + DType decimalType = DType.create(targetType, scale); + try (ColumnVector sourceColumn = sourceData.get(); + ColumnVector expectedColumn = returnData.get(); + ColumnVector decimalColumn = sourceColumn.castTo(decimalType); + HostColumnVector hostDecimalColumn = decimalColumn.copyToHost(); + ColumnVector returnColumn = decimalColumn.castTo(DType.create(decimalType.typeId, scale))) { + for (int i = 0; i < sourceColumn.rows; i++) { + Object actual = hostDecimalColumn.isNull(i) ? null : + (decimalType.typeId == DType.DTypeEnum.DECIMAL128 ? hostDecimalColumn.getBigDecimal(i).unscaledValue() : + ((decimalType.typeId == DType.DTypeEnum.DECIMAL64) ? 
hostDecimalColumn.getLong(i) : hostDecimalColumn.getInt(i))); + assertEquals(unscaledDecimal[i], actual); + } + assertColumnsAreEqual(expectedColumn, returnColumn); + } + } + @Test void testIsTimestamp() { final String[] TIMESTAMP_STRINGS = { diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index 5f4d20dc8e3..c2772520f57 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -51,6 +51,10 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] overflowDecimal64 = new BigDecimal[]{ BigDecimal.valueOf(Long.MAX_VALUE), BigDecimal.valueOf(Long.MIN_VALUE)}; + private final BigDecimal[] overflowDecimal128 = new BigDecimal[]{ + new BigDecimal("340282367000000000000000000000000000001"), + new BigDecimal("-340282367000000000000000000000000000001")}; + @BeforeAll public static void setup() { for (int i = 0; i < decimal32Zoo.length; i++) { @@ -139,7 +143,8 @@ public void testOverrunningTheBuffer() { @Test public void testDecimalValidation() { // precision overflow - assertThrows(IllegalArgumentException.class, () -> HostColumnVector.fromDecimals(overflowDecimal64)); + assertThrows(IllegalArgumentException.class, () -> HostColumnVector.fromDecimals(overflowDecimal128)); + assertThrows(IllegalArgumentException.class, () -> { try (ColumnVector ignored = ColumnVector.decimalFromInts( -(DType.DECIMAL32_MAX_PRECISION + 1), unscaledDec32Zoo)) { @@ -153,13 +158,13 @@ public void testDecimalValidation() { // precision overflow due to rescaling by min scale assertThrows(IllegalArgumentException.class, () -> { try (ColumnVector ignored = ColumnVector.fromDecimals( - BigDecimal.valueOf(1.23e10), BigDecimal.valueOf(1.2e-7))) { + BigDecimal.valueOf(1.23e30), BigDecimal.valueOf(1.2e-7))) { } }); - // exactly hit the MAX_PRECISION_DECIMAL64 after rescaling + // exactly hit the MAX_PRECISION_DECIMAL128 after rescaling assertDoesNotThrow(() -> { try (ColumnVector ignored = ColumnVector.fromDecimals( - BigDecimal.valueOf(1.23e10), BigDecimal.valueOf(1.2e-6))) { + BigDecimal.valueOf(1.23e30), BigDecimal.valueOf(1.2e-6))) { } }); } @@ -170,6 +175,10 @@ public void testDecimalGeneral() { try (ColumnVector cv = ColumnVector.fromDecimals(overflowDecimal32)) { assertEquals(DType.create(DType.DTypeEnum.DECIMAL64, 0), cv.getType()); } + + try (ColumnVector cv = ColumnVector.fromDecimals(overflowDecimal64)) { + assertEquals(DType.create(DType.DTypeEnum.DECIMAL128, 0), cv.getType()); + } // Create DECIMAL64 vector with small values try (ColumnVector cv = ColumnVector.decimalFromLongs(0, 0L)) { try (HostColumnVector hcv = cv.copyToHost()) { diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 37fd2ecb714..0889363c2d0 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -25,6 +25,7 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.util.Arrays; @@ -186,6 +187,7 @@ public void testDecimal() { BigDecimal.valueOf(1234, 0), BigDecimal.valueOf(12345678, 2), BigDecimal.valueOf(1234567890123L, 6), + new BigDecimal(new BigInteger("12312341234123412341234123412341234120"), 4) }; for (BigDecimal dec : bigDecimals) { try (Scalar s = Scalar.fromDecimal(dec)) { @@ -194,17 +196,24 @@ public void 
testDecimal() { assertTrue(s.isValid()); if (dtype.getTypeId() == DType.DTypeEnum.DECIMAL64) { assertEquals(dec.unscaledValue().longValueExact(), s.getLong()); - } else { + } else if (dtype.getTypeId() == DType.DTypeEnum.DECIMAL32) { assertEquals(dec.unscaledValue().intValueExact(), s.getInt()); + } else if (dtype.getTypeId() == DType.DTypeEnum.DECIMAL128) { + assertEquals(dec.unscaledValue(), s.getBigDecimal().unscaledValue()); } assertEquals(dec, s.getBigDecimal()); } + try (Scalar s = Scalar.fromDecimal(-dec.scale(), dec.unscaledValue().intValueExact())) { assertEquals(dec, s.getBigDecimal()); } catch (java.lang.ArithmeticException ex) { try (Scalar s = Scalar.fromDecimal(-dec.scale(), dec.unscaledValue().longValueExact())) { assertEquals(dec, s.getBigDecimal()); assertTrue(s.getType().isBackedByLong()); + } catch (java.lang.ArithmeticException e) { + try (Scalar s = Scalar.fromDecimal(-dec.scale(), dec.unscaledValue())) { + assertEquals(dec, s.getBigDecimal()); + } } } } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 280a4d33ae9..4512a08430c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -30,6 +30,8 @@ import ai.rapids.cudf.ast.ColumnReference; import ai.rapids.cudf.ast.CompiledExpression; import ai.rapids.cudf.ast.TableReference; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileReader; @@ -45,11 +47,13 @@ import java.io.FileInputStream; import java.io.IOException; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.*; +import java.util.function.Function; import java.util.stream.Collectors; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; @@ -224,6 +228,10 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), "Column " + colName + " Row " + tableRow); break; + case DECIMAL128: + assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), + "Column " + colName + " Row " + tableRow); + break; case FLOAT32: assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, "Column " + colName + " Row " + tableRow); @@ -3659,6 +3667,97 @@ void testMergeApproxPercentile2() { } } + @Test + void testGroupByMinMaxDecimal() { + try (Table t1 = new Table.TestBuilder() + .column( "1", "1", "1", "1", "2") + .column(0, 1, 3 , 3, 4) + .decimal128Column(-4, RoundingMode.HALF_UP, + new BigInteger("123456789123456789"), + new BigInteger("7979879879879798"), + new BigInteger("17979879879879798"), + new BigInteger("2234563472398472398"), + null) + .build()) { + try (Table result = t1 + .groupBy(GroupByOptions.builder() + .withKeysSorted(true) + .withKeysDescending(false, false) + .build(), 0, 1) + .scan(GroupByScanAggregation.min().onColumn(2), + GroupByScanAggregation.max().onColumn(2)); + Table expected = new Table.TestBuilder() + .column( "1", "1", "1", "1", "2") + .column(0, 1, 3, 3, 4) + .decimal128Column(-4, RoundingMode.HALF_UP, + new BigInteger("123456789123456789"), + new BigInteger("7979879879879798"), + new BigInteger("17979879879879798"), + new BigInteger("17979879879879798"), + null) + 
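/*
 * How the expected scan columns below are derived: with sorted keys, min() and
 * max() are running aggregates within each (String, int) group. Rows 3 and 4
 * share the key ("1", 3), so the running min over
 * {17979879879879798, 2234563472398472398} stays 17979879879879798 for both
 * rows, while the running max picks up 2234563472398472398 on the second of
 * the two.
 */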
.decimal128Column(-4, RoundingMode.HALF_UP, + new BigInteger("123456789123456789"), + new BigInteger("7979879879879798"), + new BigInteger("17979879879879798"), + new BigInteger("2234563472398472398"), + null) + .build()) { + assertTablesAreEqual(expected, result); + } + } + } + + @Test + void testGroupByMinMaxDecimalAgg() { + try (Table t1 = new Table.TestBuilder() + .column(-341142443, 48424546) + .decimal128Column(-2, RoundingMode.HALF_DOWN, + new BigInteger("2978603952268112009"), + new BigInteger("571526248386900094")) + .build()) { + try (Table result = t1 + .groupBy(GroupByOptions.builder() + .build(), 0) + .aggregate(GroupByAggregation.max().onColumn(1)); + Table expected = new Table.TestBuilder() + .column(-341142443, 48424546) + .decimal128Column(-2, RoundingMode.HALF_DOWN, + new BigInteger("2978603952268112009"), + new BigInteger("571526248386900094")) + .build()) { + assertTablesAreEqual(expected, result); + } + } + } + + @Test + void testGroupByCountDecimal() { + try (Table t1 = new Table.TestBuilder() + .column( "1", "1", "1", "1", "2") + .column(0, 1, 3 , 3, 4) + .decimal128Column(-4, RoundingMode.HALF_UP, + new BigInteger("123456789123456789"), + new BigInteger("7979879879879798"), + new BigInteger("17979879879879798"), + new BigInteger("2234563472398472398"), + null) + .build()) { + try (Table result = t1 + .groupBy(GroupByOptions.builder() + .withKeysSorted(true) + .withKeysDescending(false, false) + .build(), 0, 1) + .aggregate(GroupByAggregation.count().onColumn(2)); + Table expected = new Table.TestBuilder() + .column( "1", "1", "1", "2") + .column(0, 1, 3, 4) + .column(1, 1, 2, 0) + .build()) { + assertTablesAreEqual(expected, result); + } + } + } + @Test void testGroupByUniqueCount() { try (Table t1 = new Table.TestBuilder() @@ -3680,6 +3779,33 @@ void testGroupByUniqueCount() { } } + @Test + void testOrderByDecimal() { + try (Table t1 = new Table.TestBuilder() + .column( "1", "1", "1", "1") + .column(0, 1, 3 , 3) + .decimal64Column(4, + 123456L, + 124567L, + 125678L, + 126789L) + .build()) { + try (Table sorted = t1.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table expected = new Table.TestBuilder() + .column( "1", "1", "1", "1") + .column( 0, 1, 3, 3) + .decimal64Column(4, + 123456L, + 124567L, + 125678L, + 126789L) + .build()) { + assertTablesAreEqual(expected, sorted); + + } + } + } + @Test void testGroupByUniqueCountNulls() { try (Table t1 = new Table.TestBuilder() @@ -6707,43 +6833,202 @@ void testTableBasedFilter() { } } - private Table getExpectedFileTable() { - return getExpectedFileTable(false, false); + private enum Columns { + BOOL("BOOL"), + INT("INT"), + BYTE("BYTE"), + LONG("LONG"), + STRING("STRING"), + FLOAT("FLOAT"), + DOUBLE("DOUBLE"), + DECIMAL64("DECIMAL64"), + DECIMAL128("DECIMAL128"), + STRUCT("STRUCT"), + STRUCT_DEC128("STRUCT_DEC128"), + LIST("LIST"), + LIST_STRUCT("LIST_STRUCT"), + LIST_DEC128("LIST_DEC128"); + + final String name; + + Columns(String columnName) { + this.name = columnName; + } + } + + private static class WriteUtils { + + private static final Map> addColumnFn = Maps.newHashMap(); + + static { + addColumnFn.put(Columns.BOOL, (t) -> t.column(true, false, false, true, false)); + addColumnFn.put(Columns.INT, (t) -> t.column(5, 1, 0, 2, 7)); + addColumnFn.put(Columns.LONG, (t) -> t.column(3l, 9l, 4l, 2l, 20l)); + addColumnFn.put(Columns.BYTE, (t) -> t.column(new Byte[]{2, 3, 4, 5, 9})); + addColumnFn.put(Columns.STRING, (t) -> t.column("this", "is", "a", "test", "string")); + addColumnFn.put(Columns.FLOAT, 
(t) -> t.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f)); + addColumnFn.put(Columns.DOUBLE, (t) -> t.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d)); + addColumnFn.put(Columns.DECIMAL64, (t) -> + t.decimal64Column(-5, 1L, 323L, 12398423L, -231312412L, 239893414231L)); + addColumnFn.put(Columns.DECIMAL128, (t) -> + t.decimal128Column(-10, RoundingMode.UNNECESSARY, BigInteger.ONE, BigInteger.ZERO, + BigInteger.TEN, new BigInteger("100000000000000000000000000000"), + new BigInteger("-1234567890123456789012345678"))); + + BasicType dec64Type = new BasicType(true, DType.create(DType.DTypeEnum.DECIMAL64, 0)); + StructType structType = new StructType(true, + new BasicType(true, DType.INT32), new BasicType(true, DType.STRING), dec64Type); + addColumnFn.put(Columns.STRUCT, (t) -> t.column(structType, + struct(1, "k1", BigDecimal.ONE), + struct(2, "k2", BigDecimal.ZERO), + struct(3, "k3", BigDecimal.TEN), + struct(4, "k4", BigDecimal.valueOf(Long.MAX_VALUE)), + new HostColumnVector.StructData((List) null))); + BasicType dec128Type = new BasicType(true, DType.create(DType.DTypeEnum.DECIMAL128, -5)); + addColumnFn.put(Columns.STRUCT_DEC128, (t) -> + t.column(new StructType(false, dec128Type), + struct(BigDecimal.valueOf(Integer.MAX_VALUE, 5)), + struct(BigDecimal.valueOf(Long.MAX_VALUE, 5)), + struct(new BigDecimal("111111111122222222223333333333").setScale(5)), + struct(new BigDecimal("123456789123456789123456789").setScale(5)), + struct((BigDecimal) null))); + + addColumnFn.put(Columns.LIST, (t) -> + t.column(new ListType(false, new BasicType(false, DType.INT32)), + Arrays.asList(1, 2), + Arrays.asList(3, 4), + Arrays.asList(5), + Arrays.asList(6, 7), + Arrays.asList(8, 9, 10))); + addColumnFn.put(Columns.LIST_STRUCT, (t) -> + t.column(new ListType(true, structType), + Arrays.asList(struct(1, "k1", BigDecimal.ONE), struct(2, "k2", BigDecimal.ONE), + struct(3, "k3", BigDecimal.ONE)), + Arrays.asList(struct(4, "k4", BigDecimal.ONE), struct(5, "k5", BigDecimal.ONE)), + Arrays.asList(struct(6, "k6", BigDecimal.ONE)), + Arrays.asList(new HostColumnVector.StructData((List) null)), + (List) null)); + addColumnFn.put(Columns.LIST_DEC128, (t) -> + t.column(new ListType(true, new StructType(false, dec128Type)), + Arrays.asList(struct(BigDecimal.valueOf(Integer.MAX_VALUE, 5)), + struct(BigDecimal.valueOf(Integer.MIN_VALUE, 5))), + Arrays.asList(struct(BigDecimal.valueOf(Long.MAX_VALUE, 5)), + struct(BigDecimal.valueOf(0, 5)), struct(BigDecimal.valueOf(-1, 5))), + Arrays.asList(struct(new BigDecimal("111111111122222222223333333333").setScale(5))), + Arrays.asList(struct(new BigDecimal("123456789123456789123456789").setScale(5))), + Arrays.asList(struct((BigDecimal) null)))); + } + + static TestBuilder addColumn(TestBuilder tb, String colName) { + if (!addColumnFn.containsKey(Columns.valueOf(colName))) { + throw new IllegalArgumentException("Unknown column name: " + colName); + } + return addColumnFn.get(Columns.valueOf(colName)).apply(tb); + } + + static String[] getAllColumns(boolean withDecimal128) { + List columns = Lists.newArrayList( + Columns.BOOL.name, Columns.INT.name, Columns.BYTE.name, Columns.LONG.name, + Columns.STRING.name, Columns.FLOAT.name, Columns.DOUBLE.name, Columns.DECIMAL64.name, + Columns.STRUCT.name, Columns.LIST.name, Columns.LIST_STRUCT.name); + if (withDecimal128) { + columns.add(Columns.DECIMAL128.name); + columns.add(Columns.STRUCT_DEC128.name); + columns.add(Columns.LIST_DEC128.name); + } + String[] ret = new String[columns.size()]; + columns.toArray(ret); + return ret; + } + + static String[] 
getNonNestedColumns(boolean withDecimal128) { + List columns = Lists.newArrayList( + Columns.BOOL.name, Columns.INT.name, Columns.BYTE.name, Columns.LONG.name, + Columns.STRING.name, Columns.FLOAT.name, Columns.DOUBLE.name, Columns.DECIMAL64.name); + if (withDecimal128) { + columns.add(Columns.DECIMAL128.name); + } + String[] ret = new String[columns.size()]; + columns.toArray(ret); + return ret; + } + + static void buildWriterOptions(ColumnWriterOptions.NestedBuilder builder, List columns) { + for (String colName : columns) { + buildWriterOptions(builder, colName); + } + } + + static void buildWriterOptions(ColumnWriterOptions.NestedBuilder builder, String... columns) { + for (String colName : columns) { + buildWriterOptions(builder, colName); + } + } + + static void buildWriterOptions(ColumnWriterOptions.NestedBuilder builder, String colName) { + switch (Columns.valueOf(colName)) { + case BOOL: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case BYTE: + case STRING: + builder.withColumns(false, colName); + break; + case DECIMAL64: + builder.withDecimalColumn(colName, DType.DECIMAL64_MAX_PRECISION); + break; + case DECIMAL128: + builder.withDecimalColumn(colName, DType.DECIMAL128_MAX_PRECISION); + break; + case STRUCT: + builder.withStructColumn(structBuilder(colName) + .withNullableColumns("ch_int") + .withNullableColumns("ch_str") + .withDecimalColumn("ch_dec64", DType.DECIMAL64_MAX_PRECISION, true) + .build()); + break; + case LIST: + builder.withListColumn(listBuilder(colName, false) + .withNonNullableColumns("ch_int") + .build()); + break; + case LIST_STRUCT: + builder.withListColumn(listBuilder(colName) + .withStructColumn(structBuilder(colName) + .withNullableColumns("ch_int") + .withNullableColumns("ch_str") + .withDecimalColumn("ch_dec64", DType.DECIMAL64_MAX_PRECISION, true) + .build()) + .build()); + break; + case STRUCT_DEC128: + builder.withStructColumn(structBuilder(colName, false) + .withDecimalColumn("ch_dec128", DType.DECIMAL128_MAX_PRECISION, true) + .build()); + break; + case LIST_DEC128: + builder.withListColumn(listBuilder(colName) + .withStructColumn(structBuilder(colName, false) + .withDecimalColumn("ch_dec128", DType.DECIMAL128_MAX_PRECISION, true) + .build()) + .build()); + break; + default: + throw new IllegalArgumentException("should NOT reach here"); + } + } } - private Table getExpectedFileTable(boolean withNestedColumns) { - return getExpectedFileTable(true, true); + private Table getExpectedFileTable(String... 
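/*
 * Usage sketch for the WriteUtils registry above: each Columns tag maps to a
 * function that appends one typed column to a TestBuilder, so a test can ask
 * for exactly the columns it needs and build matching writer metadata from
 * one place, e.g.
 *
 *   Table t = getExpectedFileTable(Columns.INT.name, Columns.DECIMAL128.name);
 *   ORCWriterOptions.Builder b = ORCWriterOptions.builder();
 *   WriteUtils.buildWriterOptions(b, Columns.INT.name, Columns.DECIMAL128.name);
 */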
selectColumns) { + return getExpectedFileTable(Lists.newArrayList(selectColumns)); } - private Table getExpectedFileTable(boolean withStructColumns, boolean withListColumn) { - TestBuilder tb = new TestBuilder() - .column(true, false, false, true, false) - .column(5, 1, 0, 2, 7) - .column(new Byte[]{2, 3, 4, 5, 9}) - .column(3l, 9l, 4l, 2l, 20l) - .column("this", "is", "a", "test", "string") - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d); - StructType nestedType = new StructType(true, - new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); - if (withStructColumns) { - tb.column(nestedType, - struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), - struct(4, "k4"), new HostColumnVector.StructData((List) null)); - } - if (withListColumn) { - tb.column(new ListType(false, new BasicType(false, DType.INT32)), - Arrays.asList(1, 2), - Arrays.asList(3, 4), - Arrays.asList(5), - Arrays.asList(6, 7), - Arrays.asList(8, 9, 10)) - .column(new ListType(false, nestedType), - Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), - Arrays.asList(struct(4, "k4"), struct(5, "k5")), - Arrays.asList(struct(6, "k6")), - Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()); + private Table getExpectedFileTable(List selectColumns) { + TestBuilder tb = new TestBuilder(); + for (String c : selectColumns) { + WriteUtils.addColumn(tb, c); } return tb.build(); } @@ -6865,21 +7150,10 @@ void testParquetWriteMap() throws IOException { @Test void testParquetWriteToBufferChunkedWithNested() { - ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withStructColumn(structBuilder("_c7") - .withNullableColumns("_c7-1") - .withNullableColumns("_c7-2") - .build()) - .withListColumn(listBuilder("_c8") - .withNullableColumns("c8-1").build()) - .withListColumn(listBuilder("c9") - .withStructColumn(structBuilder("c9-1") - .withNullableColumns("c9-1-1") - .withNullableColumns("c9-1-2").build()) - .build()) - .build(); - try (Table table0 = getExpectedFileTable(true); + ParquetWriterOptions.Builder optBuilder = ParquetWriterOptions.builder(); + WriteUtils.buildWriterOptions(optBuilder, WriteUtils.getAllColumns(false)); + ParquetWriterOptions options = optBuilder.build(); + try (Table table0 = getExpectedFileTable(WriteUtils.getAllColumns(false)); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { writer.write(table0); @@ -6896,20 +7170,18 @@ void testParquetWriteToBufferChunkedWithNested() { @Test void testParquetWriteToBufferChunked() { - ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withStructColumn(structBuilder("_c7") - .withNullableColumns("_c7-1") - .withNullableColumns("_c7-2") - .build()) - .build(); - try (Table table0 = getExpectedFileTable(true, false); + ParquetWriterOptions.Builder optBuilder = ParquetWriterOptions.builder(); + List columns = Lists.newArrayList(WriteUtils.getNonNestedColumns(false)); + columns.add(Columns.STRUCT.name); + WriteUtils.buildWriterOptions(optBuilder, columns); + ParquetWriterOptions options = optBuilder.build(); + try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { - try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { - writer.write(table0); - writer.write(table0); - 
writer.write(table0); - } + try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { + writer.write(table0); + writer.write(table0); + writer.write(table0); + } try (Table table1 = Table.readParquet(ParquetOptions.DEFAULT, consumer.buffer, 0, consumer.offset); Table concat = Table.concatenate(table0, table0, table0)) { assertTablesAreEqual(concat, table1); @@ -6987,9 +7259,10 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { @Test void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException { File tempFile = File.createTempFile("test-names-metadata", ".arrow"); - try (Table table0 = getExpectedFileTable()) { + String[] columnNames = WriteUtils.getNonNestedColumns(false); + try (Table table0 = getExpectedFileTable(columnNames)) { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() - .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") + .withColumnNames(columnNames) .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, tempFile.getAbsoluteFile())) { writer.write(table0); @@ -7016,13 +7289,18 @@ void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException { @Test void testArrowIPCWriteToBufferChunked() { - try (Table table0 = getExpectedFileTable(true); + String[] nonNestedCols = WriteUtils.getNonNestedColumns(false); + List columns = Lists.newArrayList(nonNestedCols); + columns.add(Columns.STRUCT.name); + columns.add(Columns.LIST.name); + columns.add(Columns.LIST_STRUCT.name); + try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() - .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") - .withColumnNames("eighth", "eighth_id", "eighth_name") - .withColumnNames("ninth") - .withColumnNames("tenth", "child_id", "child_name") + .withColumnNames(nonNestedCols) + .withColumnNames(Columns.STRUCT.name, "int", "str", "dec64") + .withColumnNames(Columns.LIST.name) + .withColumnNames(Columns.LIST_STRUCT.name, "int", "str", "dec64") .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); @@ -7049,9 +7327,12 @@ void testArrowIPCWriteToBufferChunked() { @Test void testORCWriteToBufferChunked() { - try (Table table0 = getExpectedFileTable(true); + String[] selectedColumns = WriteUtils.getAllColumns(false); + try (Table table0 = getExpectedFileTable(selectedColumns); MyBufferConsumer consumer = new MyBufferConsumer()) { - ORCWriterOptions opts = createORCWriterOptionsWithNested(); + ORCWriterOptions.Builder builder = ORCWriterOptions.builder(); + WriteUtils.buildWriterOptions(builder, selectedColumns); + ORCWriterOptions opts = builder.build(); try (TableWriter writer = Table.writeORCChunked(opts, consumer)) { writer.write(table0); writer.write(table0); @@ -7067,8 +7348,11 @@ void testORCWriteToBufferChunked() { @Test void testORCWriteToFileChunked() throws IOException { File tempFile = File.createTempFile("test", ".orc"); - try (Table table0 = getExpectedFileTable(true)) { - ORCWriterOptions opts = createORCWriterOptionsWithNested(); + String[] selectedColumns = WriteUtils.getAllColumns(false); + try (Table table0 = getExpectedFileTable(selectedColumns)) { + ORCWriterOptions.Builder builder = ORCWriterOptions.builder(); + WriteUtils.buildWriterOptions(builder, selectedColumns); + ORCWriterOptions opts = builder.build(); try (TableWriter writer = Table.writeORCChunked(opts, 
tempFile.getAbsoluteFile())) { writer.write(table0); } @@ -7111,7 +7395,7 @@ void testORCWriteMapChunked() throws IOException { @Test void testORCWriteToFile() throws IOException { File tempFile = File.createTempFile("test", ".orc"); - try (Table table0 = getExpectedFileTable()) { + try (Table table0 = getExpectedFileTable(WriteUtils.getNonNestedColumns(false))) { table0.writeORC(tempFile.getAbsoluteFile()); try (Table table1 = Table.readORC(tempFile.getAbsoluteFile())) { assertTablesAreEqual(table0, table1); @@ -7124,12 +7408,11 @@ void testORCWriteToFile() throws IOException { @Test void testORCWriteToFileWithColNames() throws IOException { File tempFile = File.createTempFile("test", ".orc"); - final String[] colNames = new String[]{"bool", "int", "byte","long","str","float","double"}; - try (Table table0 = getExpectedFileTable()) { - ORCWriterOptions options = ORCWriterOptions.builder() - .withColumns(true, colNames) - .withMetadata("somekey", "somevalue") - .build(); + String[] colNames = WriteUtils.getNonNestedColumns(false); + try (Table table0 = getExpectedFileTable(colNames)) { + ORCWriterOptions.Builder optBuilder = ORCWriterOptions.builder(); + WriteUtils.buildWriterOptions(optBuilder, colNames); + ORCWriterOptions options = optBuilder.build(); table0.writeORC(options, tempFile.getAbsoluteFile()); ORCOptions opts = ORCOptions.builder().includeColumn(colNames).build(); try (Table table1 = Table.readORC(opts, tempFile.getAbsoluteFile())) { @@ -7140,10 +7423,34 @@ void testORCWriteToFileWithColNames() throws IOException { } } + @Test + void testORCReadAndWriteForDecimal128() throws IOException { + File tempFile = File.createTempFile("test", ".orc"); + String[] colNames = new String[]{Columns.DECIMAL64.name, + Columns.DECIMAL128.name, Columns.STRUCT_DEC128.name, Columns.LIST_DEC128.name}; + try (Table table0 = getExpectedFileTable(colNames)) { + ORCWriterOptions.Builder optBuilder = ORCWriterOptions.builder(); + WriteUtils.buildWriterOptions(optBuilder, colNames); + ORCWriterOptions options = optBuilder.build(); + table0.writeORC(options, tempFile.getAbsoluteFile()); + ORCOptions opts = ORCOptions.builder() + .includeColumn(colNames) + .decimal128Column(Columns.DECIMAL128.name, + String.format("%s.%s", Columns.STRUCT_DEC128.name, "ch_dec128"), + String.format("%s.1.%s", Columns.LIST_DEC128.name, "ch_dec128")) + .build(); + try (Table table1 = Table.readORC(opts, tempFile.getAbsoluteFile())) { + assertTablesAreEqual(table0, table1); + } + } finally { + tempFile.delete(); + } + } + @Test void testORCWriteToFileUncompressed() throws IOException { File tempFileUncompressed = File.createTempFile("test-uncompressed", ".orc"); - try (Table table0 = getExpectedFileTable()) { + try (Table table0 = getExpectedFileTable(WriteUtils.getNonNestedColumns(false))) { String[] colNames = new String[table0.getNumberOfColumns()]; Arrays.fill(colNames, ""); ORCWriterOptions opts = ORCWriterOptions.builder() @@ -7249,27 +7556,7 @@ void fixedWidthRowsRoundTrip() { // utility methods to reduce typing - private ORCWriterOptions createORCWriterOptionsWithNested() { - // The column metadata should match the table returned from - // 'getExpectedFileTable(true)'. 
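/*
 * testORCReadAndWriteForDecimal128 above also shows the column naming rules of
 * ORCOptions.decimal128Column in practice: the struct child is addressed as
 * "STRUCT_DEC128.ch_dec128" ({struct_col_name}.{child_col_name}) and the
 * list's data column as "LIST_DEC128.1.ch_dec128" ({list_col_name}.1 plus the
 * struct child suffix).
 */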
- return ORCWriterOptions.builder() - .withNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withStructColumn(structBuilder("_c7") - .withNullableColumns("_c7-1") - .withNullableColumns("_c7-2") - .build()) - .withListColumn(listBuilder("_c8") - .withNullableColumns("_c8-1").build()) - .withListColumn(listBuilder("_c9") - .withStructColumn(structBuilder("_c9-1") - .withNullableColumns("_c9-1-1") - .withNullableColumns("_c9-1-2") - .build()) - .build()) - .build(); - } - - private StructData struct(Object... values) { + private static StructData struct(Object... values) { return new StructData(values); } From 91141042ac5ce5024975eb2eab63f916047e6b6a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 17 Nov 2021 10:31:10 -0800 Subject: [PATCH 05/72] Add parameters to control row group size in Parquet writer (#9677) Closes https://github.com/rapidsai/cudf/issues/9615 Adds the following API to the Parquet writer: - Set maximum row group size, in bytes (minimum of 512KB); - Set maximum row group size, in rows (minimum of 5000). The API is more limited than its ORC equivalent because of limitation in Parquet page size control/estimation. Other changes: - Fix naming in some ORC APIs to be consistent. - Change `rowgroup` to `row_group` in APIs, since Parquet specs refer to this as "row group", not "rowgroup". - Replace some `uint32_t` use in Parquet writer. - Remove unused `target_page_size`. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9677 --- cpp/include/cudf/io/detail/parquet.hpp | 2 +- cpp/include/cudf/io/orc.hpp | 20 ++-- cpp/include/cudf/io/parquet.hpp | 125 ++++++++++++++++++++++- cpp/src/io/functions.cpp | 6 +- cpp/src/io/orc/writer_impl.cu | 12 +-- cpp/src/io/parquet/writer_impl.cu | 79 +++++++------- cpp/src/io/parquet/writer_impl.hpp | 12 +-- cpp/tests/io/parquet_test.cpp | 22 ++++ python/cudf/cudf/_lib/cpp/io/orc.pxd | 8 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 22 +++- python/cudf/cudf/_lib/parquet.pyx | 34 +++--- python/cudf/cudf/io/parquet.py | 6 ++ python/cudf/cudf/tests/test_parquet.py | 23 +++++ python/cudf/cudf/utils/ioutils.py | 10 +- 14 files changed, 291 insertions(+), 90 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 14f27ef8eef..98922ad10a4 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -148,7 +148,7 @@ class writer { * @param[in] metadata_list List of input file metadata * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ - static std::unique_ptr> merge_rowgroup_metadata( + static std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); }; diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index fb1199fc166..3bc2e6c9ef2 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -475,24 +475,24 @@ class orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. 
*/ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } @@ -769,24 +769,24 @@ class chunked_orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 660ec051304..88cf7416506 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,6 +37,9 @@ namespace io { * @file */ +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; + /** * @brief Builds parquet_reader_options to use for `read_parquet()`. */ @@ -398,6 +401,10 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink and table. @@ -472,6 +479,16 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -510,6 +527,28 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. 
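 *
 * An illustrative use of the two new size controls (the values are arbitrary
 * but must respect the 512KB and 5000-row minimums enforced by these setters;
 * a sink_info `sink` and table_view `table` are assumed to be in scope):
 * @code
 * auto opts = cudf::io::parquet_writer_options::builder(sink, table)
 *               .row_group_size_bytes(32 * 1024 * 1024)
 *               .row_group_size_rows(100000)
 *               .build();
 * @endcode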
+ */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } }; class parquet_writer_options_builder { @@ -582,6 +621,30 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -637,7 +700,7 @@ std::unique_ptr> write_parquet( * @param[in] metadata_list List of input file metadata. * @return A parquet-compatible blob that contains the data for all row groups in the list. */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); /** @@ -660,6 +723,10 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink. @@ -703,6 +770,16 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -732,6 +809,28 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. + */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -811,6 +910,30 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. 
+ */ + chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b678941db21..a8ca1d3a459 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -405,13 +405,13 @@ table_with_metadata read_parquet(parquet_reader_options const& options, } /** - * @copydoc cudf::io::merge_rowgroup_metadata + * @copydoc cudf::io::merge_row_group_metadata */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list) { CUDF_FUNC_RANGE(); - return detail_parquet::writer::merge_rowgroup_metadata(metadata_list); + return detail_parquet::writer::merge_row_group_metadata(metadata_list); } table_input_metadata::table_input_metadata(table_view const& table, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2bf020d08a2..1563e3e1fd7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1301,10 +1301,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { @@ -1321,10 +1321,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2c7d745bb4c..62803432157 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1026,6 +1026,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1045,6 +1047,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + 
max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1148,8 +1152,7 @@ void writer::impl::write(table_view const& table) // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - uint32_t num_fragments = - (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const num_fragments = (num_rows + max_page_fragment_size - 1) / max_page_fragment_size; cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -1162,21 +1165,20 @@ void writer::impl::write(table_view const& table) init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); } - size_t global_rowgroup_base = md.row_groups.size(); + auto const global_rowgroup_base = static_cast(md.row_groups.size()); // Decide row group boundaries based on uncompressed data size - size_t rowgroup_size = 0; - uint32_t num_rowgroups = 0; - for (uint32_t f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; - f++) { - size_t fragment_data_size = 0; + auto rowgroup_size = 0ul; + auto num_rowgroups = 0; + for (auto f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; f++) { + auto fragment_data_size = 0ul; // Replace with STL algorithm to transform and sum for (auto i = 0; i < num_columns; i++) { fragment_data_size += fragments[i][f].fragment_data_size; } if (f > rowgroup_start && - (rowgroup_size + fragment_data_size > max_rowgroup_size_ || - (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) { + (rowgroup_size + fragment_data_size > max_row_group_size || + (f + 1 - rowgroup_start) * max_page_fragment_size > max_row_group_rows)) { // update schema md.row_groups.resize(md.row_groups.size() + 1); md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; @@ -1204,15 +1206,15 @@ void writer::impl::write(table_view const& table) } } // Initialize row groups and column chunks - uint32_t num_chunks = num_rowgroups * num_columns; + auto const num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; + for (auto r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { - uint32_t fragments_in_chunk = (uint32_t)( - (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const fragments_in_chunk = + (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size; md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); - for (int i = 0; i < num_columns; i++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; *ck = {}; @@ -1244,8 +1246,8 @@ void writer::impl::write(table_view const& table) } auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); - for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { - for (int col = 0; col < num_columns; col++) { + for (auto rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { + for (auto col = 0; col < num_columns; col++) { if (chunks.host_view()[rg][col].use_dictionary) { 
md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( Encoding::PLAIN_DICTIONARY); @@ -1274,16 +1276,16 @@ void writer::impl::write(table_view const& table) } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) - std::vector batch_list; - uint32_t num_pages = 0; - size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TBD: Tune this - size_t max_uncomp_bfr_size = 0; - size_t max_comp_bfr_size = 0; - size_t max_chunk_bfr_size = 0; - uint32_t max_pages_in_batch = 0; - size_t bytes_in_batch = 0; - size_t comp_bytes_in_batch = 0; - for (uint32_t r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { + std::vector batch_list; + size_type num_pages = 0; + size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TODO: Tune this + size_t max_uncomp_bfr_size = 0; + size_t max_comp_bfr_size = 0; + size_t max_chunk_bfr_size = 0; + size_type max_pages_in_batch = 0; + size_t bytes_in_batch = 0; + size_t comp_bytes_in_batch = 0; + for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { size_t rowgroup_size = 0; size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { @@ -1331,11 +1333,11 @@ void writer::impl::write(table_view const& table) // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); - for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { - uint8_t* bfr = static_cast(uncomp_bfr.data()); - uint8_t* bfr_c = static_cast(comp_bfr.data()); - for (uint32_t j = 0; j < batch_list[b]; j++, r++) { - for (int i = 0; i < num_columns; i++) { + for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { + auto bfr = static_cast(uncomp_bfr.data()); + auto bfr_c = static_cast(comp_bfr.data()); + for (auto j = 0; j < batch_list[b]; j++, r++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; ck->uncompressed_bfr = bfr; ck->compressed_bfr = bfr_c; @@ -1360,14 +1362,15 @@ void writer::impl::write(table_view const& table) pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches - for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); + for (auto b = 0, r = 0, global_r = global_rowgroup_base; + b < static_cast(batch_list.size()); b++) { // Count pages in this batch - uint32_t rnext = r + batch_list[b]; - uint32_t first_page_in_batch = chunks[r][0].first_page; - uint32_t first_page_in_next_batch = + auto const rnext = r + batch_list[b]; + auto const first_page_in_batch = chunks[r][0].first_page; + auto const first_page_in_next_batch = (rnext < num_rowgroups) ? 
chunks[rnext][0].first_page : num_pages; - uint32_t pages_in_batch = first_page_in_next_batch - first_page_in_batch; + auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch; // device_span batch_pages{pages.data() + first_page_in_batch, } encode_pages( chunks, @@ -1514,7 +1517,7 @@ std::unique_ptr> writer::close(std::string const& column_ch return _impl->close(column_chunks_file_path); } -std::unique_ptr> writer::merge_rowgroup_metadata( +std::unique_ptr> writer::merge_row_group_metadata( const std::vector>>& metadata_list) { std::vector output; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index c7cdf8effd1..9188218f607 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -56,13 +56,6 @@ using cudf::detail::hostdevice_2dvector; * @brief Implementation for parquet writer */ class writer::impl { - // Parquet datasets are divided into fixed-size, independent rowgroups - static constexpr uint32_t DEFAULT_ROWGROUP_MAXSIZE = 128 * 1024 * 1024; // 128MB - static constexpr uint32_t DEFAULT_ROWGROUP_MAXROWS = 1000000; // Or at most 1M rows - - // rowgroups are divided into pages - static constexpr uint32_t DEFAULT_TARGET_PAGE_SIZE = 512 * 1024; - public: /** * @brief Constructor with writer options. @@ -209,9 +202,8 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream = rmm::cuda_stream_default; - size_t max_rowgroup_size_ = DEFAULT_ROWGROUP_MAXSIZE; - size_t max_rowgroup_rows_ = DEFAULT_ROWGROUP_MAXROWS; - size_t target_page_size_ = DEFAULT_TARGET_PAGE_SIZE; + size_t max_row_group_size = default_row_group_size_bytes; + size_type max_row_group_rows = default_row_group_size_rows; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3bae8d7ab1e..b233819092a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3056,4 +3056,26 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +{ + const auto unused_table = std::make_unique(); + std::vector out_buffer; + + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(511 << 10), + cudf::logic_error); + + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_bytes(511 << 10), + cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index f0450483345..4b5ec913fb6 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -72,10 +72,10 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options() cudf_io_types.sink_info get_sink() except+ cudf_io_types.compression_type get_compression() except+ - bool enable_statistics() except+ - size_t stripe_size_bytes() except+ - size_type stripe_size_rows() except+ - size_type row_index_stride() except+ + bool 
is_enabled_statistics() except+
+ size_t get_stripe_size_bytes() except+
+ size_type get_stripe_size_rows() except+
+ size_type get_row_index_stride() except+
 cudf_table_view.table_view get_table() except+
 const cudf_io_types.table_input_metadata *get_metadata() except+
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index 81ca7e5836b..9d95dce83bc 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -74,6 +74,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 cudf_table_view.table_view get_table() except +
 const cudf_io_types.table_input_metadata get_metadata() except +
 string get_column_chunks_file_path() except+
+ size_t get_row_group_size_bytes() except+
+ size_type get_row_group_size_rows() except+

 void set_metadata(
 cudf_io_types.table_input_metadata *m
@@ -87,6 +89,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 void set_column_chunks_file_path(
 string column_chunks_file_path
 ) except +
+ void set_row_group_size_bytes(size_t val) except+
+ void set_row_group_size_rows(size_type val) except+

 @staticmethod
 parquet_writer_options_builder builder(
@@ -116,6 +120,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 parquet_writer_options_builder& int96_timestamps(
 bool enabled
 ) except +
+ parquet_writer_options_builder& row_group_size_bytes(
+ size_t val
+ ) except+
+ parquet_writer_options_builder& row_group_size_rows(
+ size_type val
+ ) except+

 parquet_writer_options build() except +

@@ -130,6 +140,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 cudf_io_types.statistics_freq get_stats_level() except +
 cudf_io_types.table_input_metadata* get_metadata(
 ) except+
+ size_t get_row_group_size_bytes() except+
+ size_type get_row_group_size_rows() except+

 void set_metadata(
 cudf_io_types.table_input_metadata *m
@@ -140,6 +152,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 void set_compression(
 cudf_io_types.compression_type compression
 ) except +
+ void set_row_group_size_bytes(size_t val) except+
+ void set_row_group_size_rows(size_type val) except+

 @staticmethod
 chunked_parquet_writer_options_builder builder(
@@ -160,6 +174,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 chunked_parquet_writer_options_builder& compression(
 cudf_io_types.compression_type compression
 ) except +
+ chunked_parquet_writer_options_builder& row_group_size_bytes(
+ size_t val
+ ) except+
+ chunked_parquet_writer_options_builder& row_group_size_rows(
+ size_type val
+ ) except+

 chunked_parquet_writer_options build() except +

@@ -173,6 +193,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 string column_chunks_file_path,
 ) except+

- cdef unique_ptr[vector[uint8_t]] merge_rowgroup_metadata(
+ cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata(
 const vector[unique_ptr[vector[uint8_t]]]& metadata_list
 ) except +
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 71705f4d0c1..d17184685fa 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -46,7 +46,7 @@ from cudf._lib.column cimport Column
 from cudf._lib.cpp.io.parquet cimport (
 chunked_parquet_writer_options,
 chunked_parquet_writer_options_builder,
- merge_rowgroup_metadata as parquet_merge_metadata,
+ merge_row_group_metadata as parquet_merge_metadata,
 parquet_chunked_writer as cpp_parquet_chunked_writer,
 parquet_reader_options,
parquet_writer_options, @@ -282,7 +282,9 @@ cpdef write_parquet( object compression="snappy", object statistics="ROWGROUP", object metadata_file_path=None, - object int96_timestamps=False): + object int96_timestamps=False, + object row_group_size_bytes=None, + object row_group_size_rows=None): """ Cython function to call into libcudf API, see `write_parquet`. @@ -334,7 +336,6 @@ cpdef write_parquet( cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) - cdef parquet_writer_options args cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef string c_column_chunks_file_path cdef bool _int96_timestamps = int96_timestamps @@ -342,16 +343,21 @@ cpdef write_parquet( c_column_chunks_file_path = str.encode(metadata_file_path) # Perform write + cdef parquet_writer_options args = move( + parquet_writer_options.builder(sink, tv) + .metadata(tbl_meta.get()) + .compression(comp_type) + .stats_level(stat_freq) + .column_chunks_file_path(c_column_chunks_file_path) + .int96_timestamps(_int96_timestamps) + .build() + ) + if row_group_size_bytes is not None: + args.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + args.set_row_group_size_rows(row_group_size_rows) + with nogil: - args = move( - parquet_writer_options.builder(sink, tv) - .metadata(tbl_meta.get()) - .compression(comp_type) - .stats_level(stat_freq) - .column_chunks_file_path(c_column_chunks_file_path) - .int96_timestamps(_int96_timestamps) - .build() - ) out_metadata_c = move(parquet_writer(args)) if metadata_file_path is not None: @@ -483,11 +489,11 @@ cdef class ParquetWriter: cpdef merge_filemetadata(object filemetadata_list): """ - Cython function to call into libcudf API, see `merge_rowgroup_metadata`. + Cython function to call into libcudf API, see `merge_row_group_metadata`. 
See Also
 --------
- cudf.io.parquet.merge_rowgroup_metadata
+ cudf.io.parquet.merge_row_group_metadata
 """
 cdef vector[unique_ptr[vector[uint8_t]]] list_c
 cdef vector[uint8_t] blob_c
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 302021a082f..9d665d9a0a5 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -441,6 +441,8 @@ def to_parquet(
 statistics="ROWGROUP",
 metadata_file_path=None,
 int96_timestamps=False,
+ row_group_size_bytes=None,
+ row_group_size_rows=None,
 *args,
 **kwargs,
 ):
@@ -480,6 +482,8 @@ def to_parquet(
 statistics=statistics,
 metadata_file_path=metadata_file_path,
 int96_timestamps=int96_timestamps,
+ row_group_size_bytes=row_group_size_bytes,
+ row_group_size_rows=row_group_size_rows,
 )
 else:
 write_parquet_res = libparquet.write_parquet(
@@ -490,6 +494,8 @@ def to_parquet(
 statistics=statistics,
 metadata_file_path=metadata_file_path,
 int96_timestamps=int96_timestamps,
+ row_group_size_bytes=row_group_size_bytes,
+ row_group_size_rows=row_group_size_rows,
 )

 return write_parquet_res
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index df31738050b..b6595be9566 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -231,6 +231,11 @@ def _make_parquet_path_or_buf(src):
 yield _make_parquet_path_or_buf


+@pytest.fixture(scope="module")
+def large_int64_gdf():
+ return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)}))
+
+
 @pytest.mark.filterwarnings("ignore:Using CPU")
 @pytest.mark.parametrize("engine", ["pyarrow", "cudf"])
 @pytest.mark.parametrize(
@@ -2170,3 +2175,21 @@ def test_parquet_reader_brotli(datadir):
 got = cudf.read_parquet(fname).to_pandas(nullable=True)

 assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
+@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
+def test_parquet_writer_row_group_size(
+ tmpdir, large_int64_gdf, size_bytes, size_rows
+):
+ fname = tmpdir.join("row_group_size.parquet")
+ large_int64_gdf.to_parquet(
+ fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows
+ )
+
+ num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+ # 8 bytes per row, as the column is int64
+ expected_num_row_groups = max(
+ math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
+ )
+ assert expected_num_row_groups == row_groups
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6746753249c..11994830fed 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -221,6 +221,12 @@
 timestamp[us] to the int96 format, which is the number of Julian days
 and the number of nanoseconds since midnight. If ``False``, timestamps
 will not be altered.
+row_group_size_bytes: integer or None, default None
+ Maximum size of each row group of the output.
+ If None, 134217728 (128MB) will be used.
+row_group_size_rows: integer or None, default None
+ Maximum number of rows of each row group of the output.
+ If None, 1000000 will be used.


 See Also
@@ -404,10 +410,10 @@
 stripe_size_bytes: integer or None, default None
 Maximum size of each stripe of the output.
 If None, 67108864 (64MB) will be used.
-stripe_size_rows: integer or None, default None
+stripe_size_rows: integer or None, default None
 Maximum number of rows of each stripe of the output.
 If None, 1000000 will be used.
-row_index_stride: integer or None, default None 10000 +row_index_stride: integer or None, default None Row index stride (maximum number of rows in each row group). If None, 10000 will be used. From 17e6f5b9d0a9456e82250f725da5fe61ce6c9ff5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 14:58:38 -0800 Subject: [PATCH 06/72] Simplify merge internals and reduce overhead (#9516) This PR is a pretty thorough rewrite of the internals of merging. There is a ton of complexity imposed by matching all the different edge cases allowed by the pandas API, but I've tried to unify the logic for different code paths as much as possible. I've also added checks for a number of edge cases that were not previously being handled. I see about a 10% performance improvement for merges on small to medium data sizes from this PR (as expected, there's no change for large data where most time is spent in C++). There's also a substantial reduction in total code that should make it easier to address issues going forward. I'm still not entirely happy with the complexity of the result and I think that further simplification should be possible, but I think this is a sufficiently large step forward to be worth pushing forward in this state, especially if it helps enable other changes to joining. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9516 --- python/cudf/cudf/core/dataframe.py | 30 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/groupby/groupby.py | 24 - python/cudf/cudf/core/join/__init__.py | 4 +- python/cudf/cudf/core/join/_join_helpers.py | 118 ++-- python/cudf/cudf/core/join/join.py | 505 ++++++++---------- python/cudf/cudf/tests/test_joining.py | 34 +- python/dask_cudf/dask_cudf/tests/test_join.py | 4 - 8 files changed, 288 insertions(+), 440 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2e6588edb2..a95453a4e62 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -598,9 +598,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): - new_df = self._from_columns( - [data], index=index, columns=columns - ) + if columns is not None: + data = dict(zip(columns, [data])) + else: + data = dict(enumerate([data])) + new_df = DataFrame(data=data, index=index) + self._data = new_df._data self.index = new_df._index self.columns = new_df.columns @@ -3760,11 +3763,8 @@ def join( FutureWarning, ) - lhs = self - rhs = other - - df = lhs.merge( - rhs, + df = self.merge( + other, left_index=True, right_index=True, how=how, @@ -3772,7 +3772,7 @@ def join( sort=sort, ) df.index.name = ( - None if lhs.index.name != rhs.index.name else lhs.index.name + None if self.index.name != other.index.name else self.index.name ) return df @@ -5093,18 +5093,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df - @classmethod - def _from_columns(cls, cols, index=None, columns=None): - """ - Construct a DataFrame from a list of Columns - """ - if columns is not None: - data = dict(zip(columns, cols)) - else: - data = dict(enumerate(cols)) - - return cls(data=data, index=index,) - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c0858398492..72239fc2a8e 
100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,7 +46,7 @@ serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor -from cudf.core.join import merge +from cudf.core.join import Merge, MergeSemi from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils @@ -3755,6 +3755,7 @@ def _merge( suffixes=("_x", "_y"), ): lhs, rhs = self, right + merge_cls = Merge if how == "right": # Merge doesn't support right, so just swap how = "left" @@ -3762,8 +3763,10 @@ def _merge( left_on, right_on = right_on, left_on left_index, right_index = right_index, left_index suffixes = (suffixes[1], suffixes[0]) + elif how in {"leftsemi", "leftanti"}: + merge_cls = MergeSemi - return merge( + return merge_cls( lhs, rhs, on=on, @@ -3775,7 +3778,7 @@ def _merge( sort=sort, indicator=indicator, suffixes=suffixes, - ) + ).perform_merge() def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ba69e42674a..dc6461663ce 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1178,18 +1178,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort @@ -1262,18 +1250,6 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def agg(self, func): result = super().agg(func) diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py index 0463b8f9df1..71a91c398ad 100644 --- a/python/cudf/cudf/core/join/__init__.py +++ b/python/cudf/cudf/core/join/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-from cudf.core.join.join import merge +from cudf.core.join.join import Merge, MergeSemi diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index cc9c0fb66da..6dec0b10273 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -3,16 +3,17 @@ import collections import warnings -from typing import TYPE_CHECKING, Any, Iterable, Tuple +from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np -import pandas as pd import cudf +from cudf.api.types import is_dtype_equal +from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype if TYPE_CHECKING: - from cudf.core.column import CategoricalColumn, ColumnBase + from cudf.core.column import ColumnBase from cudf.core.frame import Frame @@ -28,61 +29,36 @@ class _Indexer: # >>> _Indexer("a", column=True).get(df) # returns column "a" of df # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df - def __init__(self, name: Any, column=False, index=False): - if column and index: - raise ValueError("Cannot specify both column and index") + def __init__(self, name: Any): self.name = name - self.column, self.index = column, index + +class _ColumnIndexer(_Indexer): def get(self, obj: Frame) -> ColumnBase: - # get the column from `obj` - if self.column: - return obj._data[self.name] - else: - if obj._index is not None: - return obj._index._data[self.name] - raise KeyError() + return obj._data[self.name] def set(self, obj: Frame, value: ColumnBase, validate=False): - # set the colum in `obj` - if self.column: - obj._data.set_by_label(self.name, value, validate=validate) - else: - if obj._index is not None: - obj._index._data.set_by_label( - self.name, value, validate=validate - ) - else: - raise KeyError() - - -def _frame_select_by_indexers( - frame: Frame, indexers: Iterable[_Indexer] -) -> Frame: - # Select columns from the given `Frame` using `indexers`, - # and return a new `Frame`. - index_data = frame._data.__class__() - data = frame._data.__class__() - - for idx in indexers: - if idx.index: - index_data.set_by_label(idx.name, idx.get(frame), validate=False) - else: - data.set_by_label(idx.name, idx.get(frame), validate=False) + obj._data.set_by_label(self.name, value, validate=validate) - result_index = ( - cudf.core.index._index_from_data(index_data) if index_data else None - ) - result = cudf.core.frame.Frame(data=data, index=result_index) - return result + +class _IndexIndexer(_Indexer): + def get(self, obj: Frame) -> ColumnBase: + if obj._index is not None: + return obj._index._data[self.name] + raise KeyError + + def set(self, obj: Frame, value: ColumnBase, validate=False): + if obj._index is not None: + obj._index._data.set_by_label(self.name, value, validate=validate) + else: + raise KeyError def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # returns the common dtype that lcol and rcol should be casted to, - # before they can be used as left and right join keys. - # If no casting is necessary, returns None + # Casts lcol and rcol to a common dtype for use as join keys. If no casting + # is necessary, they are returned as is. 
common_type = None @@ -91,12 +67,22 @@ def _match_join_keys( rtype = rcol.dtype # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _match_categorical_dtypes(lcol, rcol, how) + left_is_categorical = isinstance(ltype, CategoricalDtype) + right_is_categorical = isinstance(rtype, CategoricalDtype) + if left_is_categorical and right_is_categorical: + return _match_categorical_dtypes_both( + cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how + ) + elif left_is_categorical or right_is_categorical: + if left_is_categorical: + if how in {"left", "leftsemi", "leftanti"}: + return lcol, rcol.astype(ltype) + common_type = ltype.categories.dtype + else: + common_type = rtype.categories.dtype + return lcol.astype(common_type), rcol.astype(common_type) - if pd.api.types.is_dtype_equal(ltype, rtype): + if is_dtype_equal(ltype, rtype): return lcol, rcol if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( @@ -131,34 +117,9 @@ def _match_join_keys( return lcol.astype(common_type), rcol.astype(common_type) -def _match_categorical_dtypes( - lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: - # cast the keys lcol and rcol to a common dtype - # when at least one of them is a categorical type - ltype, rtype = lcol.dtype, rcol.dtype - - if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance( - rcol, cudf.core.column.CategoricalColumn - ): - # if both are categoricals, logic is complicated: - return _match_categorical_dtypes_both(lcol, rcol, how) - - if isinstance(ltype, CategoricalDtype): - if how in {"left", "leftsemi", "leftanti"}: - return lcol, rcol.astype(ltype) - common_type = ltype.categories.dtype - elif isinstance(rtype, CategoricalDtype): - common_type = rtype.categories.dtype - return lcol.astype(common_type), rcol.astype(common_type) - - def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # The commontype depends on both `how` and the specifics of the - # categorical variables to be merged. - ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, @@ -184,9 +145,6 @@ def _match_categorical_dtypes_both( "neither side is ordered" ) - # the following should now always hold - assert not ltype.ordered and not rtype.ordered - if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 28b2d5d8167..dd8f462fb1d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,16 +1,14 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from __future__ import annotations -import functools -from collections import namedtuple -from typing import TYPE_CHECKING, Callable, Tuple +from typing import TYPE_CHECKING, Callable import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( _coerce_to_tuple, - _frame_select_by_indexers, - _Indexer, + _ColumnIndexer, + _IndexIndexer, _match_join_keys, ) @@ -18,47 +16,7 @@ from cudf.core.frame import Frame -def merge( - lhs, - rhs, - *, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - indicator, - suffixes, -): - if how in {"leftsemi", "leftanti"}: - merge_cls = MergeSemi - else: - merge_cls = Merge - mergeobj = merge_cls( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - indicator=indicator, - suffixes=suffixes, - ) - return mergeobj.perform_merge() - - -_JoinKeys = namedtuple("JoinKeys", ["left", "right"]) - - -class Merge(object): - # A namedtuple of indexers representing the left and right keys - _keys: _JoinKeys - +class Merge: # The joiner function must have the following signature: # # def joiner( @@ -71,7 +29,7 @@ class Merge(object): # join key. The `joiner` returns a tuple of two Columns # representing the rows to gather from the left- and right- side # tables respectively. - _joiner: Callable + _joiner: Callable = libcudf.join.join def __init__( self, @@ -133,150 +91,157 @@ def __init__( how=how, suffixes=suffixes, ) - self._joiner = functools.partial(libcudf.join.join, how=how) - - self.lhs = lhs - self.rhs = rhs - self.on = on - self.left_on = left_on - self.right_on = right_on - self.left_index = left_index - self.right_index = right_index + + self.lhs = lhs.copy(deep=False) + self.rhs = rhs.copy(deep=False) self.how = how self.sort = sort - if suffixes: - self.lsuffix, self.rsuffix = suffixes - self._compute_join_keys() - - @property - def _out_class(self): - # type of the result - out_class = cudf.DataFrame + self.lsuffix, self.rsuffix = suffixes + + # At this point validation guarantees that if on is not None we + # don't have any other args, so we can apply it directly to left_on and + # right_on. 
+ self._using_left_index = bool(left_index) + left_on = ( + lhs.index._data.names if left_index else left_on if left_on else on + ) + self._using_right_index = bool(right_index) + right_on = ( + rhs.index._data.names + if right_index + else right_on + if right_on + else on + ) - if isinstance(self.lhs, cudf.MultiIndex) or isinstance( - self.rhs, cudf.MultiIndex + if left_on or right_on: + self._left_keys = [ + _ColumnIndexer(name=on) + if not self._using_left_index and on in lhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(left_on) if left_on else []) + ] + self._right_keys = [ + _ColumnIndexer(name=on) + if not self._using_right_index and on in rhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(right_on) if right_on else []) + ] + if len(self._left_keys) != len(self._right_keys): + raise ValueError( + "Merge operands must have same number of join key columns" + ) + self._using_left_index = any( + isinstance(idx, _IndexIndexer) for idx in self._left_keys + ) + self._using_right_index = any( + isinstance(idx, _IndexIndexer) for idx in self._right_keys + ) + else: + # if `on` is not provided and we're not merging + # index with column or on both indexes, then use + # the intersection of columns in both frames + on_names = set(lhs._data) & set(rhs._data) + self._left_keys = [_ColumnIndexer(name=on) for on in on_names] + self._right_keys = [_ColumnIndexer(name=on) for on in on_names] + self._using_left_index = False + self._using_right_index = False + + if isinstance(lhs, cudf.MultiIndex) or isinstance( + rhs, cudf.MultiIndex ): - out_class = cudf.MultiIndex - elif isinstance(self.lhs, cudf.BaseIndex): - out_class = self.lhs.__class__ - return out_class + self._out_class = cudf.MultiIndex + elif isinstance(lhs, cudf.BaseIndex): + self._out_class = lhs.__class__ + else: + self._out_class = cudf.DataFrame + + self._key_columns_with_same_name = ( + set(_coerce_to_tuple(on)) + if on + else set() + if (self._using_left_index or self._using_right_index) + else set( + [ + lkey.name + for lkey, rkey in zip(self._left_keys, self._right_keys) + if lkey.name == rkey.name + ] + ) + ) def perform_merge(self) -> Frame: - lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) - - left_table = _frame_select_by_indexers(lhs, self._keys.left) - right_table = _frame_select_by_indexers(rhs, self._keys.right) + left_join_cols = {} + right_join_cols = {} + + for left_key, right_key in zip(self._left_keys, self._right_keys): + lcol = left_key.get(self.lhs) + rcol = right_key.get(self.rhs) + lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) + left_join_cols[left_key.name] = lcol_casted + right_join_cols[left_key.name] = rcol_casted + + # Categorical dtypes must be cast back from the underlying codes + # type that was returned by _match_join_keys. 
+ if ( + self.how == "inner" + and isinstance(lcol.dtype, cudf.CategoricalDtype) + and isinstance(rcol.dtype, cudf.CategoricalDtype) + ): + lcol_casted = lcol_casted.astype("category") + rcol_casted = rcol_casted.astype("category") + + left_key.set(self.lhs, lcol_casted, validate=False) + right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - left_table, right_table, how=self.how, + cudf.core.frame.Frame(left_join_cols), + cudf.core.frame.Frame(right_join_cols), + how=self.how, ) - lhs, rhs = self._restore_categorical_keys(lhs, rhs) - left_result = cudf.core.frame.Frame() - right_result = cudf.core.frame.Frame() + gather_index = self._using_left_index or self._using_right_index - gather_index = self.left_index or self.right_index - if left_rows is not None: - left_result = lhs._gather( + left_result = ( + self.lhs._gather( left_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) - if right_rows is not None: - right_result = rhs._gather( + if left_rows is not None + else cudf.core.frame.Frame() + ) + right_result = ( + self.rhs._gather( right_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) + if right_rows is not None + else cudf.core.frame.Frame() + ) - result = self._merge_results(left_result, right_result) + result = self._out_class._from_data( + *self._merge_results(left_result, right_result) + ) if self.sort: result = self._sort_result(result) return result - def _compute_join_keys(self): - # Computes self._keys - left_keys = [] - right_keys = [] - if ( - self.left_index - or self.right_index - or self.left_on - or self.right_on - ): - if self.left_index: - left_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.lhs.index._data.names - ] - ) - if self.left_on: - # TODO: require left_on or left_index to be specified - left_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.left_on) - ] - ) - if self.right_index: - right_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.rhs.index._data.names - ] - ) - if self.right_on: - # TODO: require right_on or right_index to be specified - right_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.right_on) - ] - ) - elif self.on: - on_names = _coerce_to_tuple(self.on) - for on in on_names: - # If `on` is provided, Merge on columns if present, - # otherwise default to indexes. - if on in self.lhs._data: - left_keys.append(_Indexer(name=on, column=True)) - else: - left_keys.append(_Indexer(name=on, index=True)) - if on in self.rhs._data: - right_keys.append(_Indexer(name=on, column=True)) - else: - right_keys.append(_Indexer(name=on, index=True)) - - else: - # if `on` is not provided and we're not merging - # index with column or on both indexes, then use - # the intersection of columns in both frames - on_names = set(self.lhs._data) & set(self.rhs._data) - left_keys = [_Indexer(name=on, column=True) for on in on_names] - right_keys = [_Indexer(name=on, column=True) for on in on_names] - - if len(left_keys) != len(right_keys): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - - self._keys = _JoinKeys(left=left_keys, right=right_keys) - - def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: + def _merge_results(self, left_result: Frame, right_result: Frame): # Merge the Frames `left_result` and `right_result` into a single # `Frame`, suffixing column names if necessary. 
# If two key columns have the same name, a single output column appears - # in the result. For all other join types, the key column from the rhs - # is simply dropped. For outer joins, the two key columns are combined - # by filling nulls in the left key column with corresponding values - # from the right key column: + # in the result. For all non-outer join types, the key column from the + # rhs is simply dropped. For outer joins, the two key columns are + # combined by filling nulls in the left key column with corresponding + # values from the right key column: if self.how == "outer": - for lkey, rkey in zip(*self._keys): + for lkey, rkey in zip(self._left_keys, self._right_keys): if lkey.name == rkey.name: # fill nulls in lhs from values in the rhs lkey.set( @@ -285,36 +250,26 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: validate=False, ) - # Compute the result column names: - # left_names and right_names will be a mappings of input column names - # to the corresponding names in the final result. - left_names = dict(zip(left_result._data, left_result._data)) - right_names = dict(zip(right_result._data, right_result._data)) - - # For any columns from left_result and right_result that have the same - # name: - # - if they are key columns, keep only the left column - # - if they are not key columns, use suffixes to differentiate them - # in the final result - common_names = set(left_names) & set(right_names) - - if self.on: - key_columns_with_same_name = self.on - else: - key_columns_with_same_name = [ - lkey.name - for lkey, rkey in zip(*self._keys) - if ( - (lkey.index, rkey.index) == (False, False) - and lkey.name == rkey.name - ) - ] - for name in common_names: - if name not in key_columns_with_same_name: - left_names[name] = f"{name}{self.lsuffix}" - right_names[name] = f"{name}{self.rsuffix}" + # All columns from the left table make it into the output. Non-key + # columns that share a name with a column in the right table are + # suffixed with the provided suffix. + common_names = set(left_result._data.names) & set( + right_result._data.names + ) + cols_to_suffix = common_names - self._key_columns_with_same_name + data = { + (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col + for name, col in left_result._data.items() + } + + # The right table follows the same rule as the left table except that + # key columns from the right table are removed. + for name, col in right_result._data.items(): + if name in common_names: + if name not in self._key_columns_with_same_name: + data[f"{name}{self.rsuffix}"] = col else: - del right_names[name] + data[name] = col # determine if the result has multiindex columns. 
The result # of a join has a MultiIndex as its columns if: @@ -333,69 +288,44 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: else: multiindex_columns = False - # Assemble the data columns of the result - data = left_result._data.__class__(multiindex=multiindex_columns) - - for lcol in left_names: - data.set_by_label( - left_names[lcol], left_result._data[lcol], validate=False - ) - for rcol in right_names: - data.set_by_label( - right_names[rcol], right_result._data[rcol], validate=False - ) - - # Index of the result: - if self.left_index and self.right_index: + if self._using_right_index: + # right_index and left_on index = left_result._index - elif self.left_index: + elif self._using_left_index: # left_index and right_on index = right_result._index - elif self.right_index: - # right_index and left_on - index = left_result._index else: index = None # Construct result from data and index: - result = self._out_class._from_data(data=data, index=index) - - return result + return ( + left_result._data.__class__( + data=data, multiindex=multiindex_columns + ), + index, + ) def _sort_result(self, result: Frame) -> Frame: # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - if self.on: - if isinstance(result, cudf.BaseIndex): - sort_order = result._get_sorted_inds() - else: - # need a list instead of a tuple here because - # _get_sorted_inds calls down to ColumnAccessor.get_by_label - # which handles lists and tuples differently - sort_order = result._get_sorted_inds( - list(_coerce_to_tuple(self.on)) - ) - return result._gather( - sort_order, keep_index=False, check_bounds=False - ) by = [] - if self.left_index and self.right_index: + if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) - if self.left_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.left_on)] - ) - if self.right_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.right_on)] - ) + if not self._using_left_index: + by.extend([result._data[col.name] for col in self._left_keys]) + if not self._using_right_index: + by.extend([result._data[col.name] for col in self._right_keys]) if by: - to_sort = cudf.DataFrame._from_columns(by) + to_sort = cudf.DataFrame._from_data(dict(enumerate(by))) sort_order = to_sort.argsort() - result = result._gather(sort_order, check_bounds=False) + result = result._gather( + sort_order, + keep_index=self._using_left_index or self._using_right_index, + check_bounds=False, + ) return result @staticmethod @@ -410,10 +340,9 @@ def _validate_merge_params( how, suffixes, ): - """ - Error for various invalid combinations of merge input parameters - """ - # must actually support the requested merge type + # Error for various invalid combinations of merge input parameters + + # We must actually support the requested merge type if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: raise NotImplementedError(f"{how} merge not supported yet") @@ -424,15 +353,55 @@ def _validate_merge_params( 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) + elif left_index or right_index: + # Passing 'on' with 'left_index' or 'right_index' is ambiguous + raise ValueError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' 
+ ) else: # the validity of 'on' being checked by _Indexer return + elif left_on and left_index: + raise ValueError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + elif right_on and right_index: + raise ValueError( + 'Can only pass argument "right_on" OR "right_index" not both.' + ) + + # Can't merge on a column name that is present in both a frame and its + # indexes. + if on: + for key in on: + if (key in lhs._data and key in lhs.index._data) or ( + key in rhs._data and key in rhs.index._data + ): + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if left_on: + for key in left_on: + if key in lhs._data and key in lhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if right_on: + for key in right_on: + if key in rhs._data and key in rhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) # Can't merge on unnamed Series if (isinstance(lhs, cudf.Series) and not lhs.name) or ( isinstance(rhs, cudf.Series) and not rhs.name ): - raise ValueError("Can not merge on unnamed Series") + raise ValueError("Cannot merge on unnamed Series") # If nothing specified, must have common cols to use implicitly same_named_columns = set(lhs._data) & set(rhs._data) @@ -459,59 +428,15 @@ def _validate_merge_params( "lsuffix and rsuffix are not defined" ) - def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: - # Match the dtypes of the key columns from lhs and rhs - out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - for left_key, right_key in zip(*self._keys): - lcol, rcol = left_key.get(lhs), right_key.get(rhs) - lcol_casted, rcol_casted = _match_join_keys( - lcol, rcol, how=self.how - ) - if lcol is not lcol_casted: - left_key.set(out_lhs, lcol_casted, validate=False) - if rcol is not rcol_casted: - right_key.set(out_rhs, rcol_casted, validate=False) - return out_lhs, out_rhs - - def _restore_categorical_keys( - self, lhs: Frame, rhs: Frame - ) -> Tuple[Frame, Frame]: - # For inner joins, any categorical keys in `self.lhs` and `self.rhs` - # were casted to their category type to produce `lhs` and `rhs`. - # Here, we cast them back. 
- out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - if self.how == "inner": - for left_key, right_key in zip(*self._keys): - if isinstance( - left_key.get(self.lhs).dtype, cudf.CategoricalDtype - ) and isinstance( - right_key.get(self.rhs).dtype, cudf.CategoricalDtype - ): - left_key.set( - out_lhs, - left_key.get(out_lhs).astype("category"), - validate=False, - ) - right_key.set( - out_rhs, - right_key.get(out_rhs).astype("category"), - validate=False, - ) - return out_lhs, out_rhs - class MergeSemi(Merge): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._joiner = functools.partial( - libcudf.join.semi_join, how=kwargs["how"] - ) + _joiner: Callable = libcudf.join.semi_join - def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + def _merge_results(self, lhs: Frame, rhs: Frame): # semi-join result includes only lhs columns - if issubclass(self._out_class, cudf.Index): - return self._out_class._from_data(lhs._data) - else: - return self._out_class._from_data(lhs._data, index=lhs._index) + return ( + lhs._data, + lhs._index + if not issubclass(self._out_class, cudf.Index) + else None, + ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index e9f55c9e51a..0518cc2c9b9 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -230,10 +230,7 @@ def test_dataframe_join_combine_cats(): expect.index = expect.index.astype("category") got = lhs.join(rhs, how="outer") - # TODO: Remove copying to host - # after https://github.com/rapidsai/cudf/issues/5676 - # is implemented - assert_eq(expect.index.sort_values(), got.index.to_pandas().sort_values()) + assert_eq(expect.index.sort_values(), got.index.sort_values()) @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) @@ -744,12 +741,6 @@ def test_merge_sort(ons, hows): [ {"left_on": ["a"], "left_index": False, "right_index": True}, {"right_on": ["b"], "left_index": True, "right_index": False}, - { - "left_on": ["a"], - "right_on": ["b"], - "left_index": True, - "right_index": True, - }, ], ) def test_merge_sort_on_indexes(kwargs): @@ -1791,12 +1782,6 @@ def test_typecast_on_join_indexes_matching_categorical(): {"left_index": True, "right_on": "b"}, {"left_on": "a", "right_index": True}, {"left_index": True, "right_index": True}, - { - "left_on": "a", - "right_on": "b", - "left_index": True, - "right_index": True, - }, ], ) def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): @@ -2148,3 +2133,20 @@ def test_join_on_index_with_duplicate_names(): got = lhs.join(rhs, how="inner") assert_join_results_equal(expect, got, how="inner") + + +def test_join_redundant_params(): + lhs = cudf.DataFrame( + {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") + ) + rhs = cudf.DataFrame( + {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") + ) + with pytest.raises(ValueError): + lhs.merge(rhs, on="a", left_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, right_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="c", right_on="b") diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 58811ee98fc..8b2d85c59d7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -245,8 +245,6 @@ def 
test_merge_should_fail():
 left.merge(right, how="left", on=["b"])
 with pytest.raises(KeyError):
 left.merge(right, how="left", on=["c"])
- with pytest.raises(KeyError):
- left.merge(right, how="left", on=["a"])

 # Same column names
 df2["b"] = np.random.randint(0, 12, 12)

 with pytest.raises(KeyError):
 left.merge(right, how="left", on="NonCol")
- with pytest.raises(KeyError):
- left.merge(right, how="left", on="a")


 @pytest.mark.parametrize("how", ["inner", "left"])

From 32bacfaa0a75fd3fb5fb44b106d8138f83001184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?=
Date: Thu, 18 Nov 2021 00:24:07 +0100
Subject: [PATCH 07/72] Interchange dataframe protocol (#9071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is a basic implementation of the [interchange dataframe protocol](https://github.com/data-apis/dataframe-api/blob/main/protocol/dataframe_protocol.py) for cudf.

As is well known, there are many dataframe libraries out there, where one library's weakness is handled by another. To work across these libraries, we currently rely on `pandas` with methods like `from_pandas` and `to_pandas`. This is a bad design, as every library has to maintain an additional dependency on pandas and its peculiarities. This protocol provides a high-level API that dataframe libraries must implement to allow communication between them. Thus, we get rid of the tight coupling with pandas and depend only on the protocol API, where each library keeps the freedom of its implementation details.

To illustrate:

- `df_obj = cudf_dataframe.__dataframe__()`: `df_obj` can be consumed by any library implementing the protocol.
- `df = cudf.from_dataframe(any_supported_dataframe)`: here we create a `cudf` dataframe from any dataframe object supporting the protocol.

So far, it supports the following:

- Column dtypes: `uint8`, `int`, `float`, `bool` and `categorical`.
- Missing values are handled for all these dtypes.
- `string` support is on the way.

Additionally, we support dataframes from CPU devices, like `pandas`. This is not testable here, as pandas has not yet adopted the protocol; we have tested it locally with a monkey-patched pandas implementation of the protocol.
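As a rough sketch of the round trip described above (illustrative only, not part of this patch; `other_df` is a hypothetical stand-in for a dataframe from any other protocol-compliant library):

```python
import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [0.5, None, 2.5]})

# Producer side: expose an interchange object that any
# protocol-aware consumer can traverse column by column.
df_obj = gdf.__dataframe__(allow_copy=True)

# Consumer side: build a cudf DataFrame from any object that
# implements `__dataframe__`; `other_df` is hypothetical here.
# new_gdf = cudf.from_dataframe(other_df, allow_copy=True)
```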
Authors: - Ismaël Koné (https://github.com/iskode) - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9071 --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/dataframe.py | 13 +- python/cudf/cudf/core/df_protocol.py | 829 +++++++++++++++++++++ python/cudf/cudf/tests/test_df_protocol.py | 219 ++++++ 4 files changed, 1061 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/core/df_protocol.py create mode 100644 python/cudf/cudf/tests/test_df_protocol.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index bc35551b5bd..f696a00d1ed 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -42,7 +42,7 @@ UInt64Index, interval_range, ) -from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.series import Series from cudf.core.multiindex import MultiIndex from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a95453a4e62..bfbe8b06c17 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,7 +40,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core import column, reshape +from cudf.core import column, df_protocol, reshape from cudf.core.abc import Serializable from cudf.core.column import ( as_column, @@ -6329,6 +6329,17 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ): + return df_protocol.__dataframe__( + self, nan_as_null=nan_as_null, allow_copy=allow_copy + ) + + +def from_dataframe(df, allow_copy=False): + return df_protocol.from_dataframe(df, allow_copy=allow_copy) + def make_binop_func(op, postprocess=None): # This function is used to wrap binary operations in Frame with an diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py new file mode 100644 index 00000000000..8f258ce27b2 --- /dev/null +++ b/python/cudf/cudf/core/df_protocol.py @@ -0,0 +1,829 @@ +import collections +import enum +from typing import ( + Any, + Dict, + Iterable, + Mapping, + Optional, + Sequence, + Tuple, + cast, +) + +import cupy as cp +import numpy as np +from numba.cuda import as_cuda_array + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import as_column, build_categorical_column, build_column + +# Implementation of interchange protocol classes +# ---------------------------------------------- + + +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class _Device(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +_SUPPORTED_KINDS = { + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.CATEGORICAL, + _DtypeKind.BOOL, + _DtypeKind.STRING, +} +ProtoDtype = Tuple[_DtypeKind, int, str, str] + + +class _CuDFBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buf: cudf.core.buffer.Buffer, + dtype: np.dtype, + allow_copy: bool = True, + ) -> None: + """ + Use cudf.core.buffer.Buffer object. 
+ """ + # Store the cudf buffer where the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._buf = buf + self._dtype = dtype + self._allow_copy = allow_copy + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buf.nbytes + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buf.ptr + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + """ + try: + cudarray = as_cuda_array(self._buf).view(self._dtype) + res = cp.asarray(cudarray).toDlpack() + + except ValueError: + raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") + + return res + + def __dlpack_device__(self) -> Tuple[_Device, int]: + """ + _Device type and _Device ID for where the data in the buffer resides. + """ + return (_Device.CUDA, cp.asarray(self._buf).device.id) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(" + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "dlpack": self.__dlpack__(), + "device": self.__dlpack_device__()[0].name, + } + ) + +")" + + +class _CuDFColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__( + self, + column: cudf.core.column.ColumnBase, + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, cudf.core.column.ColumnBase): + raise TypeError( + "column must be a subtype of df.core.column.ColumnBase," + f"got {type(column)}" + ) + self._col = column + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self) -> ProtoDtype: + """ + Dtype description as a tuple + ``(kind, bit-width, format string, endianness)`` + + Kind : + + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible + (hence the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case + in the future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, + and for categoricals. 
+ - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding + of the categorical (e.g. an integer to string mapping), + this can be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, + binary, decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), + # then we have an array of strings + if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O": + return (_DtypeKind.STRING, 8, "u", "=") + + return self._dtype_from_cudfdtype(dtype) + + def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + _np_kinds = { + "i": _DtypeKind.INT, + "u": _DtypeKind.UINT, + "f": _DtypeKind.FLOAT, + "b": _DtypeKind.BOOL, + "U": _DtypeKind.STRING, + "M": _DtypeKind.DATETIME, + "m": _DtypeKind.DATETIME, + } + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy/CuPy dtype. Check if it's a categorical maybe + if isinstance(dtype, cudf.CategoricalDtype): + kind = _DtypeKind.CATEGORICAL + # Codes and categories' dtypes are different. + # We use codes' dtype as these are stored in the buffer. + codes = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = codes.dtype + else: + raise ValueError( + f"Data type {dtype} not supported by exchange protocol" + ) + + if kind not in _SUPPORTED_KINDS: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "=" + return (kind, bitwidth, format_str, endianness) + + @property + def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary + indices is semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError( + "`describe_categorical only works on " + "a column with categorical dtype!" + ) + categ_col = cast(cudf.core.column.CategoricalColumn, self._col) + ordered = bool(categ_col.dtype.ordered) + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient + categories = categ_col.categories + mapping = {ix: val for ix, val in enumerate(categories.values_host)} + return ordered, is_dictionary, mapping + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. 
+ If kind is a bit mask or a byte mask, the value (0 or 1) + indicating a missing value. + None otherwise. + """ + kind = self.dtype[0] + if self.null_count == 0: + # there is no validity mask so it is non-nullable + return 0, None + + elif kind in _SUPPORTED_KINDS: + # bit mask is universally used in cudf for missing + return 3, 0 + + else: + raise NotImplementedError( + f"Data type {self.dtype} not yet supported" + ) + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.null_count + + @property + def metadata(self) -> Dict[str, Any]: + """ + Store specific metadata of the column. + """ + return {} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFColumn"]: + """ + Return an iterable yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_buffers( + self, + ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + try: + buffers["validity"] = self._get_validity_buffer() + except RuntimeError: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except RuntimeError: + buffers["offsets"] = None + + buffers["data"] = self._get_data_buffer() + + return buffers + + def _get_validity_buffer( + self, + ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the mask values + indicating missing data and the buffer's associated dtype. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + + null, invalid = self.describe_null + if null == 3: + if self.dtype[0] == _DtypeKind.CATEGORICAL: + valid_mask = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes._get_mask_as_column() + else: + valid_mask = self._col._get_mask_as_column() + + assert (valid_mask is not None) and ( + valid_mask.data is not None + ), "valid_mask(.data) should not be None when " + "_CuDFColumn.describe_null[0] = 3" + buffer = _CuDFBuffer( + valid_mask.data, cp.uint8, allow_copy=self._allow_copy + ) + dtype = (_DtypeKind.UINT, 8, "C", "=") + return buffer, dtype + + elif null == 1: + raise RuntimeError( + "This column uses NaN as null " + "so does not have a separate mask" + ) + elif null == 0: + raise RuntimeError( + "This column is non-nullable so does not have a mask" + ) + else: + raise NotImplementedError( + f"See {self.__class__.__name__}.describe_null method." 
+ ) + + def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the offset values for + variable-size binary data (e.g., variable-length strings) + and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + if self.dtype[0] == _DtypeKind.STRING: + offsets = self._col.children[0] + assert (offsets is not None) and (offsets.data is not None), " " + "offsets(.data) should not be None for string column" + + buffer = _CuDFBuffer( + offsets.data, offsets.dtype, allow_copy=self._allow_copy + ) + dtype = self._dtype_from_cudfdtype(offsets.dtype) + else: + raise RuntimeError( + "This column has a fixed-length dtype " + "so does not have an offsets buffer" + ) + + return buffer, dtype + + def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + """ + Return the buffer containing the data and + the buffer's associated dtype. + """ + if self.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + col_data = self._col + dtype = self.dtype + + elif self.dtype[0] == _DtypeKind.CATEGORICAL: + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + elif self.dtype[0] == _DtypeKind.STRING: + col_data = self._col.children[1] + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + else: + raise NotImplementedError( + f"Data type {self._col.dtype} not handled yet" + ) + assert (col_data is not None) and (col_data.data is not None), " " + f"col_data(.data) should not be None when dtype = {dtype}" + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) + + return buffer, dtype + + +class _CuDFDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``cudf.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, + df: "cudf.core.dataframe.DataFrame", + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `cudf.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with + # ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as cuDF-specific metadata here. 
+        return {"cudf.index": self._df.index}
+
+    def num_columns(self) -> int:
+        return len(self._df.columns)
+
+    def num_rows(self) -> int:
+        return len(self._df)
+
+    def num_chunks(self) -> int:
+        return 1
+
+    def column_names(self) -> Iterable[str]:
+        return self._df.columns.tolist()
+
+    def get_column(self, i: int) -> _CuDFColumn:
+        return _CuDFColumn(
+            as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy
+        )
+
+    def get_column_by_name(self, name: str) -> _CuDFColumn:
+        return _CuDFColumn(
+            as_column(self._df[name]), allow_copy=self._allow_copy
+        )
+
+    def get_columns(self) -> Iterable[_CuDFColumn]:
+        return [
+            _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy)
+            for name in self._df.columns
+        ]
+
+    def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame":
+        if not isinstance(indices, collections.abc.Sequence):
+            raise ValueError("`indices` is not a sequence")
+
+        return _CuDFDataFrame(self._df.iloc[:, indices])
+
+    def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame":
+        if not isinstance(names, collections.abc.Sequence):
+            raise ValueError("`names` is not a sequence")
+
+        return _CuDFDataFrame(
+            self._df.loc[:, names], self._nan_as_null, self._allow_copy
+        )
+
+    def get_chunks(
+        self, n_chunks: Optional[int] = None
+    ) -> Iterable["_CuDFDataFrame"]:
+        """
+        Return an iterator yielding the chunks.
+        """
+        return (self,)
+
+
+def __dataframe__(
+    self, nan_as_null: bool = False, allow_copy: bool = True
+) -> _CuDFDataFrame:
+    """
+    The public method to attach to cudf.DataFrame.
+
+    ``nan_as_null`` is a keyword intended for the consumer to tell the
+    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+    This currently has no effect; once support for nullable extension
+    dtypes is added, this value should be propagated to columns.
+
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data. For example, copying data would be
+    necessary if a library supports strided buffers, given that this protocol
+    specifies contiguous buffers.
+    """
+    return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy)
+
+
+"""
+Implementation of the dataframe exchange protocol.
+
+Public API
+----------
+
+from_dataframe : construct a cudf.DataFrame from an input data frame which
+    implements the exchange protocol
+
+Notes
+-----
+
+- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
+  do in pure Python. It's more general but definitely less friendly than
+  having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
+  this is worth looking at again.
+
+"""
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
+DataFrameObject = Any +ColumnObject = Any + + +_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} +_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} +_FLOATS = {32: cp.float32, 64: cp.float64} +_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} + + +def from_dataframe( + df: DataFrameObject, allow_copy: bool = False +) -> _CuDFDataFrame: + """ + Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + """ + if isinstance(df, cudf.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: + """ + Create a cudf DataFrame object from DataFrameObject. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError("More than one chunk not handled yet") + + # We need a dict of columns here, with each column being a cudf column. + columns = dict() + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + col = df.get_column_by_name(name) + + if col.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + columns[name], _buf = _protocol_to_cudf_column_numeric(col) + + elif col.dtype[0] == _DtypeKind.CATEGORICAL: + columns[name], _buf = _protocol_to_cudf_column_categorical(col) + + elif col.dtype[0] == _DtypeKind.STRING: + columns[name], _buf = _protocol_to_cudf_column_string(col) + + else: + raise NotImplementedError( + f"Data type {col.dtype[0]} not handled yet" + ) + + _buffers.append(_buf) + + df_new = cudf.DataFrame._from_data(columns) + df_new._buffers = _buffers + return df_new + + +def _protocol_to_cudf_column_numeric( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert an int, uint, float or bool protocol column + to the corresponding cudf column + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + _dbuffer, _ddtype = buffers["data"] + _check_buffer_is_on_gpu(_dbuffer) + cudfcol_num = build_column( + Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtype_to_cupy_dtype(_ddtype), + ) + return _set_missing_values(col, cudfcol_num), buffers + + +def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: + if ( + buffer.__dlpack_device__()[0] != _Device.CUDA + and not buffer._allow_copy + ): + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) + + elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: + raise NotImplementedError( + "Only cuDF/GPU dataframes are supported for now. " + "CPU (like `Pandas`) dataframes will be supported shortly." 
+ ) + + +def _set_missing_values( + protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase +) -> cudf.core.column.ColumnBase: + + valid_mask = protocol_col.get_buffers()["validity"] + if valid_mask is not None: + bitmask = cp.asarray( + Buffer(valid_mask[0].ptr, valid_mask[0].bufsize), cp.bool8 + ) + cudf_col[~bitmask] = None + + return cudf_col + + +def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in _SUPPORTED_KINDS: + raise RuntimeError(f"Data type {_dtype[0]} not handled yet") + + return _CP_DTYPES[kind][bitwidth] + + +def _protocol_to_cudf_column_categorical( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError( + "Non-dictionary categoricals not supported yet" + ) + + categories = as_column(mapping.values()) + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + codes_buffer, codes_dtype = buffers["data"] + _check_buffer_is_on_gpu(codes_buffer) + cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) + codes = build_column( + Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype + ) + + cudfcol = build_categorical_column( + categories=categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + ordered=ordered, + ) + + return _set_missing_values(col, cudfcol), buffers + + +def _protocol_to_cudf_column_string( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a string ColumnObject to cudf Column object. 
+    """
+    # Retrieve the data buffers
+    buffers = col.get_buffers()
+
+    # Retrieve the data buffer containing the UTF-8 code units
+    assert buffers["data"] is not None, "data buffer should never be None"
+    data_buffer, data_dtype = buffers["data"]
+    _check_buffer_is_on_gpu(data_buffer)
+    encoded_string = build_column(
+        Buffer(data_buffer.ptr, data_buffer.bufsize),
+        protocol_dtype_to_cupy_dtype(data_dtype),
+    )
+
+    # Retrieve the offsets buffer containing the index offsets demarcating
+    # the beginning and end of each string
+    assert buffers["offsets"] is not None, "not possible for string column"
+    offset_buffer, offset_dtype = buffers["offsets"]
+    _check_buffer_is_on_gpu(offset_buffer)
+    offsets = build_column(
+        Buffer(offset_buffer.ptr, offset_buffer.bufsize),
+        protocol_dtype_to_cupy_dtype(offset_dtype),
+    )
+
+    cudfcol_str = build_column(
+        None, dtype=cp.dtype("O"), children=(offsets, encoded_string)
+    )
+    return _set_missing_values(col, cudfcol_str), buffers
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
new file mode 100644
index 00000000000..d24c8ca2860
--- /dev/null
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -0,0 +1,219 @@
+from typing import Any, Tuple
+
+import cupy as cp
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.core.buffer import Buffer
+from cudf.core.column import build_column
+from cudf.core.df_protocol import (
+    DataFrameObject,
+    _CuDFBuffer,
+    _CuDFColumn,
+    _DtypeKind,
+    _from_dataframe,
+    protocol_dtype_to_cupy_dtype,
+)
+from cudf.testing._utils import assert_eq
+
+
+def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
+    buf, dtype = buffer_and_dtype
+    device_id = cp.asarray(cudfcol.data).device.id
+    assert buf.__dlpack_device__() == (2, device_id)
+    col_from_buf = build_column(
+        Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype)
+    )
+    # check that non-null values are equal, since nulls are represented
+    # by sentinel values in the buffer.
+ non_null_idxs = cudf.Series(cudfcol) != cudf.NA + assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) + + if dtype[0] != _DtypeKind.BOOL: + array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) + col_array = cp.asarray(cudfcol.data_array_view) + assert_eq(array_from_dlpack.flatten(), col_array.flatten()) + else: + pytest.raises(TypeError, buf.__dlpack__) + + +def assert_column_equal(col: _CuDFColumn, cudfcol): + assert col.size == cudfcol.size + assert col.offset == 0 + assert col.null_count == cudfcol.null_count + assert col.num_chunks() == 1 + if col.null_count == 0: + pytest.raises(RuntimeError, col._get_validity_buffer) + assert col.get_buffers()["validity"] is None + else: + assert_buffer_equal( + col.get_buffers()["validity"], + cudfcol._get_mask_as_column().astype(cp.uint8), + ) + + if col.dtype[0] == _DtypeKind.CATEGORICAL: + assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes) + assert col.get_buffers()["offsets"] is None + + elif col.dtype[0] == _DtypeKind.STRING: + assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1]) + assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0]) + + else: + assert_buffer_equal(col.get_buffers()["data"], cudfcol) + assert col.get_buffers()["offsets"] is None + + if col.null_count == 0: + assert col.describe_null == (0, None) + else: + assert col.describe_null == (3, 0) + + +def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): + assert dfo.num_columns() == len(df.columns) + assert dfo.num_rows() == len(df) + assert dfo.num_chunks() == 1 + assert dfo.column_names() == list(df.columns) + for col in df.columns: + assert_column_equal(dfo.get_column_by_name(col), df[col]._column) + + +def assert_from_dataframe_equals(dfobj): + df2 = _from_dataframe(dfobj) + + assert_dataframe_equal(dfobj, df2) + if isinstance(dfobj._df, cudf.DataFrame): + assert_eq(dfobj._df, df2) + + elif isinstance(dfobj._df, pd.DataFrame): + assert_eq(cudf.DataFrame(dfobj._df), df2) + + else: + raise TypeError(f"{type(dfobj._df)} not supported yet.") + + +def assert_from_dataframe_exception(dfobj): + exception_msg = "This operation must copy data from CPU to GPU." + " Set `allow_copy=True` to allow it." 
+ with pytest.raises(TypeError, match=exception_msg): + _from_dataframe(dfobj) + + +def assert_df_unique_dtype_cols(data): + cdf = cudf.DataFrame(data=data) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_from_dataframe(): + data = dict(a=[1, 2, 3], b=[9, 10, 11]) + df1 = cudf.DataFrame(data=data) + df2 = cudf.from_dataframe(df1) + assert_eq(df1, df2) + + +def test_int_dtype(): + data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) + assert_df_unique_dtype_cols(data_int) + + +def test_float_dtype(): + data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) + assert_df_unique_dtype_cols(data_float) + + +def test_categorical_dtype(): + cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) + cdf["A"] = cdf["A"].astype("category") + col = cdf.__dataframe__().get_column_by_name("A") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_bool_dtype(): + data_bool = dict(a=[True, True, False], b=[False, True, False]) + assert_df_unique_dtype_cols(data_bool) + + +def test_string_dtype(): + data_string = dict(a=["a", "b", "cdef", "", "g"]) + assert_df_unique_dtype_cols(data_string) + + +def test_mixed_dtype(): + data_mixed = dict( + int=[1, 2, 3], + float=[1.5, 2.5, 3.5], + bool=[True, False, True], + categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"], + ) + assert_df_unique_dtype_cols(data_mixed) + + +def test_NA_int_dtype(): + data_int = dict( + a=[1, None, 3, None, 5], + b=[9, 10, None, 7, 8], + c=[6, 19, 20, 100, 1000], + ) + assert_df_unique_dtype_cols(data_int) + + +def test_NA_float_dtype(): + data_float = dict( + a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c=[6.1, 19.2, 20.3, 100.4, 1000.5], + ) + assert_df_unique_dtype_cols(data_float) + + +def test_NA_categorical_dtype(): + df = cudf.DataFrame({"A": [1, 2, 5, 1]}) + df["B"] = df["A"].astype("category") + df.at[[1, 3], "B"] = None # Set two items to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 2 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_bool_dtype(): + data_bool = dict(a=[None, True, False], b=[False, None, None]) + assert_df_unique_dtype_cols(data_bool) + + +def test_NA_string_dtype(): + df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = cudf.NA # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_mixed_dtype(): + data_mixed = dict( + int=[1, None, 2, 3, 1000], + float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df 
protocol", None],
+    )
+    assert_df_unique_dtype_cols(data_mixed)

From d623c9376a1fd7e373247dcad2f9ac9e53654301 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 17 Nov 2021 19:01:13 -0600
Subject: [PATCH 08/72] change minimum pin of cupy (#9636)

This PR updates the minimum version pin of `cupy` to one that is compatible with CEC.
---
 conda/environments/cudf_dev_cuda11.0.yml | 2 +-
 conda/environments/cudf_dev_cuda11.2.yml | 2 +-
 conda/environments/cudf_dev_cuda11.5.yml | 2 +-
 conda/recipes/cudf/meta.yaml | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 803e4f0ba26..5954525b924 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -9,7 +9,7 @@ channels:
 dependencies:
   - clang=11.0.0
   - clang-tools=11.0.0
-  - cupy>7.1.0,<10.0.0a0
+  - cupy>=9.5.0,<10.0.0a0
   - rmm=21.12.*
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index 2281d361ebd..0d1989fa213 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -9,7 +9,7 @@ channels:
 dependencies:
   - clang=11.0.0
   - clang-tools=11.0.0
-  - cupy>7.1.0,<10.0.0a0
+  - cupy>=9.5.0,<10.0.0a0
   - rmm=21.12.*
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
index 63800fe786b..d759ca94fbf 100644
--- a/conda/environments/cudf_dev_cuda11.5.yml
+++ b/conda/environments/cudf_dev_cuda11.5.yml
@@ -9,7 +9,7 @@ channels:
 dependencies:
   - clang=11.0.0
   - clang-tools=11.0.0
-  - cupy>7.1.0,<10.0.0a0
+  - cupy>=9.5.0,<10.0.0a0
   - rmm=21.12.*
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 615135a6f6b..6d56b0c0c94 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -40,7 +40,7 @@ requirements:
     - python
     - typing_extensions
     - pandas >=1.0,<1.4.0dev0
-    - cupy >7.1.0,<10.0.0a0
+    - cupy >=9.5.0,<10.0.0a0
    - numba >=0.53.1
    - numpy
    - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda

From d4ff5185d10a988e26b9a32affed0ca5af821e78 Mon Sep 17 00:00:00 2001
From: Christopher Harris
Date: Thu, 18 Nov 2021 00:07:28 -0600
Subject: [PATCH 09/72] Simplify write_csv by removing unnecessary writer/impl
 classes (#9089)

Depends on #9040 and (unfortunately) #9041

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/9089
---
 cpp/include/cudf/io/detail/csv.hpp | 56 ++++--------
 cpp/src/io/csv/durations.hpp | 39 +++++++++
 cpp/src/io/csv/writer_impl.cu | 113 +++++++++++++-------------
 cpp/src/io/csv/writer_impl.hpp | 122 -----------------------------
 cpp/src/io/functions.cpp | 10 ++-
 5 files changed, 116 insertions(+), 224 deletions(-)
 create mode 100644 cpp/src/io/csv/durations.hpp
 delete mode 100644 cpp/src/io/csv/writer_impl.hpp

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index aac44bed50e..c190340f6c1 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,45 +40,23 @@ table_with_metadata read_csv(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -class writer { - public: - class impl; - - private: - std::unique_ptr _impl; - - public: - /** - * @brief Constructor for output to a file. - * - * @param sinkp The data sink to write the data to - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - writer(std::unique_ptr sinkp, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); // cannot provide definition here (because - // _impl is incomplete hence unique_ptr has - // not enough sizeof() info) - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); +/** + * @brief Write an entire dataset to CSV format. + * + * @param sink Output sink + * @param table The set of columns + * @param metadata The metadata associated with the table + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_csv(data_sink* sink, + table_view const& table, + const table_metadata* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Writes the entire dataset. - * - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace csv } // namespace detail } // namespace io diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp new file mode 100644 index 00000000000..d42ddf3817c --- /dev/null +++ b/cpp/src/io/csv/durations.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { +namespace csv { + +std::unique_ptr pandas_format_durations( + column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace csv +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index e8c673751db..b9b6fc6cf94 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -19,17 +19,25 @@ * @brief cuDF-IO CSV writer class implementation */ -#include "writer_impl.hpp" +#include "durations.hpp" + +#include "csv_common.h" +#include "csv_gpu.h" #include #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include @@ -40,13 +48,19 @@ #include #include +#include #include +#include +#include namespace cudf { namespace io { namespace detail { namespace csv { +using namespace cudf::io::csv; +using namespace cudf::io; + namespace { /** @@ -260,32 +274,16 @@ struct column_to_strings_fn { }; } // unnamed namespace -// Forward to implementation -writer::writer(std::unique_ptr sink, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mr)) -{ -} - -// Destructor within this translation unit -writer::~writer() = default; - -writer::impl::impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr) - : out_sink_(std::move(sink)), mr_(mr), options_(options) -{ -} - // write the header: column names: // -void writer::impl::write_chunked_begin(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked_begin(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (options_.is_enabled_include_header()) { + if (options.is_enabled_include_header()) { // need to generate column names if metadata is not provided std::vector generated_col_names; if (metadata == nullptr) { @@ -298,8 +296,8 @@ void writer::impl::write_chunked_begin(table_view const& table, CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); - auto const delimiter = options_.get_inter_column_delimiter(); - auto const terminator = options_.get_line_terminator(); + auto const delimiter = options.get_inter_column_delimiter(); + auto const terminator = options.get_line_terminator(); // process header names: // - if the header name includes the delimiter or terminator character, @@ -341,18 +339,21 @@ void writer::impl::write_chunked_begin(table_view const& table, } header.append(terminator); - out_sink_->host_write(header.data(), header.size()); + out_sink->host_write(header.data(), header.size()); } } -void writer::impl::write_chunked(strings_column_view const& str_column_view, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked(data_sink* out_sink, + strings_column_view const& str_column_view, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // algorithm outline: // // for_each(strings_column.begin(), 
strings_column.end(), - // [sink = out_sink_](auto str_row) mutable { + // [sink = out_sink](auto str_row) mutable { // auto host_buffer = str_row.host_buffer(); // sink->host_write(host_buffer_.data(), host_buffer_.size()); // });//or...sink->device_write(device_buffer,...); @@ -362,7 +363,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options_.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator()}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -370,9 +371,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars_begin(); - if (out_sink_->is_device_write_preferred(total_num_bytes)) { + if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory - out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); + out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); @@ -383,30 +384,33 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, stream.value())); stream.synchronize(); - out_sink_->host_write(h_bytes.data(), total_num_bytes); + out_sink->host_write(h_bytes.data(), total_num_bytes); } // Needs newline at the end, to separate from next chunk - if (out_sink_->is_device_write_preferred(newline.size())) { - out_sink_->device_write(newline.data(), newline.size(), stream); + if (out_sink->is_device_write_preferred(newline.size())) { + out_sink->device_write(newline.data(), newline.size(), stream); } else { - out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator().size()); + out_sink->host_write(options.get_line_terminator().data(), + options.get_line_terminator().size()); } } -void writer::impl::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_csv(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(table, metadata, stream); + write_chunked_begin(out_sink, table, metadata, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view - auto n_rows_per_chunk = options_.get_rows_per_chunk(); + auto n_rows_per_chunk = options.get_rows_per_chunk(); // // This outputs the CSV in row chunks to save memory. 
// Maybe we can use the total_rows*count calculation and a memory threshold @@ -436,7 +440,7 @@ void writer::impl::write(table_view const& table, // convert each chunk to CSV: // - column_to_strings_fn converter{options_, stream, rmm::mr::get_current_device_resource()}; + column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()}; for (auto&& sub_view : vector_views) { // Skip if the table has no rows if (sub_view.num_rows() == 0) continue; @@ -459,32 +463,21 @@ void writer::impl::write(table_view const& table, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options_.get_inter_column_delimiter()}; + std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options_.get_na_rep(), + options.get_na_rep(), strings::separator_on_nulls::YES, stream); - cudf::string_scalar narep{options_.get_na_rep()}; + cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); }(); - write_chunked(str_concat_col->view(), metadata, stream); + write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); } } - - // finalize (no-op, for now, but offers a hook for future extensions): - // - write_chunked_end(table, metadata, stream); -} - -void writer::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) -{ - _impl->write(table, metadata, stream); } } // namespace csv diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp deleted file mode 100644 index 965c036dc75..00000000000 --- a/cpp/src/io/csv/writer_impl.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace csv { - -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV writer - */ -class writer::impl { - public: - /** - * @brief Constructor with writer options. - * - * @param sink Output sink - * @param options Settings for controlling behavior - * @param mr Device memory resource to use for device memory allocation - */ - impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Write an entire dataset to CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write the header of a CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_begin(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write dataset to CSV format without header. - * - * @param strings_column Subset of columns converted to string to be written. - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked(strings_column_view const& strings_column, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write footer of CSV format (typically, empty). - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_end(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - // purposely no-op (for now); - } - - private: - std::unique_ptr out_sink_; - rmm::mr::device_memory_resource* mr_ = nullptr; - csv_writer_options const options_; -}; - -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8ca1d3a459..402e212f07b 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -219,10 +219,14 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc using namespace cudf::io::detail; auto sink = make_datasink(options.get_sink()); - auto writer = - std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); - writer->write(options.get_table(), options.get_metadata()); + return csv::write_csv( // + sink.get(), + options.get_table(), + options.get_metadata(), + options, + rmm::cuda_stream_default, + mr); } namespace detail_orc = cudf::io::detail::orc; From 406429a66fad55414fce22f2723270df411e1b75 Mon Sep 17 00:00:00 2001 From: Mayank Anand <36782063+mayankanand007@users.noreply.github.com> Date: Thu, 18 Nov 2021 10:07:58 -0500 Subject: [PATCH 10/72] ceil/floor for `DatetimeIndex` (#9554) Follow-up to #9571 where we add `ceil` and `floor` support for `Series`. Here we add `ceil` and `floor` support to `DatetimeIndex` class. This PR is dependent on #9571 getting merged first since it assumes the `libcudf` implementation for `floor` exists. 
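For reference, a small usage sketch of the new index methods (mirroring the doctests added in this PR; the timestamps are illustrative):

```python
import cudf

idx = cudf.DatetimeIndex(["2020-05-31 08:59:59", "1999-12-31 18:44:59"])

# Round each timestamp up or down to the nearest minute ("T").
idx.ceil("T")   # 2020-05-31 09:00:00, 1999-12-31 18:45:00
idx.floor("T")  # 2020-05-31 08:59:00, 1999-12-31 18:44:00
```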
Authors:
  - Mayank Anand (https://github.com/mayankanand007)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9554
---
 docs/cudf/source/api_docs/index_objects.rst | 2 +
 python/cudf/cudf/core/frame.py | 14 +++++
 python/cudf/cudf/core/index.py | 62 +++++++++++++++++++++
 python/cudf/cudf/tests/test_index.py | 26 +++++++++
 4 files changed, 104 insertions(+)

diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst
index 30269bb2a72..2a4dd5ff9c8 100644
--- a/docs/cudf/source/api_docs/index_objects.rst
+++ b/docs/cudf/source/api_docs/index_objects.rst
@@ -280,6 +280,8 @@ Time-specific operations
    :toctree: api/

    DatetimeIndex.round
+   DatetimeIndex.ceil
+   DatetimeIndex.floor

 Conversion
 ~~~~~~~~~~
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 72239fc2a8e..58fe8a43d8d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3673,6 +3673,13 @@ def ceil(self):
         3    5.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.ceil and DataFrame.ceil are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("ceil")

     def floor(self):
@@ -3705,6 +3712,13 @@ def floor(self):
         5    3.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.floor and DataFrame.floor are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("floor")

     def scale(self):
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 35b80715cca..63fda21152d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1898,6 +1898,68 @@ def _get_dt_field(self, field):
     def is_boolean(self):
         return False

+    def ceil(self, field):
+        """
+        Perform ceil operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
+        ...            "1999-12-31 18:40:00"])
+        >>> gIndex.ceil("T")
+        DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'],
+        dtype='datetime64[ns]', freq=None)
+        """
+        out_column = self._values.ceil(field)
+
+        return self.__class__._from_data({self.name: out_column})
+
+    def floor(self, field):
+        """
+        Perform floor operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59"
+        ... 
,"1999-12-31 18:44:59"]) + >>> gIndex.floor("T") + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], + dtype='datetime64[ns]', freq=None) + """ + out_column = self._values.floor(field) + + return self.__class__._from_data({self.name: out_column}) + class TimedeltaIndex(GenericIndex): """ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c6cf7c4e6f5..ab211616a02 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2470,3 +2470,29 @@ def test_index_type_methods(data, func): assert_eq(False, actual) else: assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_ceil(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_ceil = pidx.ceil(resolution) + cuidx_ceil = cuidx.ceil(resolution) + + assert_eq(pidx_ceil, cuidx_ceil) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_floor(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_floor = pidx.floor(resolution) + cuidx_floor = cuidx.floor(resolution) + + assert_eq(pidx_floor, cuidx_floor) From 91fd74e0e2b9ada200f3c707cc4d0ca4efee329a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 18 Nov 2021 09:42:48 -0700 Subject: [PATCH 11/72] Support `min` and `max` reduction for structs (#9697) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` reduction. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mark Harris (https://github.com/harrism) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9697 --- cpp/src/groupby/sort/group_scan_util.cuh | 20 +-- .../sort/group_single_pass_reduction_util.cuh | 20 +-- .../arg_minmax_util.cuh} | 4 +- cpp/src/reductions/simple.cuh | 61 +++++++- cpp/tests/reductions/reduction_tests.cpp | 131 +++++++++++++++++- 5 files changed, 210 insertions(+), 26 deletions(-) rename cpp/src/{groupby/sort/group_util.cuh => reductions/arg_minmax_util.cuh} (98%) diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 013ea924cce..b565e8dc6d8 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -221,16 +221,18 @@ struct group_scan_functor(0); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh 
index 4e0820af236..decb127b264 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -271,10 +271,11 @@ struct group_reduction_functor< auto const count_iter = thrust::make_counting_iterator(0); auto const result_begin = result->mutable_view().template begin(); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); // Generate bitmask for the output by segmented reduction of the input bitmask. @@ -288,10 +289,11 @@ struct group_reduction_functor< validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); } diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/reductions/arg_minmax_util.cuh similarity index 98% rename from cpp/src/groupby/sort/group_util.cuh rename to cpp/src/reductions/arg_minmax_util.cuh index 31ff29ed4c3..40df23bcd8e 100644 --- a/cpp/src/groupby/sort/group_util.cuh +++ b/cpp/src/reductions/arg_minmax_util.cuh @@ -19,7 +19,7 @@ #include namespace cudf { -namespace groupby { +namespace reduction { namespace detail { /** @@ -62,5 +62,5 @@ struct row_arg_minmax_fn { }; } // namespace detail -} // namespace groupby +} // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 13dfe5cb26c..7dd54e9250a 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -16,9 +16,13 @@ #pragma once +#include + #include #include +#include #include +#include #include #include #include @@ -28,6 +32,9 @@ #include #include +#include + +#include namespace cudf { namespace reduction { @@ -252,8 +259,7 @@ struct same_element_type_dispatcher { template static constexpr bool is_supported() { - return !(cudf::is_dictionary() || std::is_same_v || - std::is_same_v); + return !(cudf::is_dictionary() || std::is_same_v); } template () && - not cudf::is_fixed_point()>* = nullptr> + std::enable_if_t && + (std::is_same_v || + std::is_same_v)>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if (input.is_empty()) { return cudf::make_empty_scalar_like(input, stream, mr); } + + auto constexpr is_min_op = std::is_same_v; + + // We will do reduction to find the ARGMIN/ARGMAX index, then return the element at that index. + // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the + // opposite for ARGMAX. + auto constexpr null_precedence = is_min_op ? 
cudf::null_order::AFTER : cudf::null_order::BEFORE; + auto const flattened_input = cudf::structs::detail::flatten_nested_columns( + table_view{{input}}, {}, std::vector{null_precedence}); + auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream); + auto const flattened_null_precedences = + is_min_op ? cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + // Perform reduction to find ARGMIN/ARGMAX. + auto const do_reduction = [&](auto const& binop) { + return thrust::reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + size_type{0}, + binop); + }; + + auto const minmax_idx = [&] { + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } + }(); + + return cudf::detail::get_element(input, minmax_idx, stream, mr); + } + + template () && !cudf::is_fixed_point() && + !std::is_same_v>* = nullptr> std::unique_ptr operator()(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 376f5ce5dd2..2c9279260e7 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -2055,7 +2056,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) struct StructReductionTest : public cudf::test::BaseFixture { using SCW = cudf::test::structs_column_wrapper; - void reduction_test(SCW const& struct_column, + void reduction_test(cudf::column_view const& struct_column, cudf::table_view const& expected_value, bool succeeded_condition, bool is_valid, @@ -2066,7 +2067,7 @@ struct StructReductionTest : public cudf::test::BaseFixture { cudf::reduce(struct_column, agg, cudf::data_type(cudf::type_id::STRUCT)); auto struct_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, struct_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_value, struct_result->view()); } + if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_value, struct_result->view()); } }; if (succeeded_condition) { @@ -2210,4 +2211,130 @@ TEST_F(StructReductionTest, NonValidStructReductionNthElement) cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); } +TEST_F(StructReductionTest, StructReductionMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, 
+ cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"aab"}; + auto const expected_child2 = INTS_CW{7}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"zit"}; + auto const expected_child2 = INTS_CW{5}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 012bfe902949ffad967ecf96e153dd7094c878a0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 18 Nov 2021 13:35:06 -0600 Subject: [PATCH 12/72] [REVIEW] Upgrade `clang` to 11.1.0 (#9716) * upgrade clang to 11.1.0 * fix pytest --- conda/environments/cudf_dev_cuda11.0.yml | 4 ++-- conda/environments/cudf_dev_cuda11.2.yml | 4 ++-- conda/environments/cudf_dev_cuda11.5.yml | 4 ++-- cpp/scripts/run-clang-format.py | 2 +- python/cudf/cudf/tests/test_concat.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 5954525b924..4d106409e1e 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 + - clang=11.1.0 + - clang-tools=11.1.0 - cupy>=9.5.0,<10.0.0a0 - rmm=21.12.* - cmake>=3.20.1 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 0d1989fa213..46785603c0e 100644 --- 
a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 + - clang=11.1.0 + - clang-tools=11.1.0 - cupy>=9.5.0,<10.0.0a0 - rmm=21.12.* - cmake>=3.20.1 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index d759ca94fbf..635d838640d 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 + - clang=11.1.0 + - clang-tools=11.1.0 - cupy>=9.5.0,<10.0.0a0 - rmm=21.12.* - cmake>=3.20.1 diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index 178bf2f0c78..a7c83da22c5 100755 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -22,7 +22,7 @@ import sys import tempfile -EXPECTED_VERSION = "11.0.0" +EXPECTED_VERSION = "11.1.0" VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") # NOTE: populate this list with more top-level dirs as we add more of them to # the cudf repo diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index bb96f3c4290..46707a283af 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -576,7 +576,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): if expected.shape != df.shape: for key, col in actual[actual.columns].iteritems(): if is_categorical_dtype(col.dtype): - if expected[key].dtype != "category": + if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -1186,7 +1186,7 @@ def test_concat_join_empty_dataframes( if axis == 0: for key, col in actual[actual.columns].iteritems(): if is_categorical_dtype(col.dtype): - if expected[key].dtype != "category": + if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = ( From fc82b1d206e93a46c9ef3535711c88ec20bd4fde Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 19 Nov 2021 02:06:54 +0530 Subject: [PATCH 13/72] Spell check fixes (#9682) Regular spell check fixes in comments and docs. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9682 --- cpp/src/binaryop/compiled/binary_ops.cuh | 4 ++-- cpp/src/groupby/sort/aggregate.cpp | 4 ++-- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 4 ++-- cpp/src/io/text/multibyte_split.cu | 2 +- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- cpp/src/rolling/rolling_detail.cuh | 2 +- cpp/tests/column/column_view_shallow_test.cpp | 2 +- cpp/tests/datetime/datetime_ops_test.cpp | 4 ++-- cpp/tests/transform/row_bit_count_test.cu | 6 +++--- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/decimal.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 8 ++++---- python/cudf/cudf/core/udf/pipeline.py | 2 +- python/cudf/cudf/core/udf/typing.py | 4 ++-- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_custom_accessor.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 10 +++++----- python/cudf/cudf/tests/test_orc.py | 4 ++-- python/cudf/cudf/utils/gpu_utils.py | 2 +- python/cudf/cudf/utils/ioutils.py | 4 ++-- python/cudf/cudf/utils/utils.py | 4 ++-- python/dask_cudf/dask_cudf/_version.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 2 +- python/dask_cudf/dask_cudf/io/parquet.py | 4 ++-- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 2 +- 36 files changed, 57 insertions(+), 57 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 84147fc9220..10e9b2532af 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -117,7 +117,7 @@ struct ops_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) @@ -164,7 +164,7 @@ struct ops2_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 234bb447761..d68b701d75f 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -559,7 +559,7 @@ auto column_view_with_common_nulls(column_view const& column_0, column_view cons } /** - * @brief Perform covariance betweeen two child columns of non-nullable struct column. + * @brief Perform covariance between two child columns of non-nullable struct column. * */ template <> @@ -602,7 +602,7 @@ void aggregate_result_functor::operator()(aggregation c }; /** - * @brief Perform correlation betweeen two child columns of non-nullable struct column. + * @brief Perform correlation between two child columns of non-nullable struct column. 
* */ template <> diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 45d60605936..82161233a92 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -79,7 +79,7 @@ void add_nested_columns(std::map>& selected_co * @brief Adds the column with the given id to the mapping * * All nested columns and direct ancestors of column `id` are included. - * Columns that are not on the direct path are excluded, which may result in prunning. + * Columns that are not on the direct path are excluded, which may result in pruning. */ void add_column_to_mapping(std::map>& selected_columns, metadata const& metadata, diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 5132906a5fc..01418fd3bd6 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -119,7 +119,7 @@ class aggregate_orc_metadata { * @brief Filters ORC file to a selection of columns, based on their paths in the file. * * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is - * ommited to match the cuDF table hierarchy. + * omitted to match the cuDF table hierarchy. * * @param column_paths List of full column names (i.e. paths) to select from the ORC file * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 217aee8756e..829e4877c44 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -660,7 +660,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); auto dst_offset = offset + s->nnz; auto vbuf_bit_idx = [](int row) { - // valid_buf is a circular buffer with validitiy of 8 rows in each element + // valid_buf is a circular buffer with validity of 8 rows in each element return row % (encode_block_size * 8); }; if (dst_offset % 8 == 0 and pd_set_cnt == 8) { @@ -696,7 +696,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; if (!t) { - // Number of rows enocoded so far + // Number of rows encoded so far s->present_out += nrows_encoded; s->numvals -= min(s->numvals, nrows_encoded); } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1563e3e1fd7..25c4bd65c8f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1426,7 +1426,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, } } if (col.orc_kind() == LIST or col.orc_kind() == MAP) { - // Need a new pushdown mask unless both the parent and current colmn are not nullable + // Need a new pushdown mask unless both the parent and current column are not nullable auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ac2e6ba5cfb..1bd4cb3c6f4 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -307,7 +307,7 @@ struct EncColumnChunk { statistics_chunk const* stats; //!< Fragment statistics uint32_t bfr_size; //!< Uncompressed buffer size uint32_t compressed_size; //!< Compressed buffer size - uint32_t 
max_page_data_size; //!< Max data size (excuding header) of any page in this chunk + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk uint32_t page_headers_size; //!< Sum of size of all page headers uint32_t start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk @@ -489,7 +489,7 @@ void InitFragmentStatistics(cudf::detail::device_2dspan groups /** * @brief Initialize per-chunk hash maps used for dictionary with sentinel values * - * @param chunks Flat span of chunks to intialize hash maps for + * @param chunks Flat span of chunks to initialize hash maps for * @param stream CUDA stream to use */ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a427809c81a..d287b9f2419 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -260,7 +260,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow seperate logic. + // would have to follow separate logic. multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // tile_multistates, tile_offsets, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 0663bc18ab3..527e834c76c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -67,7 +67,7 @@ struct has_negative_nans_fn { * @brief A structure to be used along with type_dispatcher to check if a column has any * negative NaN value. * - * This functor is neccessary because when calling to segmented sort on the list entries, the + * This functor is necessary because when calling to segmented sort on the list entries, the * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output * lists. We want to move all NaN values close together in order to call unique_copy later on. */ @@ -563,7 +563,7 @@ std::pair, std::unique_ptr> drop_list_duplicates values ? cudf::empty_like(values.value().parent()) : nullptr}; } - // The child column conotaining list entries. + // The child column containing list entries. auto const keys_child = keys.get_sliced_child(stream); // Generate a mapping from list entries to their 1-based list indices for the keys column. diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 12227404d83..bc1947dfeed 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -722,7 +722,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre } // STD aggregations depends on VARIANCE aggregation. Each element is applied - // with sqaured-root in the finalize() step. + // with square-root in the finalize() step. std::vector> visit(data_type, cudf::detail::std_aggregation const& agg) override { diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index ab324ea8505..4afa96f08d7 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -84,7 +84,7 @@ TYPED_TEST_SUITE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. 
// column_view, make a copy = same hash. -// new column_view from colmn = same hash +// new column_view from column = same hash // column_view, copy column = diff hash // column_view, diff column = diff hash. // diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index b70ac29fd5d..2097e09e674 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -758,7 +758,7 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) 707904541L, // 1992-06-07 08:09:01 GMT - leap year -2181005247L, // 1900-11-20 09:12:33 GMT - non leap year 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - non leap year - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 - non-leap-year + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 - non-leap-year 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT - leap year 4539564243L, // 2113-11-08 06:04:03 GMT - non leap year @@ -827,7 +827,7 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter) 707904541L, // 1992-06-07 08:09:01 GMT -2181005247L, // 1900-11-20 09:12:33 GMT 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT 4539564243L, // 2113-11-08 06:04:03 GMT diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 4645ff9be5f..7fb7326f221 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -228,7 +228,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // Tests that `row_bit_count()` can handle struct> with more // than max_block_size (256) rows. // With a large number of rows, computation spills to multiple thread-blocks, - // thus exercising the branch-stack comptutation. + // thus exercising the branch-stack computation. // The contents of the input column aren't as pertinent to this test as the // column size. For what it's worth, it looks as follows: // [ struct({0,1}), struct({2,3}), struct({4,5}), ... ] @@ -362,7 +362,7 @@ std::pair, std::unique_ptr> build_nested_and_exp // Inner list column // clang-format off cudf::test::lists_column_wrapper list{ - {1, 2, 3, 4, 5}, + {1, 2, 3, 4, 5}, {6, 7, 8}, {33, 34, 35, 36, 37, 38, 39}, {-1, -2}, @@ -408,7 +408,7 @@ std::unique_ptr build_nested_column(std::vector const& struct_vali // Inner list column // clang-format off - cudf::test::lists_column_wrapper list{ + cudf::test::lists_column_wrapper list{ {{1, 2, 3, 4, 5}, {2, 3}}, {{6, 7, 8}, {8, 9}}, {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}}; diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6f2f01c746d..e2bedd9d0b1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -337,7 +337,7 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": else: return self.dropna(drop_nan=False).data_array_view - # TODO: This method is decpreated and can be removed when the associated + # TODO: This method is deprecated and can be removed when the associated # Frame methods are removed. def to_array(self, fillna=None) -> np.ndarray: """Get a dense numpy array for the data. 
@@ -1851,7 +1851,7 @@ def as_column( arbitrary = np.asarray(arbitrary) - # Handle case that `arbitary` elements are cupy arrays + # Handle case that `arbitrary` elements are cupy arrays if ( shape and shape[0] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 756e48edccb..7c8837ef45f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -543,7 +543,7 @@ def infer_format(element: str, **kwargs) -> str: if len(second_parts) > 1: # "Z" indicates Zulu time(widely used in aviation) - Which is # UTC timezone that currently cudf only supports. Having any other - # unsuppported timezone will let the code fail below + # unsupported timezone will let the code fail below # with a ValueError. second_parts.remove("Z") second_part = "".join(second_parts[1:]) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 6409a9f9196..7037b8e6f36 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -160,7 +160,7 @@ def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self - # Binary Arithmatics between decimal columns. `Scale` and `precision` + # Binary Arithmetics between decimal columns. `Scale` and `precision` # are computed outside of libcudf if op in ("add", "sub", "mul", "div"): scale = _binop_scale(self.dtype, other.dtype, op) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bfbe8b06c17..c0cb6f1917f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6367,7 +6367,7 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dc6461663ce..7f9f61ed3fd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,7 +18,7 @@ # The three functions below return the quantiles [25%, 50%, 75%] -# respectively, which are called in the describe() method to ouput +# respectively, which are called in the describe() method to output # the summary stats of a GroupBy object def _quantile_25(x): return x.quantile(0.25) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 63fda21152d..5ea9ac945dc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -623,7 +623,7 @@ def _union(self, other, sort=None): else: return result - # If all the above optimizations don't cater to the inpputs, + # If all the above optimizations don't cater to the inputs, # we materialize RangeIndex's into `Int64Index` and # then perform `union`. 
return Int64Index(self._values)._union(other, sort=sort) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8c4f87d5f67..a1eda697683 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -970,7 +970,7 @@ def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] - # TODO: Verify if this is really necesary or if we can rely on + # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: colnames = source_data[0].columns diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c804f2bca2c..cf035ef457d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2916,7 +2916,7 @@ def unique(self): def nunique(self, method="sort", dropna=True): """Returns the number of unique values of the Series: approximate version, - and exact version to be moved to libgdf + and exact version to be moved to libcudf Excludes NA values by default. @@ -2985,7 +2985,7 @@ def value_counts( Returns ------- - result : Series contanining counts of unique values. + result : Series containing counts of unique values. See also -------- @@ -3802,7 +3802,7 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( @@ -5054,7 +5054,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. - Two values in ``a`` and ``b`` are considiered equal when the following + Two values in ``a`` and ``b`` are considered equal when the following equation is satisfied. .. math:: diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index deb4546e8b8..2464906be04 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -316,7 +316,7 @@ def compile_or_get(frame, func, args): Return a compiled kernel in terms of MaskedTypes that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses a thread for each row and calls `f` using that rows data / mask - to produce an output value and output valdity for each row. + to produce an output value and output validity for each row. If the UDF has already been compiled for this requested dtypes, a cached version will be returned instead of running compilation. diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 4b0f0bf1283..da7ff4c0e32 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -67,7 +67,7 @@ def unify(self, context, other): """ Often within a UDF an instance arises where a variable could be a `MaskedType`, an `NAType`, or a literal based off - the data at runtime, for examplem the variable `ret` here: + the data at runtime, for example the variable `ret` here: def f(x): if x == 1: @@ -185,7 +185,7 @@ class NAType(types.Type): """ A type for handling ops against nulls Exists so we can: - 1. 
Teach numba that all occurances of `cudf.NA` are + 1. Teach numba that all occurrences of `cudf.NA` are to be read as instances of this type instead 2. Define ops like `if x is cudf.NA` where `x` is of type `Masked` to mean `if x.valid is False` diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9562fca7399..59c291eea0b 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -410,7 +410,7 @@ def assert_series_equal( Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True - Whether to check the seires class, dtype and + Whether to check the series class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. check_less_precise : bool or int, default False diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 542dcd9301c..ba2a6dce369 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1173,7 +1173,7 @@ def make_scalar_product_data(): ) ) - # we can muliply any timedelta by any int, or bool + # we can multiply any timedelta by any int, or bool valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) # we can multiply a float by any int, float, or bool diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index 16e5b345ce2..bfd2ccbccef 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -44,7 +44,7 @@ def test_dataframe_accessor(gdf): "gdf2", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] ) def test_dataframe_accessor_idendity(gdf1, gdf2): - """Test for accessor idendities + """Test for accessor identities - An object should hold persistent reference to the same accessor - Different objects should hold difference instances of the accessor """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index bf75badc06f..a95be4f7932 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -171,7 +171,7 @@ def test_dt_ops(data): assert_eq(pd_data > pd_data, gdf_data > gdf_data) -# libgdf doesn't respect timezones +# libcudf doesn't respect timezones @pytest.mark.parametrize("data", [data1()]) @pytest.mark.parametrize("field", fields) def test_dt_series(data, field): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d409a099806..07407b8d359 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -738,9 +738,9 @@ def test_multiindex_copy_sem(data, levels, codes, names): ) @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): - """Test memory idendity for deep copy + """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep @@ -768,19 +768,19 @@ def test_multiindex_copy_deep(data, deep): mi1 = data mi2 = mi1.copy(deep=deep) - # Assert ._levels idendity + # Assert ._levels identity lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels] rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._codes idendity + # Assert ._codes identity 
lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()] rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._data idendity + # Assert ._data identity lptrs = [d.base_data.ptr for _, d in mi1._data.items()] rptrs = [d.base_data.ptr for _, d in mi2._data.items()] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 99b5652110b..6b02874146e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -502,7 +502,7 @@ def test_orc_writer_sliced(tmpdir): "TestOrcFile.decimal.orc", "TestOrcFile.decimal.same.values.orc", "TestOrcFile.decimal.multiple.values.orc", - # For addional information take look at PR 7034 + # For additional information take look at PR 7034 "TestOrcFile.decimal.runpos.issue.orc", ], ) @@ -541,7 +541,7 @@ def test_orc_decimal_precision_fail(datadir): assert_eq(pdf, gdf) -# For addional information take look at PR 6636 and 6702 +# For additional information take look at PR 6636 and 6702 @pytest.mark.parametrize( "orc_file", [ diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 77963f8bcc1..dbdd68f2df8 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -143,7 +143,7 @@ def _try_get_old_or_new_symbols(): cuda_driver_supported_rt_version >= 11000 and cuda_runtime_version >= 11000 ): - # With cuda enhanced compatibitlity any code compiled + # With cuda enhanced compatibility any code compiled # with 11.x version of cuda can now run on any # driver >= 450.80.02. 11000 is the minimum cuda # version 450.80.02 supports. diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 11994830fed..0f9d9d53b23 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1038,7 +1038,7 @@ should consume messages from. Valid values are 0 - (N-1) start_offset : int, Kafka Topic/Partition offset that consumption should begin at. Inclusive. -end_offset : int, Kafka Topic/Parition offset that consumption +end_offset : int, Kafka Topic/Partition offset that consumption should end at. Inclusive. batch_timeout : int, default 10000 Maximum number of milliseconds that will be spent trying to @@ -1061,7 +1061,7 @@ or any object with a `read()` method (such as builtin `open()` file handler function or `StringIO`). delimiter : string, default None, The delimiter that should be used - for splitting text chunks into seperate cudf column rows. Currently + for splitting text chunks into separate cudf column rows. Currently only a single delimiter is supported. 
Returns diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4f9b23bf6fe..a9611a91554 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -353,7 +353,7 @@ def get_appropriate_dispatched_func( elif hasattr(cupy_submodule, fname): cupy_func = getattr(cupy_submodule, fname) - # Handle case if cupy impliments it as a numpy function + # Handle case if cupy implements it as a numpy function # Unsure if needed if cupy_func is func: return NotImplemented @@ -374,7 +374,7 @@ def _cast_to_appropriate_cudf_type(val, index=None): elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): # if index is not None and is of a different length # than the index, cupy dispatching behaviour is undefined - # so we dont impliment it + # so we don't implement it if (index is None) or (len(index) == len(val)): return cudf.Series(val, index=index) diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index eb7457f3465..8ca2cf98381 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -417,7 +417,7 @@ def render_pep440_old(pieces): The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f81a4743a4a..89b5301ee83 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -196,7 +196,7 @@ def make_meta_object_cudf(x, index=None): ) elif not hasattr(x, "dtype") and x is not None: # could be a string, a dtype object, or a python type. Skip `None`, - # because it is implictly converted to `dtype('f8')`, which we don't + # because it is implicitly converted to `dtype('f8')`, which we don't # want here. 
try: dtype = np.dtype(x) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 2e5d55e92d2..b47a5e78095 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -111,7 +111,7 @@ def _read_paths( frag = next(ds.get_fragments()) if frag: # Extract hive-partition keys, and make sure they - # are orderd the same as they are in `partitions` + # are ordered the same as they are in `partitions` raw_keys = pa_ds._get_partition_keys(frag.partition_expression) partition_keys = [ (hive_part.name, raw_keys[hive_part.name]) @@ -173,7 +173,7 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) - # Assume multi-peice read + # Assume multi-piece read paths = [] rgs = [] last_partition_keys = None diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d93037b3802..706b0e272ea 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -378,7 +378,7 @@ def test_chunksize(tmpdir, chunksize, metadata): # one output partition assert ddf3.npartitions == 1 else: - # Files can be aggregateed together, but + # Files can be aggregated together, but # chunksize is not large enough to produce # a single output partition assert ddf3.npartitions < num_row_groups From c1bfb26715e0234f6d90aceac7a52caded2e9f9e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 Nov 2021 19:29:14 -0500 Subject: [PATCH 14/72] Fix regex non-multiline EOL/$ matching strings ending with a new-line (#9715) Closes #9620 Fixes an edge case described in https://docs.python.org/3/library/re.html#re.MULTILINE where the '$' EOL regex pattern character (without `MULTILINE` set) should match at the very end of a string, and also just before the end of the string when the string ends with a newline.
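As context for the change below, here is a minimal sketch of the equivalent CPython `re` behavior (illustrative only; it exercises the standard-library `re` module, not the libcudf implementation):

```python
import re

# Without re.MULTILINE, '$' matches at the very end of the string...
assert re.search(r"cat$", "cat") is not None
# ...and also just before a newline that terminates the string.
assert re.search(r"cat$", "cat\n") is not None
# Only a newline in the final position gets this treatment.
assert re.search(r"cat$", "cat\n\n") is None
# The case this PR fixes: "abc\n" matches "^abc$" even without MULTILINE.
assert re.findall(r"^abc$", "abc\n") == ["abc"]
```

With this fix, `cudf::strings::contains_re`, `matches_re`, and `count_re` produce the same results for inputs such as `"abc\n"`, as exercised by the updated expectations in `contains_tests.cpp` below.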
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Christopher Harris (https://github.com/cwharris) - Vukasin Milovanovic (https://github.com/vuule) - Sheilah Kirui (https://github.com/skirui-source) URL: https://github.com/rapidsai/cudf/pull/9715 --- cpp/src/strings/regex/regex.inl | 5 ++++- cpp/tests/strings/contains_tests.cpp | 17 +++++++++-------- python/cudf/cudf/tests/test_string.py | 5 +++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 66e99756615..bc0679993d0 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -276,7 +276,10 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if (last_character || (inst->u1.c == '$' && c == '\n')) { + if (last_character || + (c == '\n' && (inst->u1.c == '$' || + // edge case where \n appears at the end of the string + pos + 1 == dstr.length()))) { id_activate = inst->u2.next_id; expanded = true; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 3c11444e4b5..229f9e4cc82 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -302,28 +302,29 @@ TEST_F(StringsContainsTests, CountTest) TEST_F(StringsContainsTests, MultiLine) { - auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""}); - auto view = cudf::strings_column_view(input); + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::contains_re(view, "^abc$"); - expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::matches_re(view, "^abc$"); - expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); results = cudf::strings::count_re(view, "^abc$"); - expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 
c75eb91a335..cf52c4684c8 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1746,12 +1746,13 @@ def test_string_wrap(data, width): ["A B", "1.5", "3,000"], ["23", "³", "⅕", ""], [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\n"], ["line\nto be wrapped", "another\nline\nto be wrapped"], ], ) @pytest.mark.parametrize( - "pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"] + "pat", + ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], ) @pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) def test_string_count(data, pat, flags): From cb894e0f8c36f63bded74bcc4749c853ffa365c3 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 19 Nov 2021 12:30:22 -0500 Subject: [PATCH 15/72] Fix `dask-cudf` recipe for Enhanced Compatibility (#9733) The `dask-cudf` recipe was changed after #9456 was opened, so the Enhanced Compatibility changes never made it into this recipe. This PR fixes that. --- conda/recipes/dask-cudf/meta.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 5631e262b87..2a88827ad94 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} {% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_major=cuda_version.split('.')[0] %} package: name: dask-cudf @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda_{{ cuda_version }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - VERSION_SUFFIX - PARALLEL_LEVEL @@ -34,7 +35,7 @@ requirements: - cudf {{ version }} - dask>=2021.09.1 - distributed>=2021.09.1 - - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} + - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] requires: # [linux64] From e05bd4bf3cf410058417268e4353de46d387ac96 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 19 Nov 2021 14:29:35 -0500 Subject: [PATCH 16/72] update changelog --- CHANGELOG.md | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 255 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de00213a6f6..069e12fbd6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,259 @@ -# cuDF 21.10.00 (Date TBD) +# cuDF 21.10.00 (7 Oct 2021) -Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. 
+## 🚨 Breaking Changes + +- Remove Cython APIs for table view generation ([#9199](https://github.com/rapidsai/cudf/pull/9199)) [@vyasr](https://github.com/vyasr) +- Upgrade `pandas` version in `cudf` ([#9147](https://github.com/rapidsai/cudf/pull/9147)) [@galipremsagar](https://github.com/galipremsagar) +- Make AST operators nullable ([#9096](https://github.com/rapidsai/cudf/pull/9096)) [@vyasr](https://github.com/vyasr) +- Remove the option to pass data types as strings to `read_csv` and `read_json` ([#9079](https://github.com/rapidsai/cudf/pull/9079)) [@vuule](https://github.com/vuule) +- Update JNI java CSV APIs to not use deprecated API ([#9066](https://github.com/rapidsai/cudf/pull/9066)) [@revans2](https://github.com/revans2) +- Support additional format specifiers in from_timestamps ([#9047](https://github.com/rapidsai/cudf/pull/9047)) [@davidwendt](https://github.com/davidwendt) +- Expose expression base class publicly and simplify public AST API ([#9045](https://github.com/rapidsai/cudf/pull/9045)) [@vyasr](https://github.com/vyasr) +- Add support for struct type in ORC writer ([#9025](https://github.com/rapidsai/cudf/pull/9025)) [@vuule](https://github.com/vuule) +- Remove aliases of various api.types APIs from utils.dtypes. ([#9011](https://github.com/rapidsai/cudf/pull/9011)) [@vyasr](https://github.com/vyasr) +- Java bindings for conditional join output sizes ([#9002](https://github.com/rapidsai/cudf/pull/9002)) [@jlowe](https://github.com/jlowe) +- Move compute_column API out of ast namespace ([#8957](https://github.com/rapidsai/cudf/pull/8957)) [@vyasr](https://github.com/vyasr) +- `cudf.dtype` function ([#8949](https://github.com/rapidsai/cudf/pull/8949)) [@shwina](https://github.com/shwina) +- Refactor Frame reductions ([#8944](https://github.com/rapidsai/cudf/pull/8944)) [@vyasr](https://github.com/vyasr) +- Add nested column selection to parquet reader ([#8933](https://github.com/rapidsai/cudf/pull/8933)) [@devavret](https://github.com/devavret) +- JNI Aggregation Type Changes ([#8919](https://github.com/rapidsai/cudf/pull/8919)) [@revans2](https://github.com/revans2) +- Add groupby_aggregation and groupby_scan_aggregation classes and force their usage. 
([#8906](https://github.com/rapidsai/cudf/pull/8906)) [@nvdbaranec](https://github.com/nvdbaranec) +- Expand CSV and JSON reader APIs to accept `dtypes` as a vector or map of `data_type` objects ([#8856](https://github.com/rapidsai/cudf/pull/8856)) [@vuule](https://github.com/vuule) +- Change cudf docs theme to pydata theme ([#8746](https://github.com/rapidsai/cudf/pull/8746)) [@galipremsagar](https://github.com/galipremsagar) +- Enable compiled binary ops in libcudf, python and java ([#8741](https://github.com/rapidsai/cudf/pull/8741)) [@karthikeyann](https://github.com/karthikeyann) +- Make groupby transform-like op order match original data order ([#8720](https://github.com/rapidsai/cudf/pull/8720)) [@isVoid](https://github.com/isVoid) + +## 🐛 Bug Fixes + +- `fixed_point` `cudf::groupby` for `mean` aggregation ([#9296](https://github.com/rapidsai/cudf/pull/9296)) [@codereport](https://github.com/codereport) +- Fix `interleave_columns` when the input string lists column having empty child column ([#9292](https://github.com/rapidsai/cudf/pull/9292)) [@ttnghia](https://github.com/ttnghia) +- Update nvcomp to include fixes for installation of headers ([#9276](https://github.com/rapidsai/cudf/pull/9276)) [@devavret](https://github.com/devavret) +- Fix Java column leak in testParquetWriteMap ([#9271](https://github.com/rapidsai/cudf/pull/9271)) [@jlowe](https://github.com/jlowe) +- Fix call to thrust::reduce_by_key in argmin/argmax libcudf groupby ([#9263](https://github.com/rapidsai/cudf/pull/9263)) [@davidwendt](https://github.com/davidwendt) +- Fixing empty input to getMapValue crashing ([#9262](https://github.com/rapidsai/cudf/pull/9262)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix duplicate names issue in `MultiIndex.deserialize ` ([#9258](https://github.com/rapidsai/cudf/pull/9258)) [@galipremsagar](https://github.com/galipremsagar) +- `Dataframe.sort_index` optimizations ([#9238](https://github.com/rapidsai/cudf/pull/9238)) [@galipremsagar](https://github.com/galipremsagar) +- Temporarily disabling problematic test in parquet writer ([#9230](https://github.com/rapidsai/cudf/pull/9230)) [@devavret](https://github.com/devavret) +- Explicitly disable groupby on unsupported key types. ([#9227](https://github.com/rapidsai/cudf/pull/9227)) [@mythrocks](https://github.com/mythrocks) +- Fix `gather` for sliced input structs column ([#9218](https://github.com/rapidsai/cudf/pull/9218)) [@ttnghia](https://github.com/ttnghia) +- Fix JNI code for left semi and anti joins ([#9207](https://github.com/rapidsai/cudf/pull/9207)) [@jlowe](https://github.com/jlowe) +- Only install thrust when using a non 'system' version ([#9206](https://github.com/rapidsai/cudf/pull/9206)) [@robertmaynard](https://github.com/robertmaynard) +- Remove zlib from libcudf public CMake dependencies ([#9204](https://github.com/rapidsai/cudf/pull/9204)) [@robertmaynard](https://github.com/robertmaynard) +- Fix out-of-bounds memory read in orc gpuEncodeOrcColumnData ([#9196](https://github.com/rapidsai/cudf/pull/9196)) [@davidwendt](https://github.com/davidwendt) +- Fix `gather()` for `STRUCT` inputs with no nulls in members. 
([#9194](https://github.com/rapidsai/cudf/pull/9194)) [@mythrocks](https://github.com/mythrocks) +- get_cucollections properly uses rapids_cpm_find ([#9189](https://github.com/rapidsai/cudf/pull/9189)) [@robertmaynard](https://github.com/robertmaynard) +- rapids-export correctly reference build code block and doc strings ([#9186](https://github.com/rapidsai/cudf/pull/9186)) [@robertmaynard](https://github.com/robertmaynard) +- Fix logic while parsing the sum statistic for numerical orc columns ([#9183](https://github.com/rapidsai/cudf/pull/9183)) [@ayushdg](https://github.com/ayushdg) +- Add handling for nulls in `dask_cudf.sorting.quantile_divisions` ([#9171](https://github.com/rapidsai/cudf/pull/9171)) [@charlesbluca](https://github.com/charlesbluca) +- Approximate overflow detection in ORC statistics ([#9163](https://github.com/rapidsai/cudf/pull/9163)) [@vuule](https://github.com/vuule) +- Use decimal precision metadata when reading from parquet files ([#9162](https://github.com/rapidsai/cudf/pull/9162)) [@shwina](https://github.com/shwina) +- Fix variable name in Java build script ([#9161](https://github.com/rapidsai/cudf/pull/9161)) [@jlowe](https://github.com/jlowe) +- Import rapids-cmake modules using the correct cmake variable. ([#9149](https://github.com/rapidsai/cudf/pull/9149)) [@robertmaynard](https://github.com/robertmaynard) +- Fix conditional joins with empty left table ([#9146](https://github.com/rapidsai/cudf/pull/9146)) [@vyasr](https://github.com/vyasr) +- Fix joining on indexes with duplicate level names ([#9137](https://github.com/rapidsai/cudf/pull/9137)) [@shwina](https://github.com/shwina) +- Fixes missing child column name in dtype while reading ORC file. ([#9134](https://github.com/rapidsai/cudf/pull/9134)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Apply type metadata after column is slice-copied ([#9131](https://github.com/rapidsai/cudf/pull/9131)) [@isVoid](https://github.com/isVoid) +- Fix a bug: inner_join_size return zero if build table is empty ([#9128](https://github.com/rapidsai/cudf/pull/9128)) [@PointKernel](https://github.com/PointKernel) +- Fix multi hive-partition parquet reading in dask-cudf ([#9122](https://github.com/rapidsai/cudf/pull/9122)) [@rjzamora](https://github.com/rjzamora) +- Support null literals in expressions ([#9117](https://github.com/rapidsai/cudf/pull/9117)) [@vyasr](https://github.com/vyasr) +- Fix cudf::hash_join output size for struct joins ([#9107](https://github.com/rapidsai/cudf/pull/9107)) [@jlowe](https://github.com/jlowe) +- Import fix ([#9104](https://github.com/rapidsai/cudf/pull/9104)) [@shwina](https://github.com/shwina) +- Fix cudf::strings::is_fixed_point checking of overflow for decimal32 ([#9093](https://github.com/rapidsai/cudf/pull/9093)) [@davidwendt](https://github.com/davidwendt) +- Fix branch_stack calculation in `row_bit_count()` ([#9076](https://github.com/rapidsai/cudf/pull/9076)) [@mythrocks](https://github.com/mythrocks) +- Fetch rapids-cmake to work around cuCollection cmake issue ([#9075](https://github.com/rapidsai/cudf/pull/9075)) [@jlowe](https://github.com/jlowe) +- Fix compilation errors in groupby benchmarks. 
([#9072](https://github.com/rapidsai/cudf/pull/9072)) [@nvdbaranec](https://github.com/nvdbaranec) +- Preserve float16 upscaling ([#9069](https://github.com/rapidsai/cudf/pull/9069)) [@galipremsagar](https://github.com/galipremsagar) +- Fix memcheck read error in libcudf contiguous_split ([#9067](https://github.com/rapidsai/cudf/pull/9067)) [@davidwendt](https://github.com/davidwendt) +- Add support for reading ORC file with no row group index ([#9060](https://github.com/rapidsai/cudf/pull/9060)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Various multiindex related fixes ([#9036](https://github.com/rapidsai/cudf/pull/9036)) [@shwina](https://github.com/shwina) +- Avoid rebuilding cython in build.sh ([#9034](https://github.com/rapidsai/cudf/pull/9034)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for percentile dispatch in `dask_cudf` ([#9031](https://github.com/rapidsai/cudf/pull/9031)) [@galipremsagar](https://github.com/galipremsagar) +- cudf resolve nvcc 11.0 compiler crashes during codegen ([#9028](https://github.com/rapidsai/cudf/pull/9028)) [@robertmaynard](https://github.com/robertmaynard) +- Fetch correct grouping keys `agg` of dask groupby ([#9022](https://github.com/rapidsai/cudf/pull/9022)) [@galipremsagar](https://github.com/galipremsagar) +- Allow `where()` to work with a Series and `other=cudf.NA` ([#9019](https://github.com/rapidsai/cudf/pull/9019)) [@sarahyurick](https://github.com/sarahyurick) +- Use correct index when returning Series from `GroupBy.apply()` ([#9016](https://github.com/rapidsai/cudf/pull/9016)) [@charlesbluca](https://github.com/charlesbluca) +- Fix `Dataframe` indexer setitem when array is passed ([#9006](https://github.com/rapidsai/cudf/pull/9006)) [@galipremsagar](https://github.com/galipremsagar) +- Fix ORC reading of files with struct columns that have null values ([#9005](https://github.com/rapidsai/cudf/pull/9005)) [@vuule](https://github.com/vuule) +- Ensure JNI native libraries load when CompiledExpression loads ([#8997](https://github.com/rapidsai/cudf/pull/8997)) [@jlowe](https://github.com/jlowe) +- Fix memory read error in get_dremel_data in page_enc.cu ([#8995](https://github.com/rapidsai/cudf/pull/8995)) [@davidwendt](https://github.com/davidwendt) +- Fix memory write error in get_list_child_to_list_row_mapping utility ([#8994](https://github.com/rapidsai/cudf/pull/8994)) [@davidwendt](https://github.com/davidwendt) +- Fix debug compile error for csv_test.cpp ([#8981](https://github.com/rapidsai/cudf/pull/8981)) [@davidwendt](https://github.com/davidwendt) +- Fix memory read/write error in concatenate_lists_ignore_null ([#8978](https://github.com/rapidsai/cudf/pull/8978)) [@davidwendt](https://github.com/davidwendt) +- Fix concatenation of `cudf.RangeIndex` ([#8970](https://github.com/rapidsai/cudf/pull/8970)) [@galipremsagar](https://github.com/galipremsagar) +- Java conditional joins should not require matching column counts ([#8955](https://github.com/rapidsai/cudf/pull/8955)) [@jlowe](https://github.com/jlowe) +- Fix concatenate empty structs ([#8947](https://github.com/rapidsai/cudf/pull/8947)) [@sperlingxx](https://github.com/sperlingxx) +- Fix cuda-memcheck errors for 
some libcudf functions ([#8941](https://github.com/rapidsai/cudf/pull/8941)) [@davidwendt](https://github.com/davidwendt) +- Apply series name to result of `SeriesGroupby.apply()` ([#8939](https://github.com/rapidsai/cudf/pull/8939)) [@charlesbluca](https://github.com/charlesbluca) +- `cdef packed_columns` as `cppclass` instead of `struct` ([#8936](https://github.com/rapidsai/cudf/pull/8936)) [@charlesbluca](https://github.com/charlesbluca) +- Inserting a `cudf.NA` into a DataFrame ([#8923](https://github.com/rapidsai/cudf/pull/8923)) [@sarahyurick](https://github.com/sarahyurick) +- Support casting with Pandas dtype aliases ([#8920](https://github.com/rapidsai/cudf/pull/8920)) [@sarahyurick](https://github.com/sarahyurick) +- Allow `sort_values` to accept same `kind` values as Pandas ([#8912](https://github.com/rapidsai/cudf/pull/8912)) [@sarahyurick](https://github.com/sarahyurick) +- Enable casting to pandas nullable dtypes ([#8889](https://github.com/rapidsai/cudf/pull/8889)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix libcudf memory errors ([#8884](https://github.com/rapidsai/cudf/pull/8884)) [@karthikeyann](https://github.com/karthikeyann) +- Throw KeyError when accessing field from struct with nonexistent key ([#8880](https://github.com/rapidsai/cudf/pull/8880)) [@NV-jpt](https://github.com/NV-jpt) +- replace auto with auto& ref for cast<&> ([#8866](https://github.com/rapidsai/cudf/pull/8866)) [@karthikeyann](https://github.com/karthikeyann) +- Add missing include<optional> in binops ([#8864](https://github.com/rapidsai/cudf/pull/8864)) [@karthikeyann](https://github.com/karthikeyann) +- Fix `select_dtypes` to work when non-class dtypes present in dataframe ([#8849](https://github.com/rapidsai/cudf/pull/8849)) [@sarahyurick](https://github.com/sarahyurick) +- Re-enable JSON tests ([#8843](https://github.com/rapidsai/cudf/pull/8843)) [@vuule](https://github.com/vuule) +- Support header with embedded delimiter in csv writer ([#8798](https://github.com/rapidsai/cudf/pull/8798)) [@davidwendt](https://github.com/davidwendt) + +## 📖 Documentation + +- Add IO docs page in `cudf` documentation ([#9145](https://github.com/rapidsai/cudf/pull/9145)) [@galipremsagar](https://github.com/galipremsagar) +- use correct namespace in cuio code examples ([#9037](https://github.com/rapidsai/cudf/pull/9037)) [@cwharris](https://github.com/cwharris) +- Restructuring `Contributing doc` ([#9026](https://github.com/rapidsai/cudf/pull/9026)) [@iskode](https://github.com/iskode) +- Update stable version in readme ([#9008](https://github.com/rapidsai/cudf/pull/9008)) [@galipremsagar](https://github.com/galipremsagar) +- Add spans and more include guidelines to libcudf developer guide ([#8931](https://github.com/rapidsai/cudf/pull/8931)) [@harrism](https://github.com/harrism) +- Update Java build instructions to mention Arrow S3 and Docker ([#8867](https://github.com/rapidsai/cudf/pull/8867)) [@jlowe](https://github.com/jlowe) +- List GDS-enabled formats in the docs ([#8805](https://github.com/rapidsai/cudf/pull/8805)) [@vuule](https://github.com/vuule) +- Change cudf docs theme to pydata theme ([#8746](https://github.com/rapidsai/cudf/pull/8746)) 
[@galipremsagar](https://github.com/galipremsagar) + +## 🚀 New Features + +- Revert "Add shallow hash function and shallow equality comparison for column_view ([#9185](https://github.com/rapidsai/cudf/pull/9185))" ([#9283](https://github.com/rapidsai/cudf/pull/9283)) [@karthikeyann](https://github.com/karthikeyann) +- Align `DataFrame.apply` signature with pandas ([#9275](https://github.com/rapidsai/cudf/pull/9275)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct type support for `drop_list_duplicates` ([#9202](https://github.com/rapidsai/cudf/pull/9202)) [@ttnghia](https://github.com/ttnghia) +- support CUDA async memory resource in JNI ([#9201](https://github.com/rapidsai/cudf/pull/9201)) [@rongou](https://github.com/rongou) +- Add shallow hash function and shallow equality comparison for column_view ([#9185](https://github.com/rapidsai/cudf/pull/9185)) [@karthikeyann](https://github.com/karthikeyann) +- Superimpose null masks for STRUCT columns. ([#9144](https://github.com/rapidsai/cudf/pull/9144)) [@mythrocks](https://github.com/mythrocks) +- Implemented bindings for `ceil` timestamp operation ([#9141](https://github.com/rapidsai/cudf/pull/9141)) [@shaneding](https://github.com/shaneding) +- Adding MAP type support for ORC Reader ([#9132](https://github.com/rapidsai/cudf/pull/9132)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Implement `interleave_columns` for lists with arbitrary nested type ([#9130](https://github.com/rapidsai/cudf/pull/9130)) [@ttnghia](https://github.com/ttnghia) +- Add python bindings to fixed-size window and groupby `rolling.var`, `rolling.std` ([#9097](https://github.com/rapidsai/cudf/pull/9097)) [@isVoid](https://github.com/isVoid) +- Make AST operators nullable ([#9096](https://github.com/rapidsai/cudf/pull/9096)) [@vyasr](https://github.com/vyasr) +- Java bindings for approx_percentile ([#9094](https://github.com/rapidsai/cudf/pull/9094)) [@andygrove](https://github.com/andygrove) +- Add `dseries.struct.explode` ([#9086](https://github.com/rapidsai/cudf/pull/9086)) [@isVoid](https://github.com/isVoid) +- Add support for BaseIndexer in Rolling APIs ([#9085](https://github.com/rapidsai/cudf/pull/9085)) [@galipremsagar](https://github.com/galipremsagar) +- Remove the option to pass data types as strings to `read_csv` and `read_json` ([#9079](https://github.com/rapidsai/cudf/pull/9079)) [@vuule](https://github.com/vuule) +- Add handling for nested dicts in dask-cudf groupby ([#9054](https://github.com/rapidsai/cudf/pull/9054)) [@charlesbluca](https://github.com/charlesbluca) +- Added Series.dt.is_quarter_start and Series.dt.is_quarter_end ([#9046](https://github.com/rapidsai/cudf/pull/9046)) [@TravisHester](https://github.com/TravisHester) +- Support nested types for nth_element reduction ([#9043](https://github.com/rapidsai/cudf/pull/9043)) [@sperlingxx](https://github.com/sperlingxx) +- Update sort groupby to use non-atomic operation ([#9035](https://github.com/rapidsai/cudf/pull/9035)) [@karthikeyann](https://github.com/karthikeyann) +- Add support for struct type in ORC writer ([#9025](https://github.com/rapidsai/cudf/pull/9025)) [@vuule](https://github.com/vuule) +- Implement `interleave_columns` for structs columns
([#9012](https://github.com/rapidsai/cudf/pull/9012)) [@ttnghia](https://github.com/ttnghia) +- Add groupby first and last aggregations ([#9004](https://github.com/rapidsai/cudf/pull/9004)) [@shwina](https://github.com/shwina) +- Add `DecimalBaseColumn` and move `as_decimal_column` ([#9001](https://github.com/rapidsai/cudf/pull/9001)) [@isVoid](https://github.com/isVoid) +- Python/Cython bindings for multibyte_split ([#8998](https://github.com/rapidsai/cudf/pull/8998)) [@jdye64](https://github.com/jdye64) +- Support scalar `months` in `add_calendrical_months`, extends API to INT32 support ([#8991](https://github.com/rapidsai/cudf/pull/8991)) [@isVoid](https://github.com/isVoid) +- Added Series.dt.is_month_end ([#8989](https://github.com/rapidsai/cudf/pull/8989)) [@TravisHester](https://github.com/TravisHester) +- Support for using tdigests to compute approximate percentiles. ([#8983](https://github.com/rapidsai/cudf/pull/8983)) [@nvdbaranec](https://github.com/nvdbaranec) +- Support "unflatten" of columns flattened via `flatten_nested_columns()`: ([#8956](https://github.com/rapidsai/cudf/pull/8956)) [@mythrocks](https://github.com/mythrocks) +- Implement timestamp ceil ([#8942](https://github.com/rapidsai/cudf/pull/8942)) [@shaneding](https://github.com/shaneding) +- Add nested column selection to parquet reader ([#8933](https://github.com/rapidsai/cudf/pull/8933)) [@devavret](https://github.com/devavret) +- Expose conditional join size calculation ([#8928](https://github.com/rapidsai/cudf/pull/8928)) [@vyasr](https://github.com/vyasr) +- Support Nulls in Timeseries Generator ([#8925](https://github.com/rapidsai/cudf/pull/8925)) [@isVoid](https://github.com/isVoid) +- Avoid index equality check in `_CPackedColumns.from_py_table()` ([#8917](https://github.com/rapidsai/cudf/pull/8917)) [@charlesbluca](https://github.com/charlesbluca) +- Add dot product binary op ([#8909](https://github.com/rapidsai/cudf/pull/8909)) [@charlesbluca](https://github.com/charlesbluca) +- Expose `days_in_month` function in libcudf and add python bindings ([#8892](https://github.com/rapidsai/cudf/pull/8892)) [@isVoid](https://github.com/isVoid) +- Series string repeat ([#8882](https://github.com/rapidsai/cudf/pull/8882)) [@sarahyurick](https://github.com/sarahyurick) +- Python binding for quarters ([#8862](https://github.com/rapidsai/cudf/pull/8862)) [@shaneding](https://github.com/shaneding) +- Expand CSV and JSON reader APIs to accept `dtypes` as a vector or map of `data_type` objects ([#8856](https://github.com/rapidsai/cudf/pull/8856)) [@vuule](https://github.com/vuule) +- Add Java bindings for AST transform ([#8846](https://github.com/rapidsai/cudf/pull/8846)) [@jlowe](https://github.com/jlowe) +- Series datetime is_month_start ([#8844](https://github.com/rapidsai/cudf/pull/8844)) [@sarahyurick](https://github.com/sarahyurick) +- Support bracket syntax for cudf::strings::replace_with_backrefs group index values ([#8841](https://github.com/rapidsai/cudf/pull/8841)) [@davidwendt](https://github.com/davidwendt) +- Support `VARIANCE` and `STD` aggregation in rolling op ([#8809](https://github.com/rapidsai/cudf/pull/8809)) [@isVoid](https://github.com/isVoid) +- Add 
quarters to libcudf datetime ([#8779](https://github.com/rapidsai/cudf/pull/8779)) [@shaneding](https://github.com/shaneding) +- Linear Interpolation of `nan`s via `cupy` ([#8767](https://github.com/rapidsai/cudf/pull/8767)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Enable compiled binary ops in libcudf, python and java ([#8741](https://github.com/rapidsai/cudf/pull/8741)) [@karthikeyann](https://github.com/karthikeyann) +- Make groupby transform-like op order match original data order ([#8720](https://github.com/rapidsai/cudf/pull/8720)) [@isVoid](https://github.com/isVoid) +- multibyte_split ([#8702](https://github.com/rapidsai/cudf/pull/8702)) [@cwharris](https://github.com/cwharris) +- Implement JNI for `strings:repeat_strings` that repeats each string separately by different numbers of times ([#8572](https://github.com/rapidsai/cudf/pull/8572)) [@ttnghia](https://github.com/ttnghia) + +## 🛠️ Improvements + +- Pin max `dask` and `distributed` versions to `2021.09.1` ([#9286](https://github.com/rapidsai/cudf/pull/9286)) [@galipremsagar](https://github.com/galipremsagar) +- Optimized fsspec data transfer for remote file-systems ([#9265](https://github.com/rapidsai/cudf/pull/9265)) [@rjzamora](https://github.com/rjzamora) +- Skip dask-cudf tests on arm64 ([#9252](https://github.com/rapidsai/cudf/pull/9252)) [@Ethyling](https://github.com/Ethyling) +- Use nvcomp's snappy compressor in ORC writer ([#9242](https://github.com/rapidsai/cudf/pull/9242)) [@devavret](https://github.com/devavret) +- Only run imports tests on x86_64 ([#9241](https://github.com/rapidsai/cudf/pull/9241)) [@Ethyling](https://github.com/Ethyling) +- Remove unnecessary call to device_uvector::release() ([#9237](https://github.com/rapidsai/cudf/pull/9237)) [@harrism](https://github.com/harrism) +- Use nvcomp's snappy decompression in ORC reader ([#9235](https://github.com/rapidsai/cudf/pull/9235)) [@devavret](https://github.com/devavret) +- Add grouped_rolling test with STRUCT groupby keys. 
([#9228](https://github.com/rapidsai/cudf/pull/9228)) [@mythrocks](https://github.com/mythrocks) +- Optimize `cudf.concat` for `axis=0` ([#9222](https://github.com/rapidsai/cudf/pull/9222)) [@galipremsagar](https://github.com/galipremsagar) +- Fix some libcudf calls not passing the stream parameter ([#9220](https://github.com/rapidsai/cudf/pull/9220)) [@davidwendt](https://github.com/davidwendt) +- Add min and max bounds for random dataframe generator numeric types ([#9211](https://github.com/rapidsai/cudf/pull/9211)) [@galipremsagar](https://github.com/galipremsagar) +- Improve performance of expression evaluation ([#9210](https://github.com/rapidsai/cudf/pull/9210)) [@vyasr](https://github.com/vyasr) +- Misc optimizations in `cudf` ([#9203](https://github.com/rapidsai/cudf/pull/9203)) [@galipremsagar](https://github.com/galipremsagar) +- Remove Cython APIs for table view generation ([#9199](https://github.com/rapidsai/cudf/pull/9199)) [@vyasr](https://github.com/vyasr) +- Add JNI support for drop_list_duplicates ([#9198](https://github.com/rapidsai/cudf/pull/9198)) [@revans2](https://github.com/revans2) +- Update pandas versions in conda recipes and requirements.txt files ([#9197](https://github.com/rapidsai/cudf/pull/9197)) [@galipremsagar](https://github.com/galipremsagar) +- Minor C++17 cleanup of `groupby.cu`: structured bindings, more concise lambda, etc ([#9193](https://github.com/rapidsai/cudf/pull/9193)) [@codereport](https://github.com/codereport) +- Explicit about bitwidth difference between cudf boolean and arrow boolean ([#9192](https://github.com/rapidsai/cudf/pull/9192)) [@isVoid](https://github.com/isVoid) +- Remove _source_index from MultiIndex ([#9191](https://github.com/rapidsai/cudf/pull/9191)) [@vyasr](https://github.com/vyasr) +- Fix typo in the name of `cudf-testing-targets.cmake` ([#9190](https://github.com/rapidsai/cudf/pull/9190)) [@trxcllnt](https://github.com/trxcllnt) +- Add support for single-digits in cudf::to_timestamps ([#9173](https://github.com/rapidsai/cudf/pull/9173)) [@davidwendt](https://github.com/davidwendt) +- Fix cufilejni build include path ([#9168](https://github.com/rapidsai/cudf/pull/9168)) [@pxLi](https://github.com/pxLi) +- `dask_cudf` dispatch registering cleanup ([#9160](https://github.com/rapidsai/cudf/pull/9160)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded stream/mr from a cudf::make_strings_column ([#9148](https://github.com/rapidsai/cudf/pull/9148)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `pandas` version in `cudf` ([#9147](https://github.com/rapidsai/cudf/pull/9147)) [@galipremsagar](https://github.com/galipremsagar) +- make data chunk reader return unique_ptr ([#9129](https://github.com/rapidsai/cudf/pull/9129)) [@cwharris](https://github.com/cwharris) +- Add backend for `percentile_lookup` dispatch ([#9118](https://github.com/rapidsai/cudf/pull/9118)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor implementation of column setitem ([#9110](https://github.com/rapidsai/cudf/pull/9110)) [@vyasr](https://github.com/vyasr) +- Fix compile warnings found using nvcc 11.4 ([#9101](https://github.com/rapidsai/cudf/pull/9101)) 
[@davidwendt](https://github.com/davidwendt) +- Update to UCX-Py 0.22 ([#9099](https://github.com/rapidsai/cudf/pull/9099)) [@pentschev](https://github.com/pentschev) +- Simplify read_avro by removing unnecessary writer/impl classes ([#9090](https://github.com/rapidsai/cudf/pull/9090)) [@cwharris](https://github.com/cwharris) +- Allowing %f in format to return nanoseconds ([#9081](https://github.com/rapidsai/cudf/pull/9081)) [@marlenezw](https://github.com/marlenezw) +- Java bindings for cudf::hash_join ([#9080](https://github.com/rapidsai/cudf/pull/9080)) [@jlowe](https://github.com/jlowe) +- Remove stale code in `ColumnBase._fill` ([#9078](https://github.com/rapidsai/cudf/pull/9078)) [@isVoid](https://github.com/isVoid) +- Add support for `get_group` in GroupBy ([#9070](https://github.com/rapidsai/cudf/pull/9070)) [@galipremsagar](https://github.com/galipremsagar) +- Remove remaining "support" methods from DataFrame ([#9068](https://github.com/rapidsai/cudf/pull/9068)) [@vyasr](https://github.com/vyasr) +- Update JNI java CSV APIs to not use deprecated API ([#9066](https://github.com/rapidsai/cudf/pull/9066)) [@revans2](https://github.com/revans2) +- Added method to remove null_masks if the column has no nulls ([#9061](https://github.com/rapidsai/cudf/pull/9061)) [@razajafri](https://github.com/razajafri) +- Consolidate Several Series and Dataframe Methods ([#9059](https://github.com/rapidsai/cudf/pull/9059)) [@isVoid](https://github.com/isVoid) +- Remove usage of string based `set_dtypes` for `csv` & `json` readers ([#9049](https://github.com/rapidsai/cudf/pull/9049)) [@galipremsagar](https://github.com/galipremsagar) +- Remove some debug print statements from gtests ([#9048](https://github.com/rapidsai/cudf/pull/9048)) [@davidwendt](https://github.com/davidwendt) +- Support additional format specifiers in from_timestamps ([#9047](https://github.com/rapidsai/cudf/pull/9047)) [@davidwendt](https://github.com/davidwendt) +- Expose expression base class publicly and simplify public AST API ([#9045](https://github.com/rapidsai/cudf/pull/9045)) [@vyasr](https://github.com/vyasr) +- move filepath and mmap logic out of json/csv up to functions.cpp ([#9040](https://github.com/rapidsai/cudf/pull/9040)) [@cwharris](https://github.com/cwharris) +- Refactor Index hierarchy ([#9039](https://github.com/rapidsai/cudf/pull/9039)) [@vyasr](https://github.com/vyasr) +- cudf now leverages rapids-cmake to reduce CMake boilerplate ([#9030](https://github.com/rapidsai/cudf/pull/9030)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for `STRUCT` input to `groupby` ([#9024](https://github.com/rapidsai/cudf/pull/9024)) [@mythrocks](https://github.com/mythrocks) +- Refactor Frame scans ([#9021](https://github.com/rapidsai/cudf/pull/9021)) [@vyasr](https://github.com/vyasr) +- Remove duplicate `set_categories` code ([#9018](https://github.com/rapidsai/cudf/pull/9018)) [@isVoid](https://github.com/isVoid) +- Map support for ParquetWriter ([#9013](https://github.com/rapidsai/cudf/pull/9013)) [@razajafri](https://github.com/razajafri) +- Remove aliases of various api.types APIs from utils.dtypes. 
([#9011](https://github.com/rapidsai/cudf/pull/9011)) [@vyasr](https://github.com/vyasr) +- Java bindings for conditional join output sizes ([#9002](https://github.com/rapidsai/cudf/pull/9002)) [@jlowe](https://github.com/jlowe) +- Remove _copy_construct factory ([#8999](https://github.com/rapidsai/cudf/pull/8999)) [@vyasr](https://github.com/vyasr) +- ENH Allow arbitrary CMake config options in build.sh ([#8996](https://github.com/rapidsai/cudf/pull/8996)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- A small optimization for JNI copy column view to column vector ([#8985](https://github.com/rapidsai/cudf/pull/8985)) [@revans2](https://github.com/revans2) +- Fix nvcc warnings in ORC writer ([#8975](https://github.com/rapidsai/cudf/pull/8975)) [@devavret](https://github.com/devavret) +- Support nested structs in rank and dense rank ([#8962](https://github.com/rapidsai/cudf/pull/8962)) [@rwlee](https://github.com/rwlee) +- Move compute_column API out of ast namespace ([#8957](https://github.com/rapidsai/cudf/pull/8957)) [@vyasr](https://github.com/vyasr) +- Series datetime is_year_end and is_year_start ([#8954](https://github.com/rapidsai/cudf/pull/8954)) [@marlenezw](https://github.com/marlenezw) +- Make Java AstNode public ([#8953](https://github.com/rapidsai/cudf/pull/8953)) [@jlowe](https://github.com/jlowe) +- Replace allocate with device_uvector for subword_tokenize internal tables ([#8952](https://github.com/rapidsai/cudf/pull/8952)) [@davidwendt](https://github.com/davidwendt) +- `cudf.dtype` function ([#8949](https://github.com/rapidsai/cudf/pull/8949)) [@shwina](https://github.com/shwina) +- Refactor Frame reductions ([#8944](https://github.com/rapidsai/cudf/pull/8944)) [@vyasr](https://github.com/vyasr) +- Add deprecation warning for `Series.set_mask` API ([#8943](https://github.com/rapidsai/cudf/pull/8943)) [@galipremsagar](https://github.com/galipremsagar) +- Move AST evaluator into a separate header ([#8930](https://github.com/rapidsai/cudf/pull/8930)) [@vyasr](https://github.com/vyasr) +- JNI Aggregation Type Changes ([#8919](https://github.com/rapidsai/cudf/pull/8919)) [@revans2](https://github.com/revans2) +- Move template parameter to function parameter in cudf::detail::left_semi_anti_join ([#8914](https://github.com/rapidsai/cudf/pull/8914)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` & `pyarrow` to `5.0.0` ([#8908](https://github.com/rapidsai/cudf/pull/8908)) [@galipremsagar](https://github.com/galipremsagar) +- Add groupby_aggregation and groupby_scan_aggregation classes and force their usage. ([#8906](https://github.com/rapidsai/cudf/pull/8906)) [@nvdbaranec](https://github.com/nvdbaranec) +- Move `structs_column_tests.cu` to `.cpp`. 
([#8902](https://github.com/rapidsai/cudf/pull/8902)) [@mythrocks](https://github.com/mythrocks) +- Add stream and memory-resource parameters to struct-scalar copy ctor ([#8901](https://github.com/rapidsai/cudf/pull/8901)) [@davidwendt](https://github.com/davidwendt) +- Combine linearizer and ast_plan ([#8900](https://github.com/rapidsai/cudf/pull/8900)) [@vyasr](https://github.com/vyasr) +- Add Java bindings for conditional join gather maps ([#8888](https://github.com/rapidsai/cudf/pull/8888)) [@jlowe](https://github.com/jlowe) +- Remove max version pin for `dask` & `distributed` on development branch ([#8881](https://github.com/rapidsai/cudf/pull/8881)) [@galipremsagar](https://github.com/galipremsagar) +- fix cufilejni build w/ c++17 ([#8877](https://github.com/rapidsai/cudf/pull/8877)) [@pxLi](https://github.com/pxLi) +- Add struct accessor to dask-cudf ([#8874](https://github.com/rapidsai/cudf/pull/8874)) [@NV-jpt](https://github.com/NV-jpt) +- Migrate dask-cudf CudfEngine to leverage ArrowDatasetEngine ([#8871](https://github.com/rapidsai/cudf/pull/8871)) [@rjzamora](https://github.com/rjzamora) +- Add JNI for extract_quarter, add_calendrical_months, and is_leap_year ([#8863](https://github.com/rapidsai/cudf/pull/8863)) [@revans2](https://github.com/revans2) +- Change cudf::scalar copy and move constructors to protected ([#8857](https://github.com/rapidsai/cudf/pull/8857)) [@davidwendt](https://github.com/davidwendt) +- Replace `is_same<>::value` with `is_same_v<>` ([#8852](https://github.com/rapidsai/cudf/pull/8852)) [@codereport](https://github.com/codereport) +- Add min `pytorch` version to `importorskip` in pytest ([#8851](https://github.com/rapidsai/cudf/pull/8851)) [@galipremsagar](https://github.com/galipremsagar) +- Java bindings for regex replace ([#8847](https://github.com/rapidsai/cudf/pull/8847)) [@jlowe](https://github.com/jlowe) +- Remove make strings children with null mask ([#8830](https://github.com/rapidsai/cudf/pull/8830)) [@davidwendt](https://github.com/davidwendt) +- Refactor conditional joins ([#8815](https://github.com/rapidsai/cudf/pull/8815)) [@vyasr](https://github.com/vyasr) +- Small cleanup (unused headers / commented code removals) ([#8799](https://github.com/rapidsai/cudf/pull/8799)) [@codereport](https://github.com/codereport) +- ENH Replace gpuci_conda_retry with gpuci_mamba_retry ([#8770](https://github.com/rapidsai/cudf/pull/8770)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Update cudf java bindings to 21.10.0-SNAPSHOT ([#8765](https://github.com/rapidsai/cudf/pull/8765)) [@pxLi](https://github.com/pxLi) +- Refactor and improve join benchmarks with nvbench ([#8734](https://github.com/rapidsai/cudf/pull/8734)) [@PointKernel](https://github.com/PointKernel) +- Refactor Python factories and remove usage of Table for libcudf output handling ([#8687](https://github.com/rapidsai/cudf/pull/8687)) [@vyasr](https://github.com/vyasr) +- Optimize URL Decoding ([#8622](https://github.com/rapidsai/cudf/pull/8622)) [@gaohao95](https://github.com/gaohao95) +- Parquet writer dictionary encoding refactor ([#8476](https://github.com/rapidsai/cudf/pull/8476)) [@devavret](https://github.com/devavret) +- Use 
nvcomp's snappy decompression in parquet reader ([#8252](https://github.com/rapidsai/cudf/pull/8252)) [@devavret](https://github.com/devavret) +- Use nvcomp's snappy compressor in parquet writer ([#8229](https://github.com/rapidsai/cudf/pull/8229)) [@devavret](https://github.com/devavret) # cuDF 21.08.00 (4 Aug 2021) From 0906a0903cbdaac8a5a104a1fbf771b85abc389c Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 19 Nov 2021 17:21:12 -0500 Subject: [PATCH 17/72] fix merge issues --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79dd2f9a31c..b55669f7f50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. -# cuDF 21.10.00 (Date TBD) # cuDF 21.10.00 (7 Oct 2021) ## 🚨 Breaking Changes From 05dd5415b1391270ea74d1f33080bbbf58f848cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Nov 2021 14:32:37 -0800 Subject: [PATCH 18/72] Use List of Columns as Input for `drop_nulls`, `gather` and `drop_duplicates` (#9558) Currently, several Cython APIs accept a `Frame` object as input, corresponding to their libcudf counterparts that accept a `table_view`. To make them also work for columns, we currently route a column through `as_frame` and convert the result back with `_as_column`. This PR changes these Cython APIs to accept a list of columns instead, which greatly reduces the overhead of that column round trip (see the benchmarks for column APIs below). As a pilot study toward standardizing the Cython calling convention for table APIs, this PR makes the following decisions: 1. Use `list` as the container for the collection of columns. An arbitrary iterable would be more Pythonic, but would lose some type safety. 2. The column collection is agnostic to the distinction between index and data columns; libcudf does not handle index columns separately either. This simplifies the Cython logic.
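As a concrete illustration of the convention, here is a minimal caller-side sketch. It is based on the hunks in this diff (the `gather` signature, `_from_columns`, `_columns`, and `_column_names` are as introduced below); the wrapper names `gather_column` and `gather_frame` are hypothetical, not part of the change.

```python
# Minimal sketch of the list-of-columns calling convention (illustrative;
# wrapper names are hypothetical, internals match this diff).
import cudf._lib as libcudf


def gather_column(col, gather_map):
    # A single Column no longer round-trips through a Frame: wrap it in a
    # one-element list and unwrap the single returned column.
    return libcudf.copying.gather([col], gather_map, nullify=False)[0]


def gather_frame(frame, gather_map, keep_index=True):
    # Index columns are passed explicitly ahead of the data columns; the
    # Cython layer no longer distinguishes index from data columns.
    columns = (
        list(frame._index._columns) + list(frame._columns)
        if keep_index
        else list(frame._columns)
    )
    return frame.__class__._from_columns(
        libcudf.copying.gather(columns, gather_map),
        frame._column_names,
        frame._index.names if keep_index else None,
    )
```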
Gather/Take Benchmark ``` ----------------------------------- benchmark '100-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-False] (afte) 420.4372 (1.0) 552.7758 (1.0) 428.8227 (1.0) gather_single_column[100-random-False] (befo) 597.7047 (1.42) 811.8181 (1.47) 606.3709 (1.41) take_multiple_column[100-random-False] (afte) 849.6591 (2.02) 6,339.7521 (11.47) 870.1292 (2.03) take_multiple_column[100-random-False] (befo) 864.0001 (2.06) 1,091.5170 (1.97) 872.8270 (2.04) -------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-True] (afte) 141.4879 (1.0) 3,144.3723 (2.64) 145.7316 (1.0) gather_single_column[100-random-True] (befo) 291.5259 (2.06) 3,083.7669 (2.59) 299.2343 (2.05) take_multiple_column[100-random-True] (afte) 958.2350 (6.77) 1,295.6643 (1.09) 971.2230 (6.66) take_multiple_column[100-random-True] (befo) 967.4439 (6.84) 1,191.7809 (1.0) 976.4725 (6.70) ------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-False] (afte) 414.2257 (1.0) 6,856.2678 (2.05) 426.5804 (1.0) gather_single_column[100-reverse-False] (befo) 589.7889 (1.42) 3,387.3413 (1.01) 602.0794 (1.41) take_multiple_column[100-reverse-False] (afte) 849.6824 (2.05) 4,650.7069 (1.39) 862.7702 (2.02) take_multiple_column[100-reverse-False] (befo) 863.7700 (2.09) 3,348.6579 (1.0) 877.5145 (2.06) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-True] (afte) 141.5601 (1.0) 292.0129 (1.0) 144.5997 (1.0) gather_single_column[100-reverse-True] (befo) 286.7738 (2.03) 4,374.5530 (14.98) 297.3910 (2.06) take_multiple_column[100-reverse-True] (afte) 960.0958 (6.78) 1,354.3908 (4.64) 973.7589 (6.73) take_multiple_column[100-reverse-True] (befo) 963.5990 (6.81) 1,175.8050 (4.03) 975.9332 (6.75) -------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-False] (afte) 418.4479 (1.0) 4,602.9259 (2.09) 436.3953 (1.0) gather_single_column[100-sequence-False] (befo) 589.5318 (1.41) 4,665.3422 (2.12) 605.6177 (1.39) take_multiple_column[100-sequence-False] (afte) 851.3979 (2.03) 5,037.6062 (2.29) 866.8329 (1.99) 
take_multiple_column[100-sequence-False] (befo) 858.9821 (2.05) 2,197.5730 (1.0) 872.5517 (2.00) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-True] (afte) 145.0991 (1.0) 229.3726 (1.0) 148.7882 (1.0) gather_single_column[100-sequence-True] (befo) 289.9761 (2.00) 363.9143 (1.59) 295.9855 (1.99) take_multiple_column[100-sequence-True] (afte) 961.4970 (6.63) 1,028.0283 (4.48) 969.3146 (6.51) take_multiple_column[100-sequence-True] (befo) 962.7347 (6.64) 1,048.2450 (4.57) 973.8807 (6.55) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-False] (afte) 419.3909 (1.0) 669.2931 (1.0) 427.0140 (1.0) gather_single_column[10000-random-False] (befo) 600.0311 (1.43) 2,198.0200 (3.28) 610.3418 (1.43) take_multiple_column[10000-random-False] (afte) 862.4257 (2.06) 4,764.4433 (7.12) 880.1974 (2.06) take_multiple_column[10000-random-False] (befo) 873.0851 (2.08) 1,024.1494 (1.53) 881.4482 (2.06) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-True] (afte) 134.2846 (1.0) 4,995.3298 (12.11) 139.0623 (1.0) gather_single_column[10000-random-True] (befo) 284.2899 (2.12) 412.4213 (1.0) 289.8005 (2.08) take_multiple_column[10000-random-True] (afte) 960.2159 (7.15) 1,361.8441 (3.30) 973.4057 (7.00) take_multiple_column[10000-random-True] (befo) 965.8998 (7.19) 1,140.6899 (2.77) 976.9224 (7.03) --------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-False] (afte) 419.7811 (1.0) 634.7937 (1.0) 428.2997 (1.0) gather_single_column[10000-reverse-False] (befo) 600.3999 (1.43) 762.5762 (1.20) 608.6369 (1.42) take_multiple_column[10000-reverse-False] (afte) 856.1970 (2.04) 1,138.3081 (1.79) 870.1638 (2.03) take_multiple_column[10000-reverse-False] (befo) 869.8748 (2.07) 3,184.0033 (5.02) 889.7182 (2.08) ----------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-True] (afte) 135.4842 (1.0) 
3,634.2950 (7.81) 140.8658 (1.0) gather_single_column[10000-reverse-True] (befo) 284.9372 (2.10) 465.4219 (1.0) 292.6105 (2.08) take_multiple_column[10000-reverse-True] (afte) 957.0192 (7.06) 1,240.3540 (2.67) 966.7779 (6.86) take_multiple_column[10000-reverse-True] (befo) 967.6940 (7.14) 1,062.0849 (2.28) 975.9307 (6.93) ---------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------------ gather_single_column[10000-sequence-False] (afte) 420.3622 (1.0) 555.1544 (1.0) 427.4441 (1.0) gather_single_column[10000-sequence-False] (befo) 601.7918 (1.43) 3,534.9689 (6.37) 613.6190 (1.44) take_multiple_column[10000-sequence-False] (afte) 858.0340 (2.04) 1,166.5919 (2.10) 868.6121 (2.03) take_multiple_column[10000-sequence-False] (befo) 871.3542 (2.07) 1,118.0961 (2.01) 881.9761 (2.06) ------------------------------------------------------------------------------------------------------------------ ------------------------------------ benchmark '10000-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-sequence-True] (afte) 135.8581 (1.0) 3,894.4702 (3.55) 141.3496 (1.0) gather_single_column[10000-sequence-True] (befo) 284.5018 (2.09) 2,703.6560 (2.47) 290.8583 (2.06) take_multiple_column[10000-sequence-True] (afte) 957.4448 (7.05) 1,096.1141 (1.0) 966.4487 (6.84) take_multiple_column[10000-sequence-True] (befo) 966.2341 (7.11) 1,242.0323 (1.13) 978.3753 (6.92) ----------------------------------------------------------------------------------------------------------------- ```
Dropna Benchmark ``` ------------------------------------ benchmark '100-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------- dropna_single_column[100-False] (afte) 143.9294 (1.0) 6,808.9343 (1.58) 150.8468 (1.0) dropna_single_column[100-False] (befo) 306.3441 (2.13) 4,297.9000 (1.0) 315.3899 (2.09) ------------------------------------------------------------------------------------------------------- ---------------------------------- benchmark '100-True': 2 tests ----------------------------------- Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------- dropna_single_column[100-True] (afte) 275.7823 (1.0) 327.2779 (1.0) 279.8443 (1.0) dropna_single_column[100-True] (befo) 548.6836 (1.99) 692.2791 (2.12) 557.9867 (1.99) ---------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------- dropna_single_column[10000-False] (afte) 164.9209 (1.0) 5,742.9820 (1.61) 170.0143 (1.0) dropna_single_column[10000-False] (befo) 328.6479 (1.99) 3,565.7589 (1.0) 336.6208 (1.98) --------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-True': 2 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------- dropna_single_column[10000-True] (afte) 304.6701 (1.0) 441.9931 (1.0) 309.9858 (1.0) dropna_single_column[10000-True] (befo) 571.9690 (1.88) 5,526.0560 (12.50) 586.4943 (1.89) -------------------------------------------------------------------------------------------------------- ```
Unique/Drop_duplicate Benchmark ``` ------------------------------------ benchmark '100': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------- drop_duplicate_df[100] (afte) 891.9560 (2.77) 1,151.0071 (2.76) 904.5752 (2.74) drop_duplicate_df[100] (befo) 880.9832 (2.74) 5,528.1101 (13.23) 896.1535 (2.72) unique_single_column[100] (afte) 322.0579 (1.0) 417.7210 (1.0) 329.5932 (1.0) unique_single_column[100] (befo) 480.7310 (1.49) 4,470.7772 (10.70) 491.7183 (1.49) ------------------------------------------------------------------------------------------------- -------------------------------- benchmark '10000': 4 tests ------------------------------- Name (time in ms) Min Max Mean ------------------------------------------------------------------------------------------- drop_duplicate_df[10000] (afte) 1.0108 (2.23) 3.9981 (4.72) 1.0280 (2.17) drop_duplicate_df[10000] (befo) 1.0021 (2.21) 3.5031 (4.14) 1.0177 (2.15) unique_single_column[10000] (afte) 0.4534 (1.0) 4.5188 (5.33) 0.4740 (1.0) unique_single_column[10000] (befo) 0.6095 (1.34) 0.8471 (1.0) 0.6332 (1.34) ------------------------------------------------------------------------------------------- ```
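The bounds check that used to live inside `copying.pyx` moves to a shared helper, `_gather_map_is_valid` in `cudf/utils/utils.py`, whose hunk is not shown in this excerpt. A minimal sketch consistent with the logic removed from `copying.pyx` (negative indices are allowed down to `-nrows`, and the check is skipped when `nullify` is set, when bounds checking is disabled, or when the map is empty) might look like the following; the `minmax` import location is an assumption.

```python
# Hypothetical sketch of _gather_map_is_valid; the actual hunk in
# cudf/utils/utils.py is not shown in this excerpt.
from cudf._lib.reduce import minmax  # assumed location of minmax


def _gather_map_is_valid(gather_map, nrows, check_bounds, nullify):
    """Return True if every index in ``gather_map`` is within bounds.

    Out-of-bounds indices are acceptable when ``nullify`` is True (those
    rows become nulls) or when the caller opts out via ``check_bounds``.
    """
    if not check_bounds or nullify or len(gather_map) == 0:
        return True
    # A single device reduction yields both extremes of the gather map.
    gm_min, gm_max = minmax(gather_map)
    return gm_min >= -nrows and gm_max < nrows
```

Centralizing the check lets callers (`Column.take`, `Frame._gather`, `IndexedFrame._gather`) raise a uniform `IndexError("Gather map index is out of bounds.")` instead of parsing libcudf runtime errors.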
Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/9558 --- python/cudf/cudf/_lib/copying.pyx | 39 ++--- python/cudf/cudf/_lib/stream_compaction.pyx | 81 +++------- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 34 +++- python/cudf/cudf/core/column/column.py | 56 ++++--- python/cudf/cudf/core/frame.py | 162 ++++++++++++-------- python/cudf/cudf/core/index.py | 29 +--- python/cudf/cudf/core/indexed_frame.py | 113 +++++++++++++- python/cudf/cudf/core/multiindex.py | 8 +- python/cudf/cudf/utils/utils.py | 18 +++ 10 files changed, 322 insertions(+), 219 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 26ef428f21f..28bd78733a3 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -37,7 +37,12 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_table_view, + data_from_unique_ptr, + table_view_from_columns, +) # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -144,27 +149,12 @@ def copy_range(Column input_column, def gather( - source_table, + columns: list, Column gather_map, - bool keep_index=True, - bool nullify=False, - bool check_bounds=True + bool nullify=False ): - if not pd.api.types.is_integer_dtype(gather_map.dtype): - raise ValueError("Gather map is not integer dtype.") - - if check_bounds and len(gather_map) > 0 and not nullify: - gm_min, gm_max = minmax(gather_map) - if gm_min < -len(source_table) or gm_max >= len(source_table): - raise IndexError(f"Gather map index with min {gm_min}," - f" max {gm_max} is out of bounds in" - f" {type(source_table)} with {len(source_table)}" - f" rows.") - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view gather_map_view = gather_map.view() cdef cpp_copying.out_of_bounds_policy policy = ( cpp_copying.out_of_bounds_policy.NULLIFY if nullify @@ -180,16 +170,7 @@ def gather( ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if ( - source_table._index is None) - or keep_index is False - else source_table._index_names - ) - ) + return columns_from_unique_ptr(move(c_result)) def scatter(object source, Column scatter_map, Column target_column, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 7167d18409e..ef47e843723 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -24,40 +24,34 @@ from cudf._lib.cpp.types cimport ( null_policy, size_type, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) -def drop_nulls(source_table, how="any", keys=None, thresh=None): +def drop_nulls(columns: 
list, how="any", keys=None, thresh=None): """ Drops null rows from cols depending on key columns. Parameters ---------- - source_table : source table whose null rows are dropped to form new table + columns : list of columns how : "any" or "all". If thresh is None, drops rows of cols that have any nulls or all nulls (respectively) in subset (default: "any") - keys : List of Column names. If set, then these columns are checked for - nulls rather than all of cols (optional) + keys : List of column indices. If set, then these columns are checked for + nulls rather than all of columns (optional) thresh : Minimum number of non-nulls required to keep a row (optional) Returns ------- - Frame with null rows dropped + columns with null rows dropped """ - num_index_columns = ( - 0 if source_table._index is None else - source_table._index._num_columns) - # shifting the index number by number of index columns cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) + keys if keys is not None else range(len(columns)) ) cdef size_type c_keep_threshold = cpp_keys.size() @@ -67,7 +61,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): c_keep_threshold = 1 cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -78,13 +72,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index is None - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def apply_boolean_mask(source_table, Column boolean_mask): @@ -124,26 +112,29 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) -def drop_duplicates(source_table, +def drop_duplicates(columns: list, object keys=None, object keep='first', - bool nulls_are_equal=True, - bool ignore_index=False): + bool nulls_are_equal=True): """ Drops rows in source_table as per duplicate rows in keys. Parameters ---------- - source_table : source_table whose rows gets dropped - keys : List of Column names belong to source_table + columns : List of columns + keys : List of column indices. If set, then these columns are checked for + duplicates rather than all of columns (optional) keep : keep 'first' or 'last' or none of the duplicate rows nulls_are_equal : if True, nulls are treated equal else not. 
Returns ------- - Frame with duplicate dropped + columns with duplicate dropped """ + cdef vector[size_type] cpp_keys = ( + keys if keys is not None else range(len(columns)) + ) cdef duplicate_keep_option cpp_keep_option if keep == 'first': @@ -155,30 +146,14 @@ def drop_duplicates(source_table, else: raise ValueError('keep must be either "first", "last" or False') - num_index_columns =( - 0 if (source_table._index is None or ignore_index) - else source_table._index._num_columns) # shifting the index number by number of index columns - cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) - ) - cdef null_equality cpp_nulls_equal = ( null_equality.EQUAL if nulls_are_equal else null_equality.UNEQUAL ) cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -190,13 +165,7 @@ def drop_duplicates(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if (source_table._index is None or ignore_index) - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 10f76279401..50893ef9838 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -16,3 +16,4 @@ cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* +cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 18eed2b3396..40edd4bf9a2 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -36,7 +36,6 @@ PARQUET_META_TYPE_MAP = { for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() } - cdef table_view table_view_from_columns(columns) except*: """Create a cudf::table_view from an iterable of Columns.""" cdef vector[column_view] column_views @@ -221,6 +220,32 @@ def _index_level_name(index_name, level, column_names): return f"__index_level_{level}__" +cdef columns_from_unique_ptr( + unique_ptr[table] c_tbl +): + """Convert a libcudf table into list of columns. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + + Returns + ------- + list[Column] + A list of columns. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef size_t i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + return columns + + cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=None ): @@ -255,13 +280,8 @@ cdef data_from_unique_ptr( tuple(Dict[str, Column], Optional[Index]) A dict of the columns in the output table. 
""" - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] + columns = columns_from_unique_ptr(move(c_tbl)) # First construct the index, if any index = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e2bedd9d0b1..1d113f6e159 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -34,7 +34,11 @@ create_null_mask, ) from cudf._lib.scalar import as_device_scalar -from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count +from cudf._lib.stream_compaction import ( + distinct_count as cpp_distinct_count, + drop_duplicates, + drop_nulls, +) from cudf._lib.transform import bools_to_mask from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import ( @@ -71,7 +75,7 @@ pandas_dtypes_alias_to_cudf_alias, pandas_dtypes_to_np_dtypes, ) -from cudf.utils.utils import mask_dtype +from cudf.utils.utils import _gather_map_is_valid, mask_dtype T = TypeVar("T", bound="ColumnBase") @@ -200,11 +204,8 @@ def any(self, skipna: bool = True) -> bool: return result_col def dropna(self, drop_nan: bool = False) -> ColumnBase: - if drop_nan: - col = self.nans_to_nulls() - else: - col = self - return col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -686,28 +687,27 @@ def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") def take( - self: T, - indices: ColumnBase, - keep_index: bool = True, - nullify: bool = False, + self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: - """Return Column by taking values from the corresponding *indices*.""" + """Return Column by taking values from the corresponding *indices*. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) - try: - return ( - self.as_frame() - ._gather(indices, keep_index=keep_index, nullify=nullify) - ._as_column() - ._with_type_metadata(self.dtype) - ) - except RuntimeError as e: - if "out of bounds" in str(e): - raise IndexError( - f"index out of bounds for column of size {len(self)}" - ) from e - raise + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. + if not is_integer_dtype(indices.dtype): + indices = indices.astype("int32") + if not _gather_map_is_valid(indices, len(self), check_bounds, nullify): + raise IndexError("Gather map index is out of bounds.") + + return libcudf.copying.gather([self], indices, nullify=nullify)[ + 0 + ]._with_type_metadata(self.dtype) def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. 
@@ -1098,11 +1098,7 @@ def unique(self) -> ColumnBase: # the following issue resolved: # https://github.com/rapidsai/cudf/issues/5286 - return ( - self.as_frame() - .drop_duplicates(keep="first", ignore_index=True) - ._as_column() - ) + return drop_duplicates([self], keep="first")[0] def serialize(self) -> Tuple[dict, list]: header: Dict[Any, Any] = {} diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 58fe8a43d8d..d7a75cb9f40 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -52,6 +52,7 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.utils import _gather_map_is_valid T = TypeVar("T", bound="Frame") @@ -140,6 +141,37 @@ def _from_data( Frame.__init__(obj, data, index) return obj + @classmethod + def _from_columns( + cls, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` object from a list of columns. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + index = None + n_index_columns = 0 + if index_names is not None: + n_index_columns = len(index_names) + index = cudf.core.index._index_from_data( + dict(zip(range(n_index_columns), columns)) + ) + if isinstance(index, cudf.MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + + return cls._from_data(data, index) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: @@ -520,22 +552,32 @@ def _get_columns_by_index(self, indices): def _gather( self, gather_map, keep_index=True, nullify=False, check_bounds=True ): + """Gather rows of frame specified by indices in `gather_map`. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ + # TODO: `keep_index` argument is to be removed. + gather_map = cudf.core.column.as_column(gather_map) + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. 
if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_data( - *libcudf.copying.gather( - self, - as_column(gather_map), - keep_index=keep_index, - nullify=nullify, - check_bounds=check_bounds, - ) + + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): + raise IndexError("Gather map index is out of bounds.") + + result = self.__class__._from_columns( + libcudf.copying.gather( + list(self._columns), gather_map, nullify=nullify, + ), + self._column_names, ) - result._copy_type_metadata(self, include_index=keep_index) - result._data.names = self._data.names - if keep_index and self._index is not None: - result._index.names = self._index.names + result._copy_type_metadata(self) return result def _hash(self, method, initial_hash=None): @@ -1396,10 +1438,8 @@ def _drop_na_rows( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - subset_cols = [ - name for name, col in self._data.items() if name in subset - ] - if len(subset_cols) == 0: + + if len(subset) == 0: return self.copy(deep=True) frame = self.copy(deep=False) @@ -1412,16 +1452,19 @@ def _drop_na_rows( else: frame._data[name] = col - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_nulls( - frame, how=how, keys=subset, thresh=thresh - ) + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_nulls( + list(self._index._data.columns + frame._columns), + how=how, + keys=self._positions_from_column_names( + subset, offset_by_index_columns=True + ), + thresh=thresh, + ), + self._column_names, + self._index.names, ) result._copy_type_metadata(frame) - if self._index is not None: - result._index.name = self._index.name - if isinstance(self._index, cudf.MultiIndex): - result._index.names = self._index.names return result def _drop_na_columns(self, how="any", subset=None, thresh=None): @@ -2262,55 +2305,45 @@ def to_arrow(self): ) def drop_duplicates( - self, - subset=None, - keep="first", - nulls_are_equal=True, - ignore_index=False, + self, keep="first", nulls_are_equal=True, ): """ - Drops rows in frame as per duplicate rows in `subset` columns from - self. + Drop duplicate rows in frame. - subset : list, optional - List of columns to consider when dropping rows. - keep : ["first", "last", False] first will keep first of duplicate, - last will keep last of the duplicate and False drop all - duplicate - nulls_are_equal: null elements are considered equal to other null - elements - ignore_index: bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. + keep : ["first", "last", False], default "first" + "first" will keep the first duplicate entry, "last" will keep the + last duplicate entry, and False will drop all duplicates. + nulls_are_equal: bool, default True + Null elements are considered equal to other null elements. 
""" - if subset is None: - subset = self._column_names - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self._data.names - ): - subset = (subset,) - diff = set(subset) - set(self._data) - if len(diff) != 0: - raise KeyError(f"columns {diff} do not exist") - subset_cols = [name for name in self._column_names if name in subset] - if len(subset_cols) == 0: - return self.copy(deep=True) - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_duplicates( - self, - keys=subset, + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_duplicates( + list(self._columns), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, - ignore_index=ignore_index, - ) + ), + self._column_names, ) - + # TODO: _copy_type_metadata is a common pattern to apply after the + # roundtrip from libcudf. We should build this into a factory function + # to increase reusability. result._copy_type_metadata(self) return result + def _positions_from_column_names(self, column_names): + """Map each column name into their positions in the frame. + + The order of indices returned corresponds to the column order in this + Frame. + """ + return [ + i + for i, name in enumerate(self._column_names) + if name in set(column_names) + ] + def replace( self, to_replace=None, @@ -2589,7 +2622,10 @@ def _copy_type_metadata( self._index, cudf.core.index.CategoricalIndex ): self._index = cudf.Index( - cast(cudf.core.index.NumericIndex, self._index)._column + cast( + cudf.core.index.NumericIndex, self._index + )._column, + name=self._index.name, ) return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5ea9ac945dc..8f905ee6d49 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -86,6 +86,7 @@ def _lexsorted_equal_range( def _index_from_data(data: MutableMapping, name: Any = None): """Construct an index of the appropriate type from some data.""" + if len(data) == 0: raise ValueError("Cannot construct Index from any empty Table") if len(data) == 1: @@ -770,34 +771,6 @@ def deserialize(cls, header, frames): return super().deserialize(header, frames) - def drop_duplicates(self, keep="first"): - """ - Return Index with duplicate values removed - - Parameters - ---------- - keep : {‘first’, ‘last’, False}, default ‘first’ - * ‘first’ : Drop duplicates except for the - first occurrence. - * ‘last’ : Drop duplicates except for the - last occurrence. - * False : Drop all duplicates. 
-
-        Returns
-        -------
-        Index
-
-        Examples
-        --------
-        >>> import cudf
-        >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
-        >>> idx
-        StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object')
-        >>> idx.drop_duplicates()
-        StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object')
-        """  # noqa: E501
-        return super().drop_duplicates(keep=keep)
-
     def _binaryop(
         self,
         other: T,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index cf12907d96a..2044bad9675 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -8,17 +8,19 @@
 from uuid import uuid4

 import cupy as cp
+import numpy as np
 import pandas as pd
 from nvtx import annotate

 import cudf
+import cudf._lib as libcudf
 from cudf._typing import ColumnLike
-from cudf.api.types import is_categorical_dtype, is_list_like
+from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like
 from cudf.core.column import arange
 from cudf.core.frame import Frame
 from cudf.core.index import Index
 from cudf.core.multiindex import MultiIndex
-from cudf.utils.utils import cached_property
+from cudf.utils.utils import _gather_map_is_valid, cached_property


 def _indices_from_labels(obj, labels):
@@ -435,6 +437,113 @@ def sort_index(
             out = out.reset_index(drop=True)
         return self._mimic_inplace(out, inplace=inplace)

+    def _gather(
+        self, gather_map, keep_index=True, nullify=False, check_bounds=True
+    ):
+        """Gather rows of the frame specified by indices in `gather_map`.
+
+        Skip bounds checking if `check_bounds` is False.
+        Set rows to null for all out-of-bounds indices if `nullify` is `True`.
+        """
+        gather_map = cudf.core.column.as_column(gather_map)
+
+        # TODO: For performance, the check and conversion of gather map should
+        # be done by the caller. This check will be removed in a future release.
+        if not is_integer_dtype(gather_map.dtype):
+            gather_map = gather_map.astype("int32")
+
+        if not _gather_map_is_valid(
+            gather_map, len(self), check_bounds, nullify
+        ):
+            raise IndexError("Gather map index is out of bounds.")
+
+        result = self.__class__._from_columns(
+            libcudf.copying.gather(
+                list(self._index._columns + self._columns)
+                if keep_index
+                else list(self._columns),
+                gather_map,
+                nullify=nullify,
+            ),
+            self._column_names,
+            self._index.names if keep_index else None,
+        )
+
+        result._copy_type_metadata(self, include_index=keep_index)
+        return result
+
+    def _positions_from_column_names(
+        self, column_names, offset_by_index_columns=False
+    ):
+        """Map each column name to its position in the frame.
+
+        Return positions of the provided column names, offset by the number
+        of index columns if `offset_by_index_columns` is True. The order of
+        indices returned corresponds to the column order in this Frame.
+        """
+        num_index_columns = (
+            len(self._index._data) if offset_by_index_columns else 0
+        )
+        return [
+            i + num_index_columns
+            for i, name in enumerate(self._column_names)
+            if name in set(column_names)
+        ]
+
+    def drop_duplicates(
+        self,
+        subset=None,
+        keep="first",
+        nulls_are_equal=True,
+        ignore_index=False,
+    ):
+        """
+        Drop duplicate rows in frame.
+
+        subset : list, optional
+            List of columns to consider when dropping rows.
+        keep : ["first", "last", False]
+            "first" will keep the first duplicate entry, "last" will keep the
+            last duplicate entry, and False will drop all duplicates.
+        nulls_are_equal: bool, default True
+            Null elements are considered equal to other null elements.
+        ignore_index: bool, default False
+            If True, the resulting axis will be labeled 0, 1, ..., n - 1.
+        """
+        if subset is None:
+            subset = self._column_names
+        elif (
+            not np.iterable(subset)
+            or isinstance(subset, str)
+            or isinstance(subset, tuple)
+            and subset in self._data.names
+        ):
+            subset = (subset,)
+        diff = set(subset) - set(self._data)
+        if len(diff) != 0:
+            raise KeyError(f"columns {diff} do not exist")
+        subset_cols = [name for name in self._column_names if name in subset]
+        if len(subset_cols) == 0:
+            return self.copy(deep=True)
+
+        keys = self._positions_from_column_names(
+            subset, offset_by_index_columns=not ignore_index
+        )
+        result = self.__class__._from_columns(
+            libcudf.stream_compaction.drop_duplicates(
+                list(self._columns)
+                if ignore_index
+                else list(self._index._columns + self._columns),
+                keys=keys,
+                keep=keep,
+                nulls_are_equal=nulls_are_equal,
+            ),
+            self._column_names,
+            self._index.names if not ignore_index else None,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def sort_values(
         self,
         by,
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index a1eda697683..e0c68e56f63 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -137,9 +137,9 @@ def __init__(
             else:
                 level = cudf.DataFrame({column_name: levels[i]})

-            source_data[column_name] = libcudf.copying.gather(level, col)[0][
-                column_name
-            ]
+            source_data[column_name] = libcudf.copying.gather(
+                [level._data[column_name]], col
+            )[0]

         super().__init__(source_data)
         self._levels = levels
@@ -1409,7 +1409,7 @@ def fillna(self, value):
         return super().fillna(value=value)

     def unique(self):
-        return self.drop_duplicates(ignore_index=True)
+        return self.drop_duplicates(keep="first")

     def _clean_nulls_from_index(self):
         """
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index a9611a91554..cea384b9c11 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -12,6 +12,7 @@
 import rmm

 import cudf
+from cudf._lib.reduce import minmax
 from cudf.core import column
 from cudf.core.buffer import Buffer
 from cudf.utils.dtypes import to_cudf_compatible_scalar
@@ -506,3 +507,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
     if (indices == cp.arange(start, stop, step)).all():
         return slice(start, stop, step)
     return indices
+
+
+def _gather_map_is_valid(
+    gather_map: "cudf.core.column.ColumnBase",
+    nrows: int,
+    check_bounds: bool,
+    nullify: bool,
+) -> bool:
+    """Returns true if the gather map is valid.
+
+    A gather map is valid if empty or all indices are within the range
+    ``[-nrows, nrows)``, except when ``nullify`` is specified.
+    """
+    if not check_bounds or nullify or len(gather_map) == 0:
+        return True
+    gm_min, gm_max = minmax(gather_map)
+    return gm_min >= -nrows and gm_max < nrows

From 09a8a4773f74ef6241e9eac4e674181bc753de50 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 19 Nov 2021 16:57:25 -0600
Subject: [PATCH 19/72] Use stop instead of stop_. (#9735)

Small fix to inconsistent variable names in tests, following up from #9571.
Previous conversation: https://github.com/rapidsai/cudf/pull/9571#discussion_r750568195 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9735 --- cpp/tests/datetime/datetime_ops_test.cpp | 11 +++++------ cpp/tests/wrappers/timestamps_test.cu | 15 +++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 2097e09e674..4ac24317145 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -183,10 +183,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) using namespace cudf::datetime; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto timestamps = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}; @@ -221,9 +220,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponen using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{ {1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}, diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index b458f34cca8..097b786aefe 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -78,10 +78,9 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) using namespace cudf::test; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto chrono_col = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto chrono_col = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); // round-trip through the host to copy `chrono_col` values // to a new fixed_width_column_wrapper `primitive_col` @@ -135,14 +134,14 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) auto start_lhs = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT auto start_rhs = milliseconds(-2400000000000); // Tue, 12 Dec 1893 05:20:00 GMT - auto stop_lhs_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto stop_rhs_ = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT + auto stop_lhs = 
milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop_rhs = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT auto chrono_lhs_col = - generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs_)); + generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs)); auto chrono_rhs_col = - generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs_)); + generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs)); rmm::device_uvector indices(this->size(), rmm::cuda_stream_default); thrust::sequence(rmm::exec_policy(), indices.begin(), indices.end()); From f0367c0e1ebec54c964a2114b248926b8f82ec04 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 19 Nov 2021 15:06:25 -0800 Subject: [PATCH 20/72] Use cuFile direct device reads/writes by default in cuIO (#9722) Making this change early in 22.02 to test through internal use + nightly builds before the release. - Modify the way cuFile integration is enabled to match the nvCOMP integration. - Change the default from OFF to GDS (GDS on, only for direct reads/writes, no compatibility mode). - cuFile JSON config file is now modified on first cuFile use (same time as the driver), instead of the first query that checks if GDS use is enabled. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9722 --- cpp/CMakeLists.txt | 1 + cpp/src/io/utilities/config_utils.cpp | 86 ++++++++++++++ cpp/src/io/utilities/config_utils.hpp | 37 +++--- cpp/src/io/utilities/datasource.cpp | 9 +- cpp/src/io/utilities/file_io_utilities.cpp | 110 +++++++++--------- cpp/src/io/utilities/file_io_utilities.hpp | 26 ----- .../cudf/source/basics/io-gds-integration.rst | 11 +- .../source/basics/io-nvcomp-integration.rst | 7 +- 8 files changed, 171 insertions(+), 116 deletions(-) create mode 100644 cpp/src/io/utilities/config_utils.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 966728d7647..7a556d2c0f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -310,6 +310,7 @@ add_library( src/io/statistics/parquet_column_statistics.cu src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp + src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp new file mode 100644 index 00000000000..2c1dc1cc0aa --- /dev/null +++ b/cpp/src/io/utilities/config_utils.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config_utils.hpp" + +#include + +#include +#include + +namespace cudf::io::detail { + +std::string getenv_or(std::string const& env_var_name, std::string_view default_val) +{ + auto const env_val = std::getenv(env_var_name.c_str()); + return std::string{(env_val == nullptr) ? default_val : env_val}; +} + +namespace cufile_integration { + +namespace { +/** + * @brief Defines which cuFile usage to enable. + */ +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "GDS") return usage_policy::GDS; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); +} +} // namespace + +bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } + +} // namespace cufile_integration + +namespace nvcomp_integration { + +namespace { +/** + * @brief Defines which nvCOMP usage to enable. + */ +enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "STABLE") return usage_policy::STABLE; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_NVCOMP_POLICY value: " + env_val); +} +} // namespace + +bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } + +} // namespace nvcomp_integration + +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index a1d8e747e44..baa45fef08a 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include namespace cudf::io::detail { @@ -24,44 +23,34 @@ namespace cudf::io::detail { * @brief Returns the value of the environment variable, or a default value if the variable is not * present. */ -inline std::string getenv_or(std::string const& env_var_name, std::string_view default_val) -{ - auto const env_val = std::getenv(env_var_name.c_str()); - return std::string{(env_val == nullptr) ? default_val : env_val}; -} +std::string getenv_or(std::string const& env_var_name, std::string_view default_val); -namespace nvcomp_integration { +namespace cufile_integration { -namespace { /** - * @brief Defines which nvCOMP usage to enable. + * @brief Returns true if cuFile and its compatibility mode are enabled. */ -enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; +bool is_always_enabled(); /** - * @brief Get the current usage policy. + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). 
*/ -inline usage_policy get_env_policy() -{ - static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); - if (env_val == "OFF") return usage_policy::OFF; - if (env_val == "ALWAYS") return usage_policy::ALWAYS; - return usage_policy::STABLE; -} -} // namespace +bool is_gds_enabled(); + +} // namespace cufile_integration + +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. */ -inline bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } +bool is_all_enabled(); /** * @brief Returns true if stable nvCOMP use is enabled. */ -inline bool is_stable_enabled() -{ - return is_all_enabled() or get_env_policy() == usage_policy::STABLE; -} +bool is_stable_enabled(); } // namespace nvcomp_integration + } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7afffaede9e..3de6f35cb0d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -14,15 +14,16 @@ * limitations under the License. */ +#include "file_io_utilities.hpp" + #include +#include +#include #include #include #include -#include -#include "file_io_utilities.hpp" - namespace cudf { namespace io { namespace { @@ -239,7 +240,7 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_config::instance()->is_required()) { + if (detail::cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 387452e171a..7a48b7d7301 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -51,45 +51,14 @@ file_wrapper::~file_wrapper() { close(fd); } #ifdef CUFILE_FOUND -cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} -{ - if (is_enabled()) { - // Modify the config file based on the policy - auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); - std::ifstream user_config_file(config_file_path); - // Modified config file is stored in a temporary directory - auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; - std::ofstream cudf_config_file(cudf_config_path); - - std::string line; - while (std::getline(user_config_file, line)) { - std::string const tag = "\"allow_compat_mode\""; - if (line.find(tag) != std::string::npos) { - // TODO: only replace the true/false value - // Enable compatiblity mode when cuDF does not fall back to host path - cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; - } else { - cudf_config_file << line << '\n'; - } - - // Point libcufile to the modified config file - CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, - "Failed to set the cuFile config file environment variable."); - } - } -} -cufile_config const* cufile_config::instance() -{ - static cufile_config _instance; - return &_instance; -} - /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
*/ class cufile_shim { private: cufile_shim(); + void modify_cufile_json() const; + void load_cufile_lib(); void* cf_lib = nullptr; decltype(cuFileDriverOpen)* driver_open = nullptr; @@ -116,25 +85,60 @@ class cufile_shim { decltype(cuFileWrite)* write = nullptr; }; +void cufile_shim::modify_cufile_json() const +{ + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value instead of replacing the whole line + // Enable compatibility mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " + << (cufile_integration::is_always_enabled() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } +} + +void cufile_shim::load_cufile_lib() +{ + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); +} + cufile_shim::cufile_shim() { try { - cf_lib = dlopen("libcufile.so", RTLD_NOW); - driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); - CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); - driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); - CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); - handle_register = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); - CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); - handle_deregister = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); - CUDF_EXPECTS(handle_deregister != nullptr, - "could not find cuFile cuFileHandleDeregister symbol"); - read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); - CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); - write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); - CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite 
symbol"); + modify_cufile_json(); + load_cufile_lib(); CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); } catch (cudf::logic_error const& err) { @@ -285,11 +289,11 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse std::unique_ptr make_cufile_input(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif @@ -299,11 +303,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath std::unique_ptr make_cufile_output(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 0119f9b7abd..ede0eb6f925 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -162,32 +162,6 @@ class cufile_output : public cufile_io_base { class cufile_shim; -/** - * @brief Class that manages cuFile configuration. - */ -class cufile_config { - std::string const default_policy = "OFF"; - std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - - std::string const policy = default_policy; - temp_directory tmp_config_dir{"cudf_cufile_config"}; - - cufile_config(); - - public: - /** - * @brief Returns true when cuFile use is enabled. - */ - bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } - - /** - * @brief Returns true when cuDF should not fall back to host IO. - */ - bool is_required() const { return policy == "ALWAYS"; } - - static cufile_config const* instance(); -}; - /** * @brief Class that provides RAII for cuFile file registration. */ diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 29cbc2024fc..20f3ec87ccb 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -5,17 +5,18 @@ Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. +GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. +Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. This variable also controls the GDS compatibility mode. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "OFF": Compretely disable GDS use. +- "OFF": Completely disable GDS use. 
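To make the three policy values concrete, here is a minimal shell sketch; only the variable name and its values come from the documentation above, and the workload script name is a placeholder:

```bash
# GDS on, compatibility mode off; this is now the default, so setting the
# variable explicitly is optional.
export LIBCUDF_CUFILE_POLICY=GDS

# Alternatives:
#   export LIBCUDF_CUFILE_POLICY=ALWAYS   # GDS on, compatibility mode on
#   export LIBCUDF_CUFILE_POLICY=OFF      # disable GDS entirely

python my_cudf_io_workload.py  # placeholder for a script that reads/writes Parquet, ORC, etc.
```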
-Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. +If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. @@ -30,5 +31,3 @@ Operations that support the use of GPUDirect Storage: - `to_csv` - `to_parquet` - `to_orc` - -NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index af89ab5285f..521833e2afd 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -3,15 +3,16 @@ nvCOMP Integration Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. -Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. +Which implementation is used by default depends on the data format and the compression type. +Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. -Any other value (or no value set) will result in the same behavior as the "STABLE" option. +If no value is set, behavior will be the same as the "STABLE" option. .. table:: Current policy for nvCOMP use for different types From 65af9a301acd19784fe7d2d03702be827ce97661 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 16:02:13 -0800 Subject: [PATCH 21/72] Improve cmake format script (#9723) This PR ports some improvements from rapidsai/rmm#913. 
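As a usage sketch assembled from the comment block in the script below (the input paths are illustrative; the hook names come from `.pre-commit-config.yaml`):

```bash
# Run the linters through pre-commit:
pre-commit run cmake-format
pre-commit run cmake-lint

# Or invoke the wrapper directly on specific files:
./cpp/scripts/run-cmake-format.sh cmake-format cpp/CMakeLists.txt

# Point the wrapper at a nonstandard rapids-cmake config file if needed:
RAPIDS_CMAKE_FORMAT_FILE=/path/to/cmake-format-rapids-cmake.json \
    ./cpp/scripts/run-cmake-format.sh cmake-lint cpp/CMakeLists.txt
```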
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9723 --- .pre-commit-config.yaml | 4 ++-- cpp/scripts/run-cmake-format.sh | 32 +++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e993f548e1d..1e1ad94ab0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,7 +72,7 @@ repos: args: ['-fallback-style=none'] - id: cmake-format name: cmake-format - entry: bash cpp/scripts/run-cmake-format.sh cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions @@ -81,7 +81,7 @@ repos: - cmake-format==0.6.11 - id: cmake-lint name: cmake-lint - entry: bash cpp/scripts/run-cmake-format.sh cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index 76de008b14a..9c981c6cdaa 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is a pre-commit hook that wraps cmakelang's cmake linters. The +# This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for # cmakelang linters between a local config file and a second config file that's # shared across all of RAPIDS via rapids-cmake. In order to keep it up to date @@ -16,19 +16,33 @@ # config file at a nonstandard location, they may do so by setting the # environment variable RAPIDS_CMAKE_FORMAT_FILE. # -# While this script can be invoked directly (but only from the repo root since -# all paths are relative to that), it is advisable to instead use the -# pre-commit hooks via -# `pre-commit run (cmake-format)|(cmake-format)`. +# This script can be invoked directly anywhere within the project repository. +# Alternatively, it may be invoked as a pre-commit hook via +# `pre-commit run (cmake-format)|(cmake-lint)`. # # Usage: # bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...] -# Note that pre-commit always runs from the root of the repository, so relative -# paths are automatically relative to the repo root. +status=0 +if [ -z ${CUDF_ROOT:+PLACEHOLDER} ]; then + CUDF_BUILD_DIR=$(git rev-parse --show-toplevel 2>&1)/cpp/build + status=$? +else + CUDF_BUILD_DIR=${CUDF_ROOT} +fi + +if ! [ ${status} -eq 0 ]; then + if [[ ${CUDF_BUILD_DIR} == *"not a git repository"* ]]; then + echo "This script must be run inside the cudf repository, or the CUDF_ROOT environment variable must be set." 
+ else + echo "Script failed with unknown error attempting to determine project root:" + echo ${CUDF_BUILD_DIR} + fi + exit 1 +fi + DEFAULT_FORMAT_FILE_LOCATIONS=( - "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" - "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${CUDF_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" ) From 43a13c6aac76a2a5a42674b4e3e05dbb65ddb741 Mon Sep 17 00:00:00 2001 From: Peixin Date: Mon, 22 Nov 2021 13:43:19 +0800 Subject: [PATCH 22/72] Skip cufile tests in JNI build script (#9744) Signed-off-by: Peixin Li related to #9722 skip cufile test in JNI build while we have a separate pipeline for GDS testing Authors: - Peixin (https://github.com/pxLi) Approvers: - Tim Liu (https://github.com/NvTimLiu) - Gary Shen (https://github.com/GaryShen2008) URL: https://github.com/rapidsai/cudf/pull/9744 --- java/ci/build-in-docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index e596cdae5b3..df4ca853398 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -16,7 +16,7 @@ # limitations under the License. # -set -e +set -ex gcc --version PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} @@ -58,7 +58,7 @@ make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX ###### Build cudf jar ###### -BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS" +BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS -Dtest=*,!CuFileTest" if [ "$SIGN_FILE" == true ]; then # Build javadoc and sources only when SIGN_FILE is true BUILD_ARG="$BUILD_ARG -Prelease" From 7fa15db306631c026642942993283bd93da1c7c2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 09:33:47 -0500 Subject: [PATCH 23/72] Fix doxygen for enum types in libcudf (#9724) Fix some doxygen formatting errors around enum types found when looking at various pages in the published docs: https://docs.rapids.ai/api/libcudf/stable/namespacecudf.html Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9724 --- .../cudf/ast/detail/expression_parser.hpp | 6 +-- cpp/include/cudf/ast/expressions.hpp | 6 +-- cpp/include/cudf/copying.hpp | 8 ++-- cpp/include/cudf/detail/gather.cuh | 6 +-- cpp/include/cudf/detail/structs/utilities.hpp | 4 +- cpp/include/cudf/io/types.hpp | 6 +-- .../cudf/strings/char_types/char_types.hpp | 20 ++++---- cpp/include/cudf/strings/regex/flags.hpp | 6 +-- cpp/include/cudf/strings/strip.hpp | 6 +-- cpp/include/cudf/strings/translate.hpp | 5 +- cpp/include/cudf/unary.hpp | 46 +++++++++---------- 11 files changed, 61 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index dc800bde527..4f73cb1ef6e 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -37,9 +37,9 @@ namespace detail { * linearization 
process but cannot be explicitly created by the user. */ enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value + COLUMN, ///< A value in a table column + LITERAL, ///< A literal value + INTERMEDIATE ///< An internal temporary value }; /** diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 7ae40a7d65f..20aaa42fb68 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -122,9 +122,9 @@ enum class ast_operator { * This determines which table to use in cases with two tables (e.g. joins). */ enum class table_reference { - LEFT, // Column index in the left table - RIGHT, // Column index in the right table - OUTPUT // Column index in the output table + LEFT, ///< Column index in the left table + RIGHT, ///< Column index in the right table + OUTPUT ///< Column index in the output table }; /** diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index ba5043fb261..81dddbd284a 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -41,8 +41,8 @@ namespace cudf { */ enum class out_of_bounds_policy : bool { - NULLIFY, /// Output values corresponding to out-of-bounds indices are null - DONT_CHECK /// No bounds checking is performed, better performance + NULLIFY, ///< Output values corresponding to out-of-bounds indices are null + DONT_CHECK ///< No bounds checking is performed, better performance }; /** @@ -901,8 +901,8 @@ std::unique_ptr get_element( * @brief Indicates whether a row can be sampled more than once. */ enum class sample_with_replacement : bool { - FALSE, // A row can be sampled only once - TRUE // A row can be sampled more than once + FALSE, ///< A row can be sampled only once + TRUE ///< A row can be sampled more than once }; /** diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 594191d275d..08dbdb6f1a0 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -66,9 +66,9 @@ struct bounds_checker { * @brief The operation to perform when a gather map index is out of bounds */ enum class gather_bitmask_op { - DONT_CHECK, // Don't check for out of bounds indices - PASSTHROUGH, // Preserve mask at rows with out of bounds indices - NULLIFY, // Nullify rows with out of bounds indices + DONT_CHECK, ///< Don't check for out of bounds indices + PASSTHROUGH, ///< Preserve mask at rows with out of bounds indices + NULLIFY, ///< Nullify rows with out of bounds indices }; template diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index aece79107c6..6f32e3190bf 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -28,8 +28,8 @@ namespace structs { namespace detail { enum class column_nullability { - MATCH_INCOMING, // generate a null column if the incoming column has nulls - FORCE // always generate a null column + MATCH_INCOMING, ///< generate a null column if the incoming column has nulls + FORCE ///< always generate a null column }; /** diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index ac965e2d416..cf6be8a20af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -87,9 +87,9 @@ enum class quote_style { * @brief Column statistics granularity type for parquet/orc writers */ enum statistics_freq { - STATISTICS_NONE = 0, //!< 
No column statistics - STATISTICS_ROWGROUP = 1, //!< Per-Rowgroup column statistics - STATISTICS_PAGE = 2, //!< Per-page column statistics + STATISTICS_NONE = 0, ///< No column statistics + STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics + STATISTICS_PAGE = 2, ///< Per-page column statistics }; /** diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 2af79de0716..04d65065bd3 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -37,16 +37,16 @@ namespace strings { * does not match to any explicitly named enumerator. */ enum string_character_types : uint32_t { - DECIMAL = 1 << 0, /// all decimal characters - NUMERIC = 1 << 1, /// all numeric characters - DIGIT = 1 << 2, /// all digit characters - ALPHA = 1 << 3, /// all alphabetic characters - SPACE = 1 << 4, /// all space characters - UPPER = 1 << 5, /// all upper case characters - LOWER = 1 << 6, /// all lower case characters - ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, /// all alphanumeric characters - CASE_TYPES = UPPER | LOWER, /// all case-able characters - ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE /// all character types + DECIMAL = 1 << 0, ///< all decimal characters + NUMERIC = 1 << 1, ///< all numeric characters + DIGIT = 1 << 2, ///< all digit characters + ALPHA = 1 << 3, ///< all alphabetic characters + SPACE = 1 << 4, ///< all space characters + UPPER = 1 << 5, ///< all upper case characters + LOWER = 1 << 6, ///< all lower case characters + ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, ///< all alphanumeric characters + CASE_TYPES = UPPER | LOWER, ///< all case-able characters + ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE ///< all character types }; /** diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f6aee6d22cc..637b3b0851b 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -33,9 +33,9 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, /// default - MULTILINE = 8, /// the '^' and '$' honor new-line characters - DOTALL = 16 /// the '.' matching includes new-line characters + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16 ///< the '.' matching includes new-line characters }; /** diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 72863bdf23b..fe9cd41e780 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -31,9 +31,9 @@ namespace strings { * @brief Direction identifier for strip() function. 
*/ enum class strip_type { - LEFT, //<< strip characters from the beginning of the string - RIGHT, //<< strip characters from the end of the string - BOTH //<< strip characters from the beginning and end of the string + LEFT, ///< strip characters from the beginning of the string + RIGHT, ///< strip characters from the end of the string + BOTH ///< strip characters from the beginning and end of the string }; /** diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index e014f88c451..0cbf6b22029 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -60,7 +60,10 @@ std::unique_ptr translate( /** * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters */ -enum class filter_type : bool { KEEP, REMOVE }; +enum class filter_type : bool { + KEEP, ///< All characters but those specified are removed + REMOVE ///< Only the specified characters are removed +}; /** * @brief Removes ranges of characters from each string in a strings column. diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 254a7988e2e..36f08b7f23e 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,28 +28,28 @@ namespace cudf { */ enum class unary_operator : int32_t { - SIN, // < Trigonometric sine - COS, // < Trigonometric cosine - TAN, // < Trigonometric tangent - ARCSIN, // < Trigonometric sine inverse - ARCCOS, // < Trigonometric cosine inverse - ARCTAN, // < Trigonometric tangent inverse - SINH, // < Hyperbolic sine - COSH, // < Hyperbolic cosine - TANH, // < Hyperbolic tangent - ARCSINH, // < Hyperbolic sine inverse - ARCCOSH, // < Hyperbolic cosine inverse - ARCTANH, // < Hyperbolic tangent inverse - EXP, // < Exponential (base e, Euler number) - LOG, // < Natural Logarithm (base e) - SQRT, // < Square-root (x^0.5) - CBRT, // < Cube-root (x^(1.0/3)) - CEIL, // < Smallest integer value not less than arg - FLOOR, // < largest integer value not greater than arg - ABS, // < Absolute value - RINT, // < Rounds the floating-point argument arg to an integer value - BIT_INVERT, // < Bitwise Not (~) - NOT, // < Logical Not (!) + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) 
}; /** From cac53c5b7f4845faea935b29a6efb323eff56a19 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Nov 2021 10:42:59 -0800 Subject: [PATCH 24/72] Enable string to decimal 128 cast (#9742) A short PR to enable String to Decimal 128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9742 --- java/src/main/native/src/ColumnViewJni.cpp | 1 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index bce330ea4a3..4efac307627 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -936,6 +936,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas break; case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::to_fixed_point(*column, n_data_type); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f332661dc19..a582541a0d4 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3370,6 +3370,22 @@ void testFixedWidthCast() { } } + @Test + void testCastStringToBigDecimal() { + String[] bigValues = {"923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002" + }; + + try (ColumnVector cv = ColumnVector.fromStrings(bigValues); + ColumnVector values = cv.castTo(DType.create(DType.DTypeEnum.DECIMAL128, -3)); + ColumnVector expected = ColumnVector.fromDecimals(new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002"))) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastByteToString() { From ebeb2023ce81f254aaa638c0cd308da98b15418d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 14:23:13 -0500 Subject: [PATCH 25/72] Fix out-of-bounds memory write in decimal128-to-string conversion (#9740) This fixes an error found in a memcheck test referenced here: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=cuda115,LINUX_VER=centos7,PYTHON=3.8/5082/ This also disables the `FixedPointStringConversionOperator` which fails on a Debug build and may be a bug in `std::string`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9740 --- cpp/src/strings/convert/utilities.cuh | 7 ++++--- cpp/tests/strings/fixed_point_tests.cpp | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 234ecf48f2e..d9ca8159706 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -67,8 +67,9 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) bool const is_negative = cuda::std::is_signed() ? 
(value < 0) : false; constexpr IntegerType base = 10; - constexpr int MAX_DIGITS = 20; // largest 64-bit integer is 20 digits - char digits[MAX_DIGITS]; // place-holder for digit chars + // largest 64-bit integer is 20 digits; largest 128-bit integer is 39 digits + constexpr int MAX_DIGITS = cuda::std::numeric_limits::digits10 + 1; + char digits[MAX_DIGITS]; // place-holder for digit chars int digits_idx = 0; while (value != 0) { assert(digits_idx < MAX_DIGITS); @@ -107,7 +108,7 @@ constexpr size_type count_digits(IntegerType value) auto const digits = [value] { // largest 8-byte unsigned value is 18446744073709551615 (20 digits) // largest 16-byte unsigned value is 340282366920938463463374607431768211455 (39 digits) - auto constexpr max_digits = std::is_same_v ? 39 : 20; + auto constexpr max_digits = cuda::std::numeric_limits::digits10 + 1; size_type digits = 1; __int128_t pow10 = 10; diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 7c188d39f6f..ce4280e0733 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -303,7 +303,11 @@ TEST_F(StringsConvertTest, IsFixedPoint) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled); } +#ifdef NDEBUG TEST_F(StringsConvertTest, FixedPointStringConversionOperator) +#else +TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) +#endif { auto const max = cuda::std::numeric_limits<__int128_t>::max(); From 9fc35b77eea84dfddf4e5adf0f38d3223644c5d7 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 22 Nov 2021 15:15:08 -0500 Subject: [PATCH 26/72] Update `DEFAULT_CUDA_VER` in `ci/cpu/prebuild.sh` (#9749) Now that we only do `11.5` builds for RAPIDS, the `DEFAULT_CUDA_VER` variable in `ci/cpu/prebuild.sh` should be set to `11.5` so that the rest of the logic in the file works correctly. --- ci/cpu/prebuild.sh | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index a9bc1f4c605..746c0005a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,15 +3,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
set -e -ARCH=$(arch) -if [ "${ARCH}" = "x86_64" ]; then - DEFAULT_CUDA_VER="11.0" -elif [ "${ARCH}" = "aarch64" ]; then - DEFAULT_CUDA_VER="11.2" -else - echo "Unsupported arch ${ARCH}" - exit 1 -fi +DEFAULT_CUDA_VER="11.5" #Always upload cudf Python package export UPLOAD_CUDF=1 From 8e2ac44c0cfbcef045db54e00b03d1180b36f2b9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 22 Nov 2021 14:20:08 -0600 Subject: [PATCH 27/72] [REVIEW] Pin max `dask` & `distributed` versions (#9734) * pin max dask version * Update conda/recipes/dask-cudf/meta.yaml Co-authored-by: Bradley Dice * Apply suggestions from code review Co-authored-by: jakirkham Co-authored-by: Bradley Dice Co-authored-by: jakirkham --- ci/benchmark/build.sh | 2 +- ci/gpu/build.sh | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 4 ++-- conda/environments/cudf_dev_cuda11.2.yml | 4 ++-- conda/environments/cudf_dev_cuda11.5.yml | 4 ++-- conda/recipes/custreamz/meta.yaml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/custreamz/dev_requirements.txt | 4 ++-- python/dask_cudf/dev_requirements.txt | 4 ++-- python/dask_cudf/setup.py | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index bc82f638171..979db1b5034 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e2e95c34650..664e774c68a 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 4d106409e1e..e2ead779861 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 46785603c0e..6146d84835a 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 635d838640d..043c81c9e01 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - 
streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index dc3a17f03ab..db8af9b0bed 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -31,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 2a88827ad94..d90de2d628c 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -27,14 +27,14 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index de39ab4c7c1..6f1c09947d5 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.6.4 -dask>=2021.09.1 -distributed>=2021.09.1 +dask>=2021.11.1,<=2021.11.2 +distributed>=2021.11.1,<=2021.11.2 streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index d9dc19bfb7f..db85515f379 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -dask>=2021.09.1 -distributed>=2021.09.1 +dask>=2021.11.1,<=2021.11.2 +distributed>=2021.11.1,<=2021.11.2 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 1a9c2ff048a..b52c2ea37d6 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.09.1", - "distributed>=2021.09.1", + "dask>=2021.11.1,<=2021.11.2", + "distributed>=2021.11.1,<=2021.11.2", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0", From d1811b5baf1d83f8d376a4f6e7fd84020a24506b Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 24 Nov 2021 01:22:13 +0530 Subject: [PATCH 28/72] update cuda version in local build (#9736) update cuda, ubuntu, python versions in local build using gpuci docker image. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/9736 --- ci/local/README.md | 6 +++--- ci/local/build.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/local/README.md b/ci/local/README.md index 96002802263..7754bcaf647 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,12 +18,12 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` +`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:22.02-cuda11.5-devel-ubuntu20.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. +The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda11.5-ubuntu20.04-py3.8/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. diff --git a/ci/local/build.sh b/ci/local/build.sh index 1bfb8b63fef..345db967264 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -3,7 +3,7 @@ GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.0-devel-ubuntu18.04-py3.7" +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" From 8d9d22231b983f0a85ce594dc9758ab6a6c09559 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 29 Nov 2021 09:26:03 -0600 Subject: [PATCH 29/72] [FIX] Add `arrow_dataset` and `parquet` targets to build exports (#9491) This PR adds the `arrow_dataset` and `parquet` targets to the build export when Arrow is built from source by CPM, similar to what we have to do today for `arrow` and `arrow_cuda` targets. 
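With the targets exported, a downstream project configured against a cudf build
tree can resolve everything through a single `find_package(cudf)` call. A
minimal sketch of such a consumer (the project name, target name, and the
`-Dcudf_ROOT` configure flag are hypothetical, not part of this PR):

    # CMakeLists.txt of a hypothetical downstream project, configured with
    # -Dcudf_ROOT=<cudf build dir> so that find_package() uses the build-tree export.
    cmake_minimum_required(VERSION 3.20)
    project(cudf_consumer LANGUAGES CXX CUDA)

    # With this change, the generated cudf-config also resolves the Parquet and
    # ArrowDataset dependencies, so no extra find_package() calls are needed even
    # when Arrow was built from source with CUDF_ENABLE_ARROW_PARQUET=ON.
    find_package(cudf REQUIRED)

    add_executable(consumer consumer.cpp)
    target_link_libraries(consumer PRIVATE cudf::cudf)

Without the `Parquet` and `ArrowDataset` entries in the export, the
`find_dependency` chain stops at `arrow`/`arrow_cuda`, and the
`parquet_shared`/`arrow_dataset_shared` targets remain undefined when the
consumer configures against the build tree.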
--- cpp/CMakeLists.txt | 21 ++++++ cpp/cmake/thirdparty/get_arrow.cmake | 96 +++++++++++++++++++++++----- 2 files changed, 100 insertions(+), 17 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 966728d7647..59dc3c74af2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -733,6 +733,27 @@ set(install_code_string [=[ set(ArrowCUDA_DIR "${Arrow_DIR}") find_dependency(ArrowCUDA) +]=] +) + +if(CUDF_ENABLE_ARROW_PARQUET) + string( + APPEND + install_code_string + [=[ + if(NOT Parquet_DIR) + set(Parquet_DIR "${Arrow_DIR}") + endif() + set(ArrowDataset_DIR "${Arrow_DIR}") + find_dependency(ArrowDataset) + ]=] + ) +endif() + +string( + APPEND + install_code_string + [=[ if(testing IN_LIST cudf_FIND_COMPONENTS) enable_language(CUDA) if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 5fe37402096..ae1448da502 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -90,7 +90,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared arrow_cuda_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_cuda_shared arrow_dataset_shared CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -142,6 +142,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ArrowCUDA_DIR "${Arrow_DIR}") find_package(Arrow REQUIRED QUIET) find_package(ArrowCUDA REQUIRED QUIET) + if(ENABLE_PARQUET) + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") + endif() + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. 
@@ -182,24 +191,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) + set(arrow_code_string [=[ - if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) - add_library(arrow_shared ALIAS cudf::arrow_shared) - endif() - if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) - add_library(arrow_static ALIAS cudf::arrow_static) - endif() - ]=] - ) - set(arrow_cuda_code_string - [=[ - if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) - add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) - endif() - if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) - add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) - endif() + if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() ]=] ) @@ -212,6 +212,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB FINAL_CODE_BLOCK arrow_code_string ) + set(arrow_cuda_code_string + [=[ + if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) + add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) + endif() + if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) + add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) + endif() + ]=] + ) + rapids_export( BUILD ArrowCUDA VERSION ${VERSION} @@ -220,6 +231,49 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_cuda_code_string ) + + if(ENABLE_PARQUET) + + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() endif() # We generate the arrow-config and arrowcuda-config files when we built arrow locally, so always # do `find_dependency` @@ -230,10 +284,18 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # ArrowCUDA_DIR to be where Arrow was found, since Arrow packages ArrowCUDA.config in a # non-standard location rapids_export_package(BUILD ArrowCUDA cudf-exports) + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) + endif() include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) rapids_export_find_package_root(BUILD ArrowCUDA 
[=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + if(ENABLE_PARQUET) + rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + endif() set(ARROW_FOUND "${ARROW_FOUND}" From a1ca8c1e408ac1791c4f4bae563e775bbddb5a29 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 29 Nov 2021 11:32:43 -0500 Subject: [PATCH 30/72] Use ptxcompiler to patch Numba at runtime to support CUDA enhanced compatibility. (#9687) --- conda/environments/cudf_dev_cuda11.0.yml | 3 +- conda/environments/cudf_dev_cuda11.2.yml | 3 +- conda/environments/cudf_dev_cuda11.5.yml | 3 +- conda/recipes/cudf/meta.yaml | 5 +- python/cudf/cudf/__init__.py | 11 ++++ .../cudf/tests/test_extension_compilation.py | 57 ++++++++++++------- 6 files changed, 56 insertions(+), 26 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index e2ead779861..7c22b4d35e3 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -14,7 +14,7 @@ dependencies: - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 6146d84835a..0978ae7c8f9 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -14,7 +14,7 @@ dependencies: - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 043c81c9e01..d2d0a38c44e 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -14,7 +14,7 @@ dependencies: - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 6d56b0c0c94..46eefbc825f 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -29,7 +29,7 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - numba >=0.53.1 + - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - pyarrow 5.0.0 *cuda - libcudf {{ version }} @@ -41,7 +41,7 @@ requirements: - typing_extensions - pandas >=1.0,<1.4.0dev0 - cupy >=9.5.0,<10.0.0a0 - - numba >=0.53.1 + - numba >=0.54 - numpy - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda - fastavro >=0.22.0 @@ -51,6 +51,7 @@ requirements: - nvtx >=0.2.1 - packaging - cachetools + - ptxcompiler # [linux64] # CUDA enhanced compatibility. 
See https://github.com/rapidsai/ptxcompiler test: # [linux64] requires: # [linux64] diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index bc35551b5bd..b24e71e7785 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -102,6 +102,17 @@ from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import set_allocator +try: + from ptxcompiler.patch import patch_numba_codegen_if_needed +except ImportError: + pass +else: + # Patch Numba to support CUDA enhanced compatibility. + # See https://github.com/rapidsai/ptxcompiler for + # details. + patch_numba_codegen_if_needed() + del patch_numba_codegen_if_needed + cuda.set_memory_manager(rmm.RMMNumbaManager) cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 39fa7b11ce2..47c9448cf63 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,5 +1,6 @@ import operator +import cupy as cp import pytest from numba import cuda, types from numba.cuda import compile_ptx @@ -71,8 +72,8 @@ def test_execute_masked_binary(op, ty): def func(x, y): return op(x, y) - @cuda.jit(debug=True) - def test_kernel(x, y): + @cuda.jit + def test_kernel(x, y, err): # Reference result with unmasked value u = func(x, y) @@ -87,14 +88,22 @@ def test_kernel(x, y): # Check masks are as expected, and unmasked result matches masked # result if r0.valid: - raise RuntimeError("Expected r0 to be invalid") + # TODO: ideally, we would raise an exception here rather + # than return an "error code", and that is what the + # previous version of this (and below) tests did. But, + # Numba kernels cannot currently use `debug=True` with + # CUDA enhanced compatibility. Once a solution to that is + # reached, we should switch back to raising exceptions + # here. 
+ err[0] = 1 if not r1.valid: - raise RuntimeError("Expected r1 to be valid") + err[0] = 2 if u != r1.value: - print("Values: ", u, r1.value) - raise RuntimeError("u != r1.value") + err[0] = 3 - test_kernel[1, 1](1, 2) + err = cp.asarray([0], dtype="int8") + test_kernel[1, 1](1, 2, err) + assert err[0] == 0 @pytest.mark.parametrize("op", ops) @@ -187,18 +196,20 @@ def test_is_na(fn): device_fn = cuda.jit(device=True)(fn) - @cuda.jit(debug=True) - def test_kernel(): + @cuda.jit + def test_kernel(err): valid_is_na = device_fn(valid) invalid_is_na = device_fn(invalid) if valid_is_na: - raise RuntimeError("Valid masked value is NA and should not be") + err[0] = 1 if not invalid_is_na: - raise RuntimeError("Invalid masked value is not NA and should be") + err[0] = 2 - test_kernel[1, 1]() + err = cp.asarray([0], dtype="int8") + test_kernel[1, 1](err) + assert err[0] == 0 def func_lt_na(x): @@ -271,8 +282,8 @@ def test_na_masked_comparisons(fn, ty): device_fn = cuda.jit(device=True)(fn) - @cuda.jit(debug=True) - def test_kernel(): + @cuda.jit + def test_kernel(err): unmasked = ty(1) valid_masked = Masked(unmasked, True) invalid_masked = Masked(unmasked, False) @@ -281,12 +292,14 @@ def test_kernel(): invalid_cmp_na = device_fn(invalid_masked) if valid_cmp_na: - raise RuntimeError("Valid masked value compared True with NA") + err[0] = 1 if invalid_cmp_na: - raise RuntimeError("Invalid masked value compared True with NA") + err[0] = 2 - test_kernel[1, 1]() + err = cp.asarray([0], dtype="int8") + test_kernel[1, 1](err) + assert err[0] == 0 # xfail because scalars do not yet cast for a comparison to NA @@ -297,13 +310,15 @@ def test_na_scalar_comparisons(fn, ty): device_fn = cuda.jit(device=True)(fn) - @cuda.jit(debug=True) - def test_kernel(): + @cuda.jit + def test_kernel(err): unmasked = ty(1) unmasked_cmp_na = device_fn(unmasked) if unmasked_cmp_na: - raise RuntimeError("Unmasked value compared True with NA") + err[0] = 1 - test_kernel[1, 1]() + err = cp.asarray([0], dtype="int8") + test_kernel[1, 1](err) + assert err[0] == 0 From 0ebeffa4c8122cd1f54fe9fc05c4bec660b7e37e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 23 Nov 2021 16:14:17 -0500 Subject: [PATCH 31/72] Only run runtime jit tests with CUDA 11.5 runtime CI runs all tests over a variety of different hardware. Tests that have to use NVRTC to re-compile cudf C++ code are only supported on CUDA 11.5+. 
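The guard keys off `cudaRuntimeGetVersion`, which encodes the version as
1000 * major + 10 * minor, so CUDA 11.5 reports 11050. A standalone sketch of
the same logic that the new `runtime_support.h` helper applies (illustrative
only, not part of this patch):

    #include <cuda_runtime_api.h>

    #include <cstdio>

    int main()
    {
      int runtime = 0;
      // cudaRuntimeGetVersion() reports 1000 * major + 10 * minor, e.g. 11050 for 11.5
      if (cudaRuntimeGetVersion(&runtime) != cudaSuccess) { return 1; }
      std::printf("CUDA runtime %d.%d -> runtime JIT of __int128 %s\n",
                  runtime / 1000,
                  (runtime % 1000) / 10,
                  runtime >= 11050 ? "supported" : "unsupported");
      return 0;
    }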
--- cpp/tests/binaryop/binop-generic-ptx-test.cpp | 6 ++++ cpp/tests/binaryop/binop-integration-test.cpp | 6 ++++ cpp/tests/binaryop/binop-null-test.cpp | 8 +++++ cpp/tests/binaryop/util/runtime_support.h | 33 +++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 cpp/tests/binaryop/util/runtime_support.h diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 6e35bdac41c..0b6cfdab498 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -21,11 +21,17 @@ #include #include +#include namespace cudf { namespace test { namespace binop { struct BinaryOperationGenericPTXTest : public BinaryOperationTest { + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } }; TEST_F(BinaryOperationGenericPTXTest, CAdd_Vector_Vector_FP32_FP32_FP32) diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp index 427a21512a3..21696a419ee 100644 --- a/cpp/tests/binaryop/binop-integration-test.cpp +++ b/cpp/tests/binaryop/binop-integration-test.cpp @@ -31,6 +31,7 @@ #include #include +#include #include "cudf/utilities/error.hpp" namespace cudf { @@ -40,6 +41,11 @@ namespace binop { constexpr debug_output_level verbosity{debug_output_level::ALL_ERRORS}; struct BinaryOperationIntegrationTest : public BinaryOperationTest { + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } }; TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64) diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp index 25ec3b30834..b7e7702bd6c 100644 --- a/cpp/tests/binaryop/binop-null-test.cpp +++ b/cpp/tests/binaryop/binop-null-test.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf { namespace test { namespace binop { @@ -52,6 +54,12 @@ struct BinaryOperationNullTest : public BinaryOperationTest { default: CUDF_FAIL("Unknown mask state " + std::to_string(static_cast(state))); } } + + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } }; // namespace binop TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid) diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h new file mode 100644 index 00000000000..b6cfdadee3c --- /dev/null +++ b/cpp/tests/binaryop/util/runtime_support.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Copyright 2018-2019 BlazingDB, Inc. + * Copyright 2018 Christian Noboa Mardini + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + + +bool can_do_runtime_jit() +{ + // We require a CUDA NVRTC of 11.5+ to do runtime jit + // as we need support for __int128 + + int runtime = 0; + auto error_value = cudaRuntimeGetVersion(&runtime); + return (error_value == cudaSuccess) && (runtime >= 11050); +} From dfcb48d09a56daa226ec5962acb00ddcaadaf494 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 23 Nov 2021 16:22:08 -0500 Subject: [PATCH 32/72] Fix style issues found by CI --- cpp/tests/binaryop/binop-generic-ptx-test.cpp | 2 +- cpp/tests/binaryop/binop-integration-test.cpp | 2 +- cpp/tests/binaryop/util/runtime_support.h | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 0b6cfdab498..f4407834786 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -20,8 +20,8 @@ #include #include -#include #include +#include namespace cudf { namespace test { diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp index 21696a419ee..4181b20220b 100644 --- a/cpp/tests/binaryop/binop-integration-test.cpp +++ b/cpp/tests/binaryop/binop-integration-test.cpp @@ -30,8 +30,8 @@ #include #include -#include #include +#include #include "cudf/utilities/error.hpp" namespace cudf { diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h index b6cfdadee3c..a7ee0c3a391 100644 --- a/cpp/tests/binaryop/util/runtime_support.h +++ b/cpp/tests/binaryop/util/runtime_support.h @@ -21,13 +21,12 @@ #include - -bool can_do_runtime_jit() +inline bool can_do_runtime_jit() { // We require a CUDA NVRTC of 11.5+ to do runtime jit // as we need support for __int128 - int runtime = 0; + int runtime = 0; auto error_value = cudaRuntimeGetVersion(&runtime); return (error_value == cudaSuccess) && (runtime >= 11050); } From bbf137eb16ba69e14de9924acd9ca47997da0324 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 24 Nov 2021 09:58:53 -0500 Subject: [PATCH 33/72] WIP: disable csv test --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 98bade7e15f..39ca11d8184 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -192,7 +192,7 @@ ConfigureTest( # * io tests -------------------------------------------------------------------------------------- ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) -ConfigureTest(CSV_TEST io/csv_test.cpp) +# ConfigureTest(CSV_TEST io/csv_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) From a24d2a841e9ab0b94f39418790b4f4b1d88234ff Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 24 Nov 2021 11:15:22 -0500 Subject: [PATCH 34/72] WIP: disable all io tests --- cpp/tests/CMakeLists.txt | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 39ca11d8184..f60a2361752 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -192,15 +192,11 @@ ConfigureTest( # * io tests -------------------------------------------------------------------------------------- ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) -# ConfigureTest(CSV_TEST io/csv_test.cpp) -ConfigureTest(ORC_TEST io/orc_test.cpp) -ConfigureTest(PARQUET_TEST 
io/parquet_test.cpp)
-ConfigureTest(JSON_TEST io/json_test.cpp)
-ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
-ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
-if(CUDF_ENABLE_ARROW_S3)
-  target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
-endif()
+# ConfigureTest(CSV_TEST io/csv_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp)
+# ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp)
+# ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST
+# io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3)
+# target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif()
 
 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------

From f6143952bf27ccd3c29c7b015099d9db0fdf0528 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan
Date: Thu, 25 Nov 2021 23:06:33 +0530
Subject: [PATCH 35/72] remove jit integration tests

---
 cpp/tests/CMakeLists.txt                      |    1 -
 .../binop-compiled-fixed_point-test.cpp       |   40 +
 cpp/tests/binaryop/binop-compiled-test.cpp    |    2 +
 cpp/tests/binaryop/binop-integration-test.cpp | 2722 -----------------
 cpp/tests/binaryop/binop-null-test.cpp        |   32 +-
 .../binaryop/binop-verify-input-test.cpp      |   12 +-
 6 files changed, 64 insertions(+), 2745 deletions(-)
 delete mode 100644 cpp/tests/binaryop/binop-integration-test.cpp

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f60a2361752..8ae31d7d74d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -164,7 +164,6 @@ ConfigureTest(
   BINARY_TEST
   binaryop/binop-verify-input-test.cpp
   binaryop/binop-null-test.cpp
-  binaryop/binop-integration-test.cpp
   binaryop/binop-compiled-test.cpp
   binaryop/binop-compiled-fixed_point-test.cpp
   binaryop/binop-generic-ptx-test.cpp
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index 7925f0dd618..5020fbf898b 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -684,4 +684,44 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows)
     cudf::logic_error);
 }
 
+template <typename T>
+struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture {
+};
+
+using Decimal64And128Types = cudf::test::Types<decimal64, decimal128>;
+TYPED_TEST_SUITE(FixedPointTest_64_128_Reps, Decimal64And128Types);
+
+TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  for (auto const rhs_value : {10000000000000000, 100000000000000000}) {
+    auto const lhs       = fp_wrapper<RepType>{{33041, 97290, 36438, 25379, 48473}, scale_type{2}};
+    auto const rhs       = make_fixed_point_scalar<decimalXX>(rhs_value, scale_type{0});
+    auto const trues     = wrapper<bool>{{1, 1, 1, 1, 1}};
+    auto const falses    = wrapper<bool>{{0, 0, 0, 0, 0}};
+    auto const bool_type = cudf::data_type{type_id::BOOL8};
+
+    auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type);
+    auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
+    auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
+    auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type);
+    auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
+    auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
+  }
+}
+
 } // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 7a9f6135bcd..37212c30d80 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -679,3 +679,5 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector)
 }
 
 } // namespace cudf::test::binop
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
deleted file mode 100644
index 4181b20220b..00000000000
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ /dev/null
@@ -1,2722 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include "cudf/utilities/error.hpp" - -namespace cudf { -namespace test { -namespace binop { - -constexpr debug_output_level verbosity{debug_output_level::ALL_ERRORS}; - -struct BinaryOperationIntegrationTest : public BinaryOperationTest { - protected: - void SetUp() override - { - if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } - } -}; - -TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64) -{ - using TypeOut = int32_t; - using TypeLhs = float; - using TypeRhs = int64_t; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_scalar(); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP32_FP32) -{ - using TypeOut = int32_t; - using TypeLhs = float; - using TypeRhs = float; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_column(10000); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_SI32_FP32_FP32) -{ - using TypeOut = int32_t; - using TypeLhs = float; - using TypeRhs = int64_t; - - using SUB = cudf::library::operation::Sub; - - auto lhs = make_random_wrapped_scalar(); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_SI08_SI16_SI32) -{ - using TypeOut = int8_t; - using TypeLhs = int16_t; - using TypeRhs = int32_t; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_scalar(); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP64_SI08) -{ - using TypeOut = int32_t; - using TypeLhs = double; - using TypeRhs = int8_t; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using SUB = cudf::library::operation::Sub; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_SI64_FP64_SI32) -{ - using TypeOut = int64_t; - using TypeLhs = double; - using TypeRhs = int32_t; - - using SUB = cudf::library::operation::Sub; - - auto lhs = make_random_wrapped_column(10000); - auto rhs = make_random_wrapped_scalar(); - - auto out = 
cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_TimepointD_DurationS_TimepointUS) -{ - using TypeOut = cudf::timestamp_us; - using TypeLhs = cudf::timestamp_D; - using TypeRhs = cudf::duration_s; - - using SUB = cudf::library::operation::Sub; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_TimepointD_TimepointS_DurationS) -{ - using TypeOut = cudf::duration_s; - using TypeLhs = cudf::timestamp_D; - using TypeRhs = cudf::timestamp_s; - - using SUB = cudf::library::operation::Sub; - - auto lhs = make_random_wrapped_column(100); - auto rhs = cudf::scalar_type_t(typename TypeRhs::duration{34}, true); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_DurationS_DurationD_DurationMS) -{ - using TypeOut = cudf::duration_ms; - using TypeLhs = cudf::duration_s; - using TypeRhs = cudf::duration_D; - - using SUB = cudf::library::operation::Sub; - - auto lhs = cudf::scalar_type_t(TypeLhs{-9}); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SUB()); -} - -TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using MUL = cudf::library::operation::Mul; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MUL()); -} - -TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64_FP32_FP32) -{ - using TypeOut = int64_t; - using TypeLhs = float; - using TypeRhs = float; - - using MUL = cudf::library::operation::Mul; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MUL()); -} - -TEST_F(BinaryOperationIntegrationTest, Mul_Scalar_Vector_SI32_DurationD_DurationMS) -{ - // Double the duration of days and convert the time interval to ms - using TypeOut = cudf::duration_ms; - using TypeLhs = int32_t; - using TypeRhs = cudf::duration_D; - - using MUL = cudf::library::operation::Mul; - - auto lhs = cudf::scalar_type_t(2); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MUL()); -} - -TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_DurationS_SI32_DurationNS) -{ - // Multiple each duration with some random value and promote the result - using TypeOut = cudf::duration_ns; - using TypeLhs = cudf::duration_s; - using TypeRhs = int32_t; - - using MUL = cudf::library::operation::Mul; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, 
rhs, cudf::binary_operator::MUL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MUL()); -} - -TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using DIV = cudf::library::operation::Div; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, DIV()); -} - -TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64_FP32_FP32) -{ - using TypeOut = int64_t; - using TypeLhs = float; - using TypeRhs = float; - - using DIV = cudf::library::operation::Div; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, DIV()); -} - -TEST_F(BinaryOperationIntegrationTest, Div_Scalar_Vector_DurationD_SI32_DurationS) -{ - using TypeOut = cudf::duration_s; - using TypeLhs = cudf::duration_D; - using TypeRhs = int64_t; - - using DIV = cudf::library::operation::Div; - - // Divide 2 days by an integer and convert the ticks to seconds - auto lhs = cudf::scalar_type_t(TypeLhs{2}); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, DIV()); -} - -TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_DurationD_DurationS_DurationMS) -{ - using TypeOut = int64_t; - using TypeLhs = cudf::duration_D; - using TypeRhs = cudf::duration_s; - - using DIV = cudf::library::operation::Div; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, DIV()); -} - -TEST_F(BinaryOperationIntegrationTest, TrueDiv_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using TRUEDIV = cudf::library::operation::TrueDiv; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::TRUE_DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, TRUEDIV()); -} - -TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using FLOORDIV = cudf::library::operation::FloorDiv; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, FLOORDIV()); -} - -TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64_FP32_FP32) -{ - using TypeOut = int64_t; - using TypeLhs = float; - using TypeRhs = float; - - using FLOORDIV = cudf::library::operation::FloorDiv; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, FLOORDIV()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = 
int64_t; - using TypeRhs = int64_t; - - using MOD = cudf::library::operation::Mod; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP32) -{ - using TypeOut = float; - using TypeLhs = float; - using TypeRhs = float; - - using MOD = cudf::library::operation::Mod; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64_FP32_FP32) -{ - using TypeOut = int64_t; - using TypeLhs = float; - using TypeRhs = float; - - using MOD = cudf::library::operation::Mod; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP64) -{ - using TypeOut = double; - using TypeLhs = double; - using TypeRhs = double; - - using MOD = cudf::library::operation::Mod; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationD_SI32_DurationUS) -{ - using TypeOut = cudf::duration_us; - using TypeLhs = cudf::duration_D; - using TypeRhs = int64_t; - - using MOD = cudf::library::operation::Mod; - - // Half the number of days and convert the remainder ticks to microseconds - auto lhs = make_random_wrapped_column(100); - auto rhs = cudf::scalar_type_t(2); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationS_DurationMS_DurationUS) -{ - using TypeOut = cudf::duration_us; - using TypeLhs = cudf::duration_s; - using TypeRhs = cudf::duration_ms; - - using MOD = cudf::library::operation::Mod; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, MOD()); -} - -TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP64_SI64_SI64) -{ - using TypeOut = double; - using TypeLhs = int64_t; - using TypeRhs = int64_t; - - using POW = cudf::library::operation::Pow; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); - - /** - * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision - * Mathematical Standard Library Functions with Maximum ULP Error' - * The pow function has 2 (full range) maximum ulp error. 
- */ - ASSERT_BINOP(*out, lhs, rhs, POW(), NearEqualComparator{2}); -} - -TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP32) -{ - using TypeOut = float; - using TypeLhs = float; - using TypeRhs = float; - - using POW = cudf::library::operation::Pow; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); - /** - * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision - * Mathematical Standard Library Functions with Maximum ULP Error' - * The pow function has 2 (full range) maximum ulp error. - */ - ASSERT_BINOP(*out, lhs, rhs, POW(), NearEqualComparator{2}); -} - -TEST_F(BinaryOperationIntegrationTest, And_Vector_Vector_SI16_SI64_SI32) -{ - using TypeOut = int16_t; - using TypeLhs = int64_t; - using TypeRhs = int32_t; - - using AND = cudf::library::operation::BitwiseAnd; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::BITWISE_AND, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, AND()); -} - -TEST_F(BinaryOperationIntegrationTest, Or_Vector_Vector_SI64_SI16_SI32) -{ - using TypeOut = int64_t; - using TypeLhs = int16_t; - using TypeRhs = int32_t; - - using OR = cudf::library::operation::BitwiseOr; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::BITWISE_OR, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, OR()); -} - -TEST_F(BinaryOperationIntegrationTest, Xor_Vector_Vector_SI32_SI16_SI64) -{ - using TypeOut = int32_t; - using TypeLhs = int16_t; - using TypeRhs = int64_t; - - using XOR = cudf::library::operation::BitwiseXor; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::BITWISE_XOR, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, XOR()); -} - -TEST_F(BinaryOperationIntegrationTest, Logical_And_Vector_Vector_SI16_FP64_SI8) -{ - using TypeOut = int16_t; - using TypeLhs = double; - using TypeRhs = int8_t; - - using AND = cudf::library::operation::LogicalAnd; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LOGICAL_AND, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, AND()); -} - -TEST_F(BinaryOperationIntegrationTest, Logical_Or_Vector_Vector_B8_SI16_SI64) -{ - using TypeOut = bool; - using TypeLhs = int16_t; - using TypeRhs = int64_t; - - using OR = cudf::library::operation::LogicalOr; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LOGICAL_OR, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, OR()); -} - -TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_TSS_TSS) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_s; - using TypeRhs = cudf::timestamp_s; - - using LESS = cudf::library::operation::Less; - - auto lhs = make_random_wrapped_scalar(); - auto rhs = make_random_wrapped_column(10); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, 
rhs, LESS()); -} - -TEST_F(BinaryOperationIntegrationTest, Greater_Scalar_Vector_B8_TSMS_TSS) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_ms; - using TypeRhs = cudf::timestamp_s; - - using GREATER = cudf::library::operation::Greater; - - auto lhs = make_random_wrapped_scalar(); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, GREATER()); -} - -TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_TSS_TSS) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_s; - using TypeRhs = cudf::timestamp_s; - - using LESS = cudf::library::operation::Less; - - auto lhs = make_random_wrapped_column(10); - auto rhs = make_random_wrapped_column(10); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LESS()); -} - -TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_TSMS_TSS) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_ms; - using TypeRhs = cudf::timestamp_s; - - using GREATER = cudf::library::operation::Greater; - - cudf::test::UniformRandomGenerator rand_gen(1, 10); - auto itr = cudf::detail::make_counting_transform_iterator( - 0, [&rand_gen](auto row) { return rand_gen.generate() * 1000; }); - - cudf::test::fixed_width_column_wrapper lhs( - itr, itr + 100, make_validity_iter()); - - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, GREATER()); -} - -TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using LESS = cudf::library::operation::Less; - - auto lhs = cudf::string_scalar("eee"); - auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LESS()); -} - -TEST_F(BinaryOperationIntegrationTest, Less_Vector_Scalar_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using LESS = cudf::library::operation::Less; - - auto lhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto rhs = cudf::string_scalar("eee"); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LESS()); -} - -TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using LESS = cudf::library::operation::Less; - - auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LESS()); -} - -TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using GREATER = cudf::library::operation::Greater; - - auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - 
auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, GREATER()); -} - -TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using EQUAL = cudf::library::operation::Equal; - - auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, EQUAL()); -} - -TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Scalar_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using EQUAL = cudf::library::operation::Equal; - - auto rhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - auto lhs = cudf::string_scalar(""); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, EQUAL()); -} - -TEST_F(BinaryOperationIntegrationTest, LessEqual_Vector_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using LESS_EQUAL = cudf::library::operation::LessEqual; - - auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LESS_EQUAL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LESS_EQUAL()); -} - -TEST_F(BinaryOperationIntegrationTest, GreaterEqual_Vector_Vector_B8_STR_STR) -{ - using TypeOut = bool; - using TypeLhs = std::string; - using TypeRhs = std::string; - - using GREATER_EQUAL = cudf::library::operation::GreaterEqual; - - auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::GREATER_EQUAL, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, GREATER_EQUAL()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_LEFT = cudf::library::operation::ShiftLeft; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32_SI16_SI64) -{ - using TypeOut = int; - using TypeLhs = int16_t; - using TypeRhs = int64_t; - - using SHIFT_LEFT = cudf::library::operation::ShiftLeft; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); - - 
ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Scalar_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_LEFT = cudf::library::operation::ShiftLeft; - - auto lhs = make_random_wrapped_scalar(); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Scalar_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_LEFT = cudf::library::operation::ShiftLeft; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_scalar(); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_RIGHT = cudf::library::operation::ShiftRight; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32_SI16_SI64) -{ - using TypeOut = int; - using TypeLhs = int16_t; - using TypeRhs = int64_t; - - using SHIFT_RIGHT = cudf::library::operation::ShiftRight; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRight_Scalar_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_RIGHT = cudf::library::operation::ShiftRight; - - auto lhs = make_random_wrapped_scalar(); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Scalar_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_RIGHT = cudf::library::operation::ShiftRight; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_scalar(); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - int num_els = 4; - - 
TypeLhs lhs[] = {-8, 78, -93, 0, -INT_MAX}; - cudf::test::fixed_width_column_wrapper lhs_w(lhs, lhs + num_els); - - TypeRhs shift[] = {1, 1, 3, 2, 16}; - cudf::test::fixed_width_column_wrapper shift_w(shift, shift + num_els); - - TypeOut expected[] = {2147483644, 39, 536870900, 0, 32768}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); - - auto out = cudf::jit::binary_operation( - lhs_w, shift_w, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_w); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32_SI16_SI64) -{ - using TypeOut = int; - using TypeLhs = int16_t; - using TypeRhs = int64_t; - - using SHIFT_RIGHT_UNSIGNED = - cudf::library::operation::ShiftRightUnsigned; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_RIGHT_UNSIGNED = - cudf::library::operation::ShiftRightUnsigned; - - auto lhs = make_random_wrapped_scalar(); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Scalar_SI32) -{ - using TypeOut = int; - using TypeLhs = int; - using TypeRhs = int; - - using SHIFT_RIGHT_UNSIGNED = - cudf::library::operation::ShiftRightUnsigned; - - auto lhs = make_random_wrapped_column(100); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_scalar(); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); -} - -TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Scalar_SI32_SI32_float) -{ - using TypeOut = int; // Cast the result value to int for easy comparison - using TypeLhs = int32_t; // All input types get converted into doubles - using TypeRhs = float; - - using LOG_BASE = cudf::library::operation::LogBase; - - // Make sure there are no zeros. 
The log value is purposefully cast to int for easy comparison - auto elements = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return i + 10; }); - fixed_width_column_wrapper lhs(elements, elements + 100); - // Find log to the base 10 - auto rhs = numeric_scalar(10); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); -} - -TEST_F(BinaryOperationIntegrationTest, LogBase_Scalar_Vector_float_SI32) -{ - using TypeOut = float; - using TypeLhs = int; - using TypeRhs = int; // Integral types promoted to double - - using LOG_BASE = cudf::library::operation::LogBase; - - // Make sure there are no zeros - auto elements = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return i + 30; }); - fixed_width_column_wrapper rhs(elements, elements + 100); - // Find log to the base 2 - auto lhs = numeric_scalar(2); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); -} - -TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Vector_double_SI64_SI32) -{ - using TypeOut = double; - using TypeLhs = int64_t; - using TypeRhs = int32_t; // Integral types promoted to double - - using LOG_BASE = cudf::library::operation::LogBase; - - // Make sure there are no zeros - auto elements = - cudf::detail::make_counting_transform_iterator(1, [](auto i) { return std::pow(2, i); }); - fixed_width_column_wrapper lhs(elements, elements + 50); - - // Find log to the base 7 - auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; }); - fixed_width_column_wrapper rhs(rhs_elements, rhs_elements + 50); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_SI32_SI32) -{ - using TypeOut = bool; - using TypeLhs = int32_t; - using TypeRhs = int32_t; - - auto int_col = - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX}, {true, true, true, false}}; - auto int_scalar = cudf::scalar_type_t(999); - - auto op_col = cudf::jit::binary_operation( - int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{true, false, false, false}, {true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_ScalarInvalid_B8_SI32_SI32) -{ - using TypeOut = bool; - using TypeLhs = int32_t; - using TypeRhs = int32_t; - - auto int_col = fixed_width_column_wrapper{{-INT32_MAX, -37, 0, 499, 44, INT32_MAX}, - {false, true, false, true, true, false}}; - auto int_scalar = cudf::scalar_type_t(999); - int_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, - fixed_width_column_wrapper{ - {true, false, true, false, false, true}, - {true, true, true, true, true, true}, - }, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_tsD_tsD) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_D; - using TypeRhs = cudf::timestamp_D; - - cudf::test::fixed_width_column_wrapper ts_col{ - { - 999, // Random nullable field - 0, // 
This is the UNIX epoch - 1970-01-01 - 44376, // 2091-07-01 00:00:00 GMT - 47695, // 2100-08-02 00:00:00 GMT - 3, // Random nullable field - 66068, // 2150-11-21 00:00:00 GMT - 22270, // 2030-12-22 00:00:00 GMT - 111, // Random nullable field - }, - {false, true, true, true, false, true, true, false}}; - auto ts_scalar = cudf::scalar_type_t(typename TypeRhs::duration{44376}, true); - - auto op_col = cudf::jit::binary_operation( - ts_scalar, ts_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, - fixed_width_column_wrapper{ - {false, false, true, false, false, false, false, false}, - {true, true, true, true, true, true, true, true}, - }, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_EmptyString) -{ - using TypeOut = bool; - - auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {true, false, true, true, true, false, true}); - // Empty string - cudf::string_scalar str_scalar(""); - - auto op_col = cudf::jit::binary_operation( - str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, true, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_ValidString) -{ - using TypeOut = bool; - - auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {true, false, true, true, true, false, true}); - // Match a valid string - cudf::string_scalar str_scalar(""); - - auto op_col = cudf::jit::binary_operation( - str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, true, false, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_NoMatch) -{ - using TypeOut = bool; - - // Try with non nullable input - auto str_col = - cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); - // Matching a string that isn't present - cudf::string_scalar str_scalar("foo"); - - auto op_col = cudf::jit::binary_operation( - str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, false, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_NullNonNull) -{ - using TypeOut = bool; - - // Try with all invalid input - auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {true, true, true, true, true, true, true}); - // Matching a scalar that is invalid - cudf::string_scalar str_scalar("foo"); - str_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, false, false, false, false}, - {true, true, true, true, true, true, 
true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_NullNonNull) -{ - using TypeOut = bool; - - // Try with all invalid input - auto str_col = - cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {false, false, false, false, false, false, false}); - // Matching a scalar that is valid - cudf::string_scalar str_scalar("foo"); - - auto op_col = cudf::jit::binary_operation( - str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, false, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_NullNull) -{ - using TypeOut = bool; - - // Try with all invalid input - auto str_col = - cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {false, false, false, false, false, false, false}); - // Matching a scalar that is invalid - cudf::string_scalar str_scalar("foo"); - str_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{true, true, true, true, true, true, true}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_MatchInvalid) -{ - using TypeOut = bool; - - auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {true, false, true, true, true, false, true}); - // Matching an invalid string - cudf::string_scalar str_scalar("bb"); - - auto op_col = cudf::jit::binary_operation( - str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, false, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_InvalidScalar_B8_string_string) -{ - using TypeOut = bool; - - auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}, - {true, false, true, true, true, false, true}); - // Valid string invalidated - cudf::string_scalar str_scalar("bb"); - str_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, true, false, false, false, true, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_tsD_tsD_NonNullable) -{ - using TypeOut = bool; - using TypeLhs = cudf::timestamp_D; - using TypeRhs = cudf::timestamp_D; - - cudf::test::fixed_width_column_wrapper lhs_col{ - 0, // This is the UNIX epoch - 1970-01-01 - 44376, // 2091-07-01 00:00:00 GMT - 47695, // 2100-08-02 00:00:00 GMT - 66068, // 2150-11-21 00:00:00 GMT - 22270, // 2030-12-22 00:00:00 GMT - }; - ASSERT_EQ(column_view{lhs_col}.nullable(), false); - cudf::test::fixed_width_column_wrapper rhs_col{ - 0, // This is the UNIX epoch - 1970-01-01 - 44380, 
// Mismatched - 47695, // 2100-08-02 00:00:00 GMT - 66070, // Mismatched - 22270, // 2030-12-22 00:00:00 GMT - }; - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, - fixed_width_column_wrapper{ - {true, false, true, false, true}, - {true, true, true, true, true}, - }, - verbosity); -} - -// Both vectors with mixed validity -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixMix) -{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {true, false, true, true, true, false, true}); - auto rhs_col = - cudf::test::strings_column_wrapper({"foo", "valid", "", "", "invalid", "inv", "ééé"}, - {true, true, true, true, false, false, true}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, true, true, false, true, true}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixValid) -{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {true, false, true, true, true, false, true}); - auto rhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{true, false, true, true, true, false, true}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixInvalid) -{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {true, false, true, true, true, false, true}); - auto rhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {false, false, false, false, false, false, false}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, true, false, false, false, true, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_ValidValid) -{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - auto rhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{true, true, true, true, true, true, true}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_ValidInvalid) 
-{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - auto rhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {false, false, false, false, false, false, false}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{false, false, false, false, false, false, false}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_InvalidInvalid) -{ - using TypeOut = bool; - - auto lhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {false, false, false, false, false, false, false}); - auto rhs_col = - cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, - {false, false, false, false, false, false, false}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{true, true, true, true, true, true, true}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_VectorAllInvalid_B8_SI32_SI32) -{ - using TypeOut = bool; - using TypeLhs = int32_t; - - auto lhs_col = fixed_width_column_wrapper{{-INT32_MAX, -37, 0, 499, 44, INT32_MAX}, - {false, false, false, false, false, false}}; - auto rhs_col = fixed_width_column_wrapper{{-47, 37, 12, 99, 4, -INT32_MAX}, - {false, false, false, false, false, false}}; - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, - fixed_width_column_wrapper{ - {true, true, true, true, true, true}, - {true, true, true, true, true, true}, - }, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_SI8) -{ - using TypeOut = int64_t; - using TypeLhs = int32_t; - using TypeRhs = int8_t; - - auto int_col = fixed_width_column_wrapper{ - {999, -37, 0, INT32_MAX}, - }; - auto int_scalar = cudf::scalar_type_t(77); - - auto op_col = cudf::jit::binary_operation( - int_col, int_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{77, -37, 0, 77}, {true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_FP64_SI32_SI64) -{ - using TypeOut = double; - using TypeLhs = int32_t; - using TypeRhs = int64_t; - - auto int_col = - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, - {false, true, false, true, false, true, false}}; - auto int_scalar = cudf::scalar_type_t(INT32_MAX); - - auto op_col = cudf::jit::binary_operation( - int_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{ - {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, 
NullAwareMin_Vector_Scalar_SI64_SI32_FP32) -{ - using TypeOut = int64_t; - using TypeLhs = int32_t; - using TypeRhs = float; - - auto int_col = - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, - {false, true, false, true, false, true, false}}; - auto float_scalar = cudf::scalar_type_t(-3.14f); - float_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - int_col, float_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{0, -37, 0, INT32_MAX, 0, -4379, 0}, - {false, true, false, true, false, true, false}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_SI8_SI8_FP32) -{ - using TypeOut = int8_t; - using TypeLhs = int8_t; - using TypeRhs = float; - - auto int_col = fixed_width_column_wrapper{ - {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}}; - auto float_scalar = cudf::scalar_type_t(-3.14f); - float_scalar.set_valid_async(false); - - auto op_col = cudf::jit::binary_operation( - float_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{0, 0, 0, 0, 0, 0, 0}, - {false, false, false, false, false, false, false}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_SI64_SI32_SI8) -{ - using TypeOut = int64_t; - using TypeLhs = int32_t; - - auto int_col = - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, - {false, false, false, false, false, false, false}}; - auto another_int_col = fixed_width_column_wrapper{ - {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}}; - - auto op_col = cudf::jit::binary_operation( - int_col, another_int_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{0, 0, 0, 0, 0, 0, 0}, - {false, false, false, false, false, false, false}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI64_SI32_SI8) -{ - using TypeOut = int64_t; - using TypeLhs = int32_t; - - auto int_col = fixed_width_column_wrapper{ - {999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, {true, true, true, true, true, true, true}}; - auto another_int_col = fixed_width_column_wrapper{ - {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}}; - - auto op_col = cudf::jit::binary_operation( - int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, - {true, true, true, true, true, true, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_tsD_tsD_tsD) -{ - cudf::test::fixed_width_column_wrapper lhs_col{ - { - 0, // This is the UNIX epoch - 1970-01-01 - 44376, // 2091-07-01 00:00:00 GMT - 47695, // 2100-08-02 00:00:00 GMT - 66068, // 2150-11-21 00:00:00 GMT - 22270, // 2030-12-22 00:00:00 GMT - }, - {true, false, true, true, false}}; - cudf::test::fixed_width_column_wrapper rhs_col{ - { - 0, // This is the UNIX epoch - 1970-01-01 - 44380, // Mismatched - 47695, // 2100-08-02 00:00:00 GMT - 66070, // Mismatched - 22270, // 2030-12-22 00:00:00 GMT - }, - {false, true, 
true, true, false}}; - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{ - {0, 44380, 47695, 66068, 0}, {true, true, true, true, false}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI32_SI64_SI8) -{ - using TypeOut = int32_t; - using TypeLhs = int64_t; - - auto int_col = - fixed_width_column_wrapper{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, - {false, false, false, false, false, false, false}}; - auto another_int_col = fixed_width_column_wrapper{ - {9, -37, 0, 32, -47, -4, 55}, {true, false, true, false, true, false, true}}; - - auto op_col = cudf::jit::binary_operation( - int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); - - // Every row has a value - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *op_col, - fixed_width_column_wrapper{{9, 0, 0, 0, -47, 0, 55}, - {true, false, true, false, true, false, true}}, - verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_string_string_string_Mix) -{ - auto lhs_col = cudf::test::strings_column_wrapper( - {"eee", "invalid", "", "", "", "", "ééé", "foo", "bar", "abc", "def"}, - {false, true, true, false, true, true, true, false, false, true, true}); - auto rhs_col = cudf::test::strings_column_wrapper( - {"eee", "goo", "", "", "", "", "ééé", "bar", "foo", "def", "abc"}, - {false, true, true, true, false, true, true, false, false, true, true}); - - auto op_col = cudf::jit::binary_operation( - lhs_col, rhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING}); - - auto exp_col = cudf::test::strings_column_wrapper( - {"", "invalid", "", "", "", "", "ééé", "", "", "def", "def"}, - {false, true, true, true, true, true, true, false, false, true, true}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_string_string_string_Mix) -{ - auto lhs_col = cudf::test::strings_column_wrapper( - {"eee", "invalid", "", "", "", "", "ééé", "foo", "bar", "abc", "foo"}, - {false, true, true, false, true, true, true, false, false, true, true}); - cudf::string_scalar str_scalar("foo"); - - // Returns a non-nullable column as all elements are valid - it will have the scalar - // value at the very least - auto op_col = cudf::jit::binary_operation( - lhs_col, str_scalar, cudf::binary_operator::NULL_MIN, data_type{type_id::STRING}); - - auto exp_col = cudf::test::strings_column_wrapper( - {"foo", "foo", "", "foo", "", "", "foo", "foo", "foo", "abc", "foo"}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity); -} - -TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_string_string_string_Mix) -{ - auto lhs_col = cudf::test::strings_column_wrapper( - {"eee", "invalid", "", "", "", "", "ééé", "foo", "bar", "abc", "foo"}, - {false, true, true, false, true, true, true, false, false, true, true}); - cudf::string_scalar str_scalar("foo"); - str_scalar.set_valid_async(false); - - // Returns the lhs_col - auto op_col = cudf::jit::binary_operation( - str_scalar, lhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING}); - - auto exp_col = cudf::test::strings_column_wrapper( - {"", "invalid", "", "", "", "", "ééé", "", "", "abc", "foo"}, - {false, true, true, false, true, true, true, false, false, true, true}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity); -} 
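// The NULL_EQUALS / NULL_MIN / NULL_MAX tests above all exercise the same
// null-aware contract: a null operand is skipped rather than propagated, the
// result is null only when both operands are null, and for NULL_EQUALS two
// nulls compare equal. A minimal host-side sketch of that contract, written
// purely for illustration (this is not cudf's device implementation):
#include <algorithm>
#include <optional>

template <typename T>
std::optional<T> null_aware_min(std::optional<T> a, std::optional<T> b)
{
  if (!a) return b;         // null lhs: fall back to rhs (valid or null)
  if (!b) return a;         // null rhs: fall back to lhs
  return std::min(*a, *b);  // both valid: ordinary min
}

template <typename T>
bool null_aware_equal(std::optional<T> const& a, std::optional<T> const& b)
{
  if (!a || !b) return !a && !b;  // two nulls are equal; null vs. value is not
  return *a == *b;                // both valid: ordinary equality
}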
- -TEST_F(BinaryOperationIntegrationTest, CastAdd_Vector_Vector_SI32_float_float) -{ - using TypeOut = int32_t; - using TypeLhs = float; - using TypeRhs = float; // Integral types promoted to double - - using ADD = cudf::library::operation::Add; - - auto lhs = cudf::test::fixed_width_column_wrapper{1.3f, 1.6f}; - auto rhs = cudf::test::fixed_width_column_wrapper{1.3f, 1.6f}; - auto expected = cudf::test::fixed_width_column_wrapper{2, 3}; - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_TimepointD_DurationS_TimepointUS) -{ - using TypeOut = cudf::timestamp_us; - using TypeLhs = cudf::timestamp_D; - using TypeRhs = cudf::duration_s; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_column(100); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_DurationD_TimepointS_TimepointS) -{ - using TypeOut = cudf::timestamp_s; - using TypeLhs = cudf::duration_D; - using TypeRhs = cudf::timestamp_s; - - using ADD = cudf::library::operation::Add; - - auto lhs = make_random_wrapped_column(100); - auto rhs = cudf::scalar_type_t(typename TypeRhs::duration{34}, true); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_DurationS_DurationD_DurationMS) -{ - using TypeOut = cudf::duration_ms; - using TypeLhs = cudf::duration_s; - using TypeRhs = cudf::duration_D; - - using ADD = cudf::library::operation::Add; - - auto lhs = cudf::scalar_type_t(TypeLhs{-9}); - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, ADD()); -} - -TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI64_SI64_SI32) -{ - using TypeOut = int64_t; - using TypeLhs = int64_t; - using TypeRhs = int32_t; - - using SHIFT_RIGHT_UNSIGNED = - cudf::library::operation::ShiftRightUnsigned; - - auto lhs = cudf::scalar_type_t(-12); - // this generates values in the range 1-10 which should be reasonable for the shift - auto rhs = make_random_wrapped_column(100); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Scalar_Vector_FP32) -{ - using TypeOut = float; - using TypeLhs = float; - using TypeRhs = float; - - auto lhs = cudf::scalar_type_t(-86099.68377); - auto rhs = fixed_width_column_wrapper{{90770.74881, -15456.4335, 32213.22119}}; - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - auto expected_result = - fixed_width_column_wrapper{{4671.0625, -8817.51953125, 10539.974609375}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Scalar_FP64) -{ - using TypeOut = double; - using TypeLhs = double; - using TypeRhs = double; - - auto lhs = fixed_width_column_wrapper{{90770.74881, -15456.4335, 32213.22119}}; - auto rhs = 
cudf::scalar_type_t(-86099.68377); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - auto expected_result = fixed_width_column_wrapper{ - {4671.0650400000013178, -15456.433499999999185, 32213.221190000000206}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_FP32_FP64) -{ - using TypeOut = double; - using TypeLhs = float; - using TypeRhs = double; - - auto lhs = fixed_width_column_wrapper{ - {24854.55893, 79946.87288, -86099.68377, -86099.68377, 1.0, 1.0, -1.0, -1.0}}; - auto rhs = fixed_width_column_wrapper{{90770.74881, - -15456.4335, - 36223.96138, - -15456.4335, - 2.1336193413893147E307, - -2.1336193413893147E307, - 2.1336193413893147E307, - -2.1336193413893147E307}}; - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - auto expected_result = fixed_width_column_wrapper{{24854.55859375, - 2664.7075000000040745, - 22572.196640000001935, - -8817.5200000000040745, - 1.0, - 1.0, - 0.0, - 0.0}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_SI32_SI64) -{ - using TypeOut = double; - using TypeLhs = int32_t; - using TypeRhs = int64_t; - - using PMOD = cudf::library::operation::PMod; - - auto lhs = make_random_wrapped_column(1000); - auto rhs = make_random_wrapped_column(1000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, PMOD()); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_SI32_SI64) -{ - using TypeOut = int64_t; - using TypeLhs = int32_t; - using TypeRhs = int64_t; - - using PMOD = cudf::library::operation::PMod; - - auto lhs = make_random_wrapped_column(1000); - auto rhs = make_random_wrapped_column(1000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, PMOD()); -} - -TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_FP64_FP64) -{ - using TypeOut = int64_t; - using TypeLhs = double; - using TypeRhs = double; - - using PMOD = cudf::library::operation::PMod; - - auto lhs = make_random_wrapped_column(1000); - auto rhs = make_random_wrapped_column(1000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); - - ASSERT_BINOP(*out, lhs, rhs, PMOD()); -} - -TEST_F(BinaryOperationIntegrationTest, ATan2_Scalar_Vector_FP32) -{ - using TypeOut = float; - using TypeLhs = float; - using TypeRhs = float; - - using ATAN2 = cudf::library::operation::ATan2; - - auto lhs = make_random_wrapped_scalar(); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); - - // atan2 has a max ULP error of 2 per CUDA programming guide - ASSERT_BINOP(*out, lhs, rhs, ATAN2(), NearEqualComparator{2}); -} - -TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Scalar_FP64) -{ - using TypeOut = double; - using TypeLhs = double; - using TypeRhs = double; - - using ATAN2 = cudf::library::operation::ATan2; - - auto lhs = make_random_wrapped_column(10000); - auto rhs = make_random_wrapped_scalar(); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); - - // atan2 has a max ULP error of 2 per CUDA programming 
guide - ASSERT_BINOP(*out, lhs, rhs, ATAN2(), NearEqualComparator{2}); -} - -TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_FP32_FP64) -{ - using TypeOut = double; - using TypeLhs = float; - using TypeRhs = double; - - using ATAN2 = cudf::library::operation::ATan2; - - auto lhs = make_random_wrapped_column(10000); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); - - // atan2 has a max ULP error of 2 per CUDA programming guide - ASSERT_BINOP(*out, lhs, rhs, ATAN2(), NearEqualComparator{2}); -} - -TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64) -{ - using TypeOut = double; - using TypeLhs = int32_t; - using TypeRhs = int64_t; - - using ATAN2 = cudf::library::operation::ATan2; - - auto lhs = make_random_wrapped_column(10000); - auto rhs = make_random_wrapped_column(10000); - - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); - - // atan2 has a max ULP error of 2 per CUDA programming guide - ASSERT_BINOP(*out, lhs, rhs, ATAN2(), NearEqualComparator{2}); -} - -template -struct FixedPointTestAllReps : public cudf::test::BaseFixture { -}; - -template -using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes); - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd) -{ - using namespace numeric; - using decimalXX = TypeParam; - - auto const sz = std::size_t{1000}; - - auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { - return decimalXX{i, scale_type{0}}; - }); - auto const vec1 = std::vector(begin, begin + sz); - auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); - auto expected = std::vector(sz); - - std::transform(std::cbegin(vec1), - std::cend(vec1), - std::cbegin(vec2), - std::begin(expected), - std::plus()); - - auto const lhs = wrapper(vec1.begin(), vec1.end()); - auto const rhs = wrapper(vec2.begin(), vec2.end()); - auto const expected_col = wrapper(expected.begin(), expected.end()); - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply) -{ - using namespace numeric; - using decimalXX = TypeParam; - - auto const sz = std::size_t{1000}; - - auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { - return decimalXX{i, scale_type{0}}; - }); - auto const vec1 = std::vector(begin, begin + sz); - auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); - auto expected = std::vector(sz); - - std::transform(std::cbegin(vec1), - std::cend(vec1), - std::cbegin(vec2), - std::begin(expected), - std::multiplies()); - - auto const lhs = wrapper(vec1.begin(), vec1.end()); - auto const rhs = wrapper(vec2.begin(), vec2.end()); - auto const expected_col = wrapper(expected.begin(), expected.end()); - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); -} - -template -using fp_wrapper = 
cudf::test::fixed_point_column_wrapper; - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply2) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; - auto const rhs = fp_wrapper{{10, 10, 10, 10, 10}, scale_type{0}}; - auto const expected = fp_wrapper{{110, 220, 330, 440, 550}, scale_type{-1}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; - auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{0}}; - auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{-1}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv2) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; - auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{-2}}; - auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{1}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv3) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; - auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); - auto const expected = fp_wrapper{{0, 2, 4, 5}, scale_type{0}}; - - auto const type = cudf::binary_operation_fixed_point_output_type( - cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv4) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto begin = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 11; }); - auto result_begin = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i * 11) / 12; }); - auto const lhs = fp_wrapper(begin, begin + 1000, scale_type{-1}); - auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); - auto const expected = fp_wrapper(result_begin, result_begin + 1000, scale_type{0}); - - auto const type = cudf::binary_operation_fixed_point_output_type( - cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - 
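// Note (illustrative): for fixed_point division the result scale is
// lhs.scale() - rhs.scale(), here (-1) - (-1) = 0, and the underlying integer
// reps divide with truncation, so row i holds (i * 11) / 12, exactly what
// result_begin generates above.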
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd2) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; - auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; - auto const expected = fp_wrapper{{210, 420, 630, 840, 1050}, scale_type{-2}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd3) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-3}}; - auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; - auto const expected = fp_wrapper{{2100, 4200, 6300, 8400, 10500}, scale_type{-3}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd4) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; - auto const rhs = make_fixed_point_scalar(100, scale_type{-2}); - auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; - - auto const type = cudf::binary_operation_fixed_point_output_type( - cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd5) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = make_fixed_point_scalar(100, scale_type{-2}); - auto const rhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; - auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; - - auto const type = cudf::binary_operation_fixed_point_output_type( - cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd6) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const col = fp_wrapper{{3, 4, 5, 6, 7, 8}, scale_type{0}}; - - auto const expected1 = fp_wrapper{{6, 8, 10, 12, 14, 16}, scale_type{0}}; - auto const expected2 = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; - auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; - auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; - auto const result1 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1); - auto const result2 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, 
result1->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointCast) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const col = fp_wrapper{{6, 8, 10, 12, 14, 16}, scale_type{0}}; - auto const expected = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; - auto const type = cudf::data_type{cudf::type_to_id(), 1}; - auto const result = cudf::cast(col, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiplyScalar) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; - auto const rhs = make_fixed_point_scalar(100, scale_type{-1}); - auto const expected = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-2}}; - - auto const type = cudf::binary_operation_fixed_point_output_type( - cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpSimplePlus) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{150, 200}, scale_type{-2}}; - auto const rhs = fp_wrapper{{2250, 1005}, scale_type{-3}}; - auto const expected = fp_wrapper{{3750, 3005}, scale_type{-3}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, - static_cast(lhs).type(), - static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimple) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const trues = std::vector(4, true); - auto const col1 = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; - auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; - auto const expected = wrapper(trues.begin(), trues.end()); - - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const trues = std::vector(4, true); - auto const col = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; - auto const expected = wrapper(trues.begin(), trues.end()); - - auto const result = - cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0Null) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}}; - auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; - auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); - - 
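// Note (illustrative): plain EQUAL propagates nulls (any null input row
// yields a null output row), unlike NULL_EQUALS. Since col2 is entirely
// null, every output row is null, which is why `expected` carries an
// all-zero validity mask; its stored values are never compared.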
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale2Null) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}}; - auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; - auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const sz = std::size_t{1000}; - - // TESTING binary op ADD - - auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto e) { return e * 1000; }); - auto const vec1 = std::vector(begin, begin + sz); - auto const vec2 = std::vector(sz, 0); - - auto const iota_3 = fp_wrapper(vec1.begin(), vec1.end(), scale_type{-3}); - auto const zeros_3 = fp_wrapper(vec2.begin(), vec2.end(), scale_type{-1}); - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, - static_cast(iota_3).type(), - static_cast(zeros_3).type()); - auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view()); - - // TESTING binary op EQUAL, LESS, GREATER - - auto const trues = std::vector(sz, true); - auto const true_col = wrapper(trues.begin(), trues.end()); - - auto const btype = cudf::data_type{type_id::BOOL8}; - auto const equal_result = - cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view()); - - auto const less_result = - cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view()); - - auto const greater_result = - cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMaxSimple) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const trues = std::vector(4, true); - auto const col1 = fp_wrapper{{40, 30, 20, 10, 0}, {1, 0, 1, 1, 0}, scale_type{-2}}; - auto const col2 = fp_wrapper{{10, 20, 30, 40, 0}, {1, 1, 1, 0, 0}, scale_type{-2}}; - auto const expected = fp_wrapper{{40, 20, 30, 10, 0}, {1, 1, 1, 1, 0}, scale_type{-2}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX, - static_cast(col1).type(), - static_cast(col2).type()); - auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMinSimple) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const trues = std::vector(4, true); - auto const col1 = fp_wrapper{{40, 30, 20, 10, 0}, {1, 1, 1, 0, 0}, scale_type{-1}}; - auto const col2 = fp_wrapper{{10, 20, 30, 40, 0}, {1, 0, 1, 1, 0}, scale_type{-1}}; - auto const expected = 
fp_wrapper{{10, 30, 20, 40, 0}, {1, 1, 1, 1, 0}, scale_type{-1}}; - - auto const type = - cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN, - static_cast(col1).type(), - static_cast(col2).type()); - auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullEqualsSimple) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const trues = std::vector(4, true); - auto const col1 = fp_wrapper{{400, 300, 300, 100}, {1, 1, 1, 0}, scale_type{-2}}; - auto const col2 = fp_wrapper{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}}; - auto const expected = wrapper{{1, 0, 0, 1}, {1, 1, 1, 1}}; - - auto const result = cudf::binary_operation( - col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{100, 300, 500, 700}, scale_type{-2}}; - auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{0}}; - auto const expected = fp_wrapper{{25, 75, 125, 175}, scale_type{-2}}; - - auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div2) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{100000, 300000, 500000, 700000}, scale_type{-3}}; - auto const rhs = fp_wrapper{{20, 20, 20, 20}, scale_type{-1}}; - auto const expected = fp_wrapper{{5000, 15000, 25000, 35000}, scale_type{-2}}; - - auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div3) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10000, 30000, 50000, 70000}, scale_type{-2}}; - auto const rhs = fp_wrapper{{3, 9, 3, 3}, scale_type{0}}; - auto const expected = fp_wrapper{{3333, 3333, 16666, 23333}, scale_type{-2}}; - - auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div4) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{1}}; - auto const rhs = make_fixed_point_scalar(3, scale_type{0}); - auto const expected = fp_wrapper{{3, 10, 16, 23}, scale_type{1}}; - - auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div6) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = 
make_fixed_point_scalar(3000, scale_type{-3}); - auto const rhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; - - auto const expected = fp_wrapper{{300, 100, 60, 42}, scale_type{-2}}; - - auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div7) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = make_fixed_point_scalar(1200, scale_type{0}); - auto const rhs = fp_wrapper{{100, 200, 300, 500, 600, 800, 1200, 1300}, scale_type{-2}}; - - auto const expected = fp_wrapper{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}}; - - auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div8) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{4000, 6000, 80000}, scale_type{-1}}; - auto const rhs = make_fixed_point_scalar(5000, scale_type{-3}); - auto const expected = fp_wrapper{{0, 1, 16}, scale_type{2}}; - - auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div9) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{10, 20, 30}, scale_type{2}}; - auto const rhs = make_fixed_point_scalar(7, scale_type{1}); - auto const expected = fp_wrapper{{1, 2, 4}, scale_type{1}}; - - auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div10) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{100, 200, 300}, scale_type{1}}; - auto const rhs = make_fixed_point_scalar(7, scale_type{0}); - auto const expected = fp_wrapper{{14, 28, 42}, scale_type{1}}; - - auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div11) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const lhs = fp_wrapper{{1000, 2000, 3000}, scale_type{1}}; - auto const rhs = fp_wrapper{{7, 7, 7}, scale_type{0}}; - auto const expected = fp_wrapper{{142, 285, 428}, scale_type{1}}; - - auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); -} - -TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - auto const col = fp_wrapper{{100, 300, 500, 700}, scale_type{-2}}; - auto const non_bool_type = data_type{type_to_id(), -2}; - 
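// Note (illustrative): comparison operators are required to produce BOOL8
// output; requesting a fixed-point output type for LESS is invalid, so the
// call below is expected to throw cudf::logic_error.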
EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type), - cudf::logic_error); -} - -template -struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture { -}; - -using Decimal64And128Types = cudf::test::Types; -TYPED_TEST_SUITE(FixedPointTest_64_128_Reps, Decimal64And128Types); - -TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests) -{ - using namespace numeric; - using decimalXX = TypeParam; - using RepType = device_storage_type_t; - - for (auto const rhs_value : {10000000000000000, 100000000000000000}) { - auto const lhs = fp_wrapper{{33041, 97290, 36438, 25379, 48473}, scale_type{2}}; - auto const rhs = make_fixed_point_scalar(rhs_value, scale_type{0}); - auto const trues = wrapper{{1, 1, 1, 1, 1}}; - auto const falses = wrapper{{0, 0, 0, 0, 0}}; - auto const bool_type = cudf::data_type{type_id::BOOL8}; - - auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type); - auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type); - auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type); - auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type); - auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type); - auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type); - auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type); - auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses); - } -} - -} // namespace binop -} // namespace test -} // namespace cudf - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp index b7e7702bd6c..55ddde5ce5f 100644 --- a/cpp/tests/binaryop/binop-null-test.cpp +++ b/cpp/tests/binaryop/binop-null-test.cpp @@ -74,8 +74,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid) lhs.set_valid_async(false); auto rhs = make_random_wrapped_column(100, mask_state::ALL_VALID); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -91,8 +91,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Valid_Vector_NonNullable) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -109,8 +109,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_NonNullable) lhs.set_valid_async(false); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, 
cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -126,8 +126,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Scalar_Valid) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(100, mask_state::ALL_NULL); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -143,8 +143,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_Valid) auto lhs = make_random_wrapped_column(100, mask_state::ALL_NULL); auto rhs = make_random_wrapped_column(100, mask_state::ALL_VALID); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -160,8 +160,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::ALL_NULL); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -177,8 +177,8 @@ TEST_F(BinaryOperationNullTest, Vector_Valid_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::ALL_VALID); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -194,8 +194,8 @@ TEST_F(BinaryOperationNullTest, Vector_NonNullable_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 779dc7c4c1f..167fbc22bde 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -35,9 +35,9 @@ TEST_F(BinopVerifyInputTest, Vector_Scalar_ErrorOutputVectorType) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10); - EXPECT_THROW(cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)), - cudf::logic_error); + EXPECT_THROW( + cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)), + cudf::logic_error); } TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) @@ -49,9 +49,9 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) auto lhs = make_random_wrapped_column(1); auto rhs = make_random_wrapped_column(10); - EXPECT_THROW(cudf::jit::binary_operation( - lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())), - cudf::logic_error); + EXPECT_THROW( + cudf::binary_operation(lhs, rhs, 
cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>())),
+    cudf::logic_error);
 }

 } // namespace binop

From 16fcf4880e74c264899a8b09332cd8a99d08babe Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan
Date: Thu, 25 Nov 2021 23:40:44 +0530
Subject: [PATCH 36/72] remove jit code that is now covered by compiled binops

---
 cpp/include/cudf/binaryop.hpp | 78 ----
 cpp/include/cudf/detail/binaryop.hpp | 45 +-
 cpp/src/binaryop/binaryop.cpp | 237 +---------
 cpp/src/binaryop/jit/kernel.cu | 51 ---
 cpp/src/binaryop/jit/operation.hpp | 646 ---------------------------
 cpp/src/binaryop/jit/traits.hpp | 68 ---
 cpp/src/binaryop/jit/util.hpp | 88 ----
 7 files changed, 6 insertions(+), 1207 deletions(-)
 delete mode 100644 cpp/src/binaryop/jit/operation.hpp
 delete mode 100644 cpp/src/binaryop/jit/traits.hpp
 delete mode 100644 cpp/src/binaryop/jit/util.hpp

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index fe548a36cf0..a514010c1f0 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -210,83 +210,5 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
 cudf::data_type const& lhs,
 cudf::data_type const& rhs);

-namespace jit {
-/**
- * @brief Performs a binary operation between a scalar and a column.
- *
- * The output contains the result of `op(lhs, rhs[i])` for all `0 <= i < rhs.size()`
- * The scalar is the left operand and the column elements are the right operand.
- * This distinction is significant in case of non-commutative binary operations
- *
- * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
- *
- * @param lhs The left operand scalar
- * @param rhs The right operand column
- * @param op The binary operator
- * @param output_type The desired data type of the output column
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Output column of `output_type` type containing the result of
- * the binary operation
- * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- */
-std::unique_ptr<column> binary_operation(
-  scalar const& lhs,
-  column_view const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Performs a binary operation between a column and a scalar.
- *
- * The output contains the result of `op(lhs[i], rhs)` for all `0 <= i < lhs.size()`
- * The column elements are the left operand and the scalar is the right operand.
- * This distinction is significant in case of non-commutative binary operations
- *
- * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
- *
- * @param lhs The left operand column
- * @param rhs The right operand scalar
- * @param op The binary operator
- * @param output_type The desired data type of the output column
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Output column of `output_type` type containing the result of
- * the binary operation
- * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- */
-std::unique_ptr<column> binary_operation(
-  column_view const& lhs,
-  scalar const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Performs a binary operation between two columns.
- * - * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` - * - * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands - * - * @param lhs The left operand column - * @param rhs The right operand column - * @param op The binary operator - * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Output column of `output_type` type containing the result of - * the binary operation - * @throw cudf::logic_error if @p lhs and @p rhs are different sizes - * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace jit /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index ce7731ef7d2..9fa31d0e01d 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -22,52 +22,9 @@ namespace cudf { //! Inner interfaces and implementations namespace detail { -namespace jit { -/** - * @copydoc cudf::jit::binary_operation(scalar const&, column_view const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr binary_operation( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::jit::binary_operation(column_view const&, scalar const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr binary_operation( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, - * binary_operator, data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace jit - -/** - * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, + * @copydoc cudf::binary_operation(column_view const&, column_view const&, * std::string const&, data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
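
For context, a minimal sketch of a call site migrated off the deleted cudf::jit overloads, mirroring the test changes earlier in this series. The wrapper name add_as_int32 is illustrative only and not part of the patch; cudf::binary_operation is the compiled entry point that remains:

#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <memory>

// Callers simply drop the jit:: qualifier; the compiled overload has the
// same parameter list (lhs, rhs, operator, output type, optional resource).
std::unique_ptr<cudf::column> add_as_int32(cudf::column_view const& lhs,
                                           cudf::column_view const& rhs)
{
  return cudf::binary_operation(
    lhs, rhs, cudf::binary_operator::ADD, cudf::data_type{cudf::type_id::INT32});
}
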
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index e84e175eaca..3398592d5b4 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -18,7 +18,6 @@ */ #include "compiled/binary_ops.hpp" -#include "jit/util.hpp" #include @@ -126,113 +125,6 @@ bool is_same_scale_necessary(binary_operator op) } namespace jit { - -void binary_operation(mutable_column_view& out, - column_view const& lhs, - scalar const& rhs, - binary_operator op, - OperatorType op_type, - rmm::cuda_stream_view stream) -{ - if (is_null_dependent(op)) { - std::string kernel_name = - jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity") // - .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, op_type)); - - cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // - ->launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - lhs.offset(), - rhs.is_valid(stream)); - } else { - std::string kernel_name = - jitify2::reflection::Template("cudf::binops::jit::kernel_v_s") // - .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, op_type)); - - cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // - ->launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); - } -} - -void binary_operation(mutable_column_view& out, - column_view const& lhs, - scalar const& rhs, - binary_operator op, - rmm::cuda_stream_view stream) -{ - return binary_operation(out, lhs, rhs, op, OperatorType::Direct, stream); -} - -void binary_operation(mutable_column_view& out, - scalar const& lhs, - column_view const& rhs, - binary_operator op, - rmm::cuda_stream_view stream) -{ - return binary_operation(out, rhs, lhs, op, OperatorType::Reverse, stream); -} - -void binary_operation(mutable_column_view& out, - column_view const& lhs, - column_view const& rhs, - binary_operator op, - rmm::cuda_stream_view stream) -{ - if (is_null_dependent(op)) { - std::string kernel_name = - jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity") // - .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)); - - cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // - ->launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - rhs.offset(), - rhs.null_mask(), - rhs.offset()); - } else { - std::string kernel_name = - jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // - .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)); - - 
cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) - .get_kernel(kernel_name, {}, {}, {"-arch=sm_."}) // - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // - ->launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); - } -} - void binary_operation(mutable_column_view& out, column_view const& lhs, column_view const& rhs, @@ -246,12 +138,11 @@ void binary_operation(mutable_column_view& out, std::string cuda_source = cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name); - std::string kernel_name = - jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // - .instantiate(output_type_name, // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)); + std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // + .instantiate(output_type_name, // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + std::string("UserDefinedOp")); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) .get_kernel( @@ -418,126 +309,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); } }; - -namespace jit { - -std::unique_ptr binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // calls compiled ops for string types - if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - - // Check for datatype - CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); - CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); - CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - if (rhs.is_empty()) return out; - - auto out_view = out->mutable_view(); - binops::jit::binary_operation(out_view, lhs, rhs, op, stream); - return out; -} - -std::unique_ptr binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // calls compiled ops for string types - if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - - // Check for datatype - CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); - CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); - CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - if (lhs.is_empty()) return out; - - auto out_view = out->mutable_view(); - binops::jit::binary_operation(out_view, 
lhs, rhs, op, stream); - return out; -} - -std::unique_ptr binary_operation(column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); - - // calls compiled ops for string types - if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); - - // Check for datatype - CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); - CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype"); - CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); - CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - if (lhs.is_empty() or rhs.is_empty()) return out; - - auto out_view = out->mutable_view(); - binops::jit::binary_operation(out_view, lhs, rhs, op, stream); - return out; -} -} // namespace jit } // namespace detail -namespace jit { -std::unique_ptr binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} - -std::unique_ptr binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} - -std::unique_ptr binary_operation(column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} -} // namespace jit - namespace detail { std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index fcfe16f979d..3130cf65bb3 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -18,8 +18,6 @@ * limitations under the License. */ -#include - #include #include #include @@ -30,55 +28,6 @@ namespace cudf { namespace binops { namespace jit { -template -__global__ void kernel_v_s_with_validity(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data, - cudf::bitmask_type* output_mask, - cudf::bitmask_type const* mask, - cudf::size_type offset, - bool scalar_valid) -{ - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i = start; i < size; i += step) { - bool output_valid = false; - out_data[i] = TypeOpe::template operate( - lhs_data[i], - rhs_data[0], - mask ? 
cudf::bit_is_set(mask, offset + i) : true, - scalar_valid, - output_valid); - if (output_mask && !output_valid) cudf::clear_bit(output_mask, i); - } -} - -template -__global__ void kernel_v_s(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data) -{ - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i = start; i < size; i += step) { - out_data[i] = TypeOpe::template operate(lhs_data[i], rhs_data[0]); - } -} - template __global__ void kernel_v_v(cudf::size_type size, TypeOut* out_data, diff --git a/cpp/src/binaryop/jit/operation.hpp b/cpp/src/binaryop/jit/operation.hpp deleted file mode 100644 index d117f2182f9..00000000000 --- a/cpp/src/binaryop/jit/operation.hpp +++ /dev/null @@ -1,646 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include - -#include - -#pragma once - -using namespace cuda::std; - -namespace cudf { -namespace binops { -namespace jit { - -struct Add { - // Allow sum between chronos only when both input and output types - // are chronos. Unsupported combinations will fail to compile - template < - typename TypeOut, - typename TypeLhs, - typename TypeRhs, - enable_if_t<(is_chrono_v && is_chrono_v && is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return x + y; - } - - template || !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) + static_cast(y)); - } -}; - -using RAdd = Add; - -struct Sub { - // Allow difference between chronos only when both input and output types - // are chronos. 
Unsupported combinations will fail to compile - template < - typename TypeOut, - typename TypeLhs, - typename TypeRhs, - enable_if_t<(is_chrono_v && is_chrono_v && is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return x - y; - } - - template || !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) - static_cast(y)); - } -}; - -struct RSub { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return Sub::operate(y, x); - } -}; - -struct Mul { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) * static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return DurationProduct(x, y); - } - - template && is_integral_v) || - (is_integral_v && is_duration_v)>* = nullptr> - static TypeOut DurationProduct(TypeLhs x, TypeRhs y) - { - return x * y; - } -}; - -using RMul = Mul; - -struct Div { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) / static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return DurationDivide(x, y); - } - - template || is_duration_v)>* = nullptr> - static TypeOut DurationDivide(TypeLhs x, TypeRhs y) - { - return x / y; - } -}; - -struct RDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return Div::operate(y, x); - } -}; - -struct TrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast(x) / static_cast(y)); - } -}; - -struct RTrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return TrueDiv::operate(y, x); - } -}; - -struct FloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return floor(static_cast(x) / static_cast(y)); - } -}; - -struct RFloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return FloorDiv::operate(y, x); - } -}; - -struct Mod { - template < - typename TypeOut, - typename TypeLhs, - typename TypeRhs, - enable_if_t<(is_integral_v::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) % static_cast(y)); - } - - template < - typename TypeOut, - typename TypeLhs, - typename TypeRhs, - enable_if_t<(isFloat::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return static_cast(fmodf(static_cast(x), static_cast(y))); - } - - template < - typename TypeOut, - typename TypeLhs, - typename TypeRhs, - enable_if_t<(isDouble::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return static_cast(fmod(static_cast(x), static_cast(y))); - } - - template && is_duration_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return x % y; - } -}; - -struct RMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return Mod::operate(y, x); - } -}; - -struct PyMod { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return ((x % y) + y) % y; - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - double x1 = static_cast(x); - double y1 = static_cast(y); - return fmod(fmod(x1, y1) + y1, y1); - } - - template && is_duration_v)>* = nullptr> - static 
TypeOut operate(TypeLhs x, TypeRhs y) - { - return ((x % y) + y) % y; - } -}; - -struct RPyMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return PyMod::operate(y, x); - } -}; - -struct Pow { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return pow(static_cast(x), static_cast(y)); - } -}; - -struct RPow { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return Pow::operate(y, x); - } -}; - -struct Equal { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x == y); - } -}; - -using REqual = Equal; - -struct NotEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x != y); - } -}; - -using RNotEqual = NotEqual; - -struct Less { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x < y); - } -}; - -struct RLess { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y < x); - } -}; - -struct Greater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x > y); - } -}; - -struct RGreater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y > x); - } -}; - -struct LessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x <= y); - } -}; - -struct RLessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y <= x); - } -}; - -struct GreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x >= y); - } -}; - -struct RGreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y >= x); - } -}; - -struct BitwiseAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast(x) & static_cast(y)); - } -}; - -using RBitwiseAnd = BitwiseAnd; - -struct BitwiseOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast(x) | static_cast(y)); - } -}; - -using RBitwiseOr = BitwiseOr; - -struct BitwiseXor { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast(x) ^ static_cast(y)); - } -}; - -using RBitwiseXor = BitwiseXor; - -struct LogicalAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x && y); - } -}; - -using RLogicalAnd = LogicalAnd; - -struct LogicalOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x || y); - } -}; - -using RLogicalOr = LogicalOr; - -struct UserDefinedOp { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - TypeOut output; - using TypeCommon = typename common_type::type; - GENERIC_BINARY_OP(&output, static_cast(x), static_cast(y)); - return output; - } -}; - -struct ShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x << y); - } -}; - -struct RShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y << x); - } -}; - -struct ShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (x >> y); - } -}; - -struct RShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (y >> x); - } -}; - -struct ShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast>(x) >> y); - } -}; - -struct RShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (static_cast>(y) >> x); - } -}; - -struct LogBase { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return (std::log(static_cast(x)) / std::log(static_cast(y))); - } -}; - -struct RLogBase { - template - 
static TypeOut operate(TypeLhs x, TypeRhs y) - { - return LogBase::operate(y, x); - } -}; - -struct NullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - output_valid = true; - if (!lhs_valid && !rhs_valid) return true; - if (lhs_valid && rhs_valid) return x == y; - return false; - } -}; - -struct RNullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - output_valid = true; - return NullEquals::operate(y, x, rhs_valid, lhs_valid, output_valid); - } -}; - -struct NullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) - return TypeOut{x}; - else - return TypeOut{y}; - } -}; - -struct RNullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - return NullMax::operate(y, x, rhs_valid, lhs_valid, output_valid); - } -}; - -struct NullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) - return TypeOut{x}; - else - return TypeOut{y}; - } -}; - -struct RNullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) - { - return NullMin::operate(y, x, rhs_valid, lhs_valid, output_valid); - } -}; - -struct PMod { - // Ideally, these two specializations - one for integral types and one for non integral - // types shouldn't be required, as std::fmod should promote integral types automatically - // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified - // code does not work - it is having trouble deciding between float/double overloads - template ::type>)>* = - nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = xconv % yconv; - if (rem < 0) rem = (rem + yconv) % yconv; - return TypeOut{rem}; - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) - { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = std::fmod(xconv, yconv); - if (rem < 0) rem = std::fmod(rem + yconv, yconv); - return TypeOut{rem}; - } -}; - -struct RPMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return PMod::operate(y, x); - } -}; - -struct ATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return TypeOut{std::atan2(double{x}, double{y})}; - } -}; - -struct RATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) - { - return TypeOut{ATan2::operate(y, x)}; - } -}; - -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/traits.hpp b/cpp/src/binaryop/jit/traits.hpp deleted file mode 100644 index 1033d38a668..00000000000 --- a/cpp/src/binaryop/jit/traits.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
- * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Include Jitify's cstddef header first -#include - -#include -#include -#include -#include - -#include -#include - -namespace cudf { -namespace binops { -namespace jit { - -// ------------------------------------------------------------------------- -// type_traits cannot tell the difference between float and double -template -constexpr bool isFloat = false; - -template -constexpr bool is_timestamp_v = - cuda::std::is_same_v || cuda::std::is_same_v || - cuda::std::is_same_v || cuda::std::is_same_v || - cuda::std::is_same_v; - -template -constexpr bool is_duration_v = - cuda::std::is_same_v || cuda::std::is_same_v || - cuda::std::is_same_v || cuda::std::is_same_v || - cuda::std::is_same_v; - -template -constexpr bool is_chrono_v = is_timestamp_v || is_duration_v; - -template <> -constexpr bool isFloat = true; - -template -constexpr bool isDouble = false; - -template <> -constexpr bool isDouble = true; - -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/util.hpp b/cpp/src/binaryop/jit/util.hpp deleted file mode 100644 index 34c42e28a8b..00000000000 --- a/cpp/src/binaryop/jit/util.hpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -namespace cudf { -namespace binops { -namespace jit { - -/** - * @brief Orientation of lhs and rhs in operator - */ -enum class OperatorType { - Direct, ///< Orientation of operands is op(lhs, rhs) - Reverse ///< Orientation of operands is op(rhs, lhs) -}; - -/** - * @brief Get the Operator Name - * - * @param op The binary operator as enum of type cudf::binary_operator - * @param type @see OperatorType - * @return std::string The name of the operator as string - */ -std::string inline get_operator_name(binary_operator op, OperatorType type) -{ - std::string const operator_name = [op] { - // clang-format off - switch (op) { - case binary_operator::ADD: return "Add"; - case binary_operator::SUB: return "Sub"; - case binary_operator::MUL: return "Mul"; - case binary_operator::DIV: return "Div"; - case binary_operator::TRUE_DIV: return "TrueDiv"; - case binary_operator::FLOOR_DIV: return "FloorDiv"; - case binary_operator::MOD: return "Mod"; - case binary_operator::PYMOD: return "PyMod"; - case binary_operator::POW: return "Pow"; - case binary_operator::EQUAL: return "Equal"; - case binary_operator::NOT_EQUAL: return "NotEqual"; - case binary_operator::LESS: return "Less"; - case binary_operator::GREATER: return "Greater"; - case binary_operator::LESS_EQUAL: return "LessEqual"; - case binary_operator::GREATER_EQUAL: return "GreaterEqual"; - case binary_operator::BITWISE_AND: return "BitwiseAnd"; - case binary_operator::BITWISE_OR: return "BitwiseOr"; - case binary_operator::BITWISE_XOR: return "BitwiseXor"; - case binary_operator::LOGICAL_AND: return "LogicalAnd"; - case binary_operator::LOGICAL_OR: return "LogicalOr"; - case binary_operator::GENERIC_BINARY: return "UserDefinedOp"; - case binary_operator::SHIFT_LEFT: return "ShiftLeft"; - case binary_operator::SHIFT_RIGHT: return "ShiftRight"; - case binary_operator::SHIFT_RIGHT_UNSIGNED: return "ShiftRightUnsigned"; - case binary_operator::LOG_BASE: return "LogBase"; - case binary_operator::ATAN2: return "ATan2"; - case binary_operator::PMOD: return "PMod"; - case binary_operator::NULL_EQUALS: return "NullEquals"; - case binary_operator::NULL_MAX: return "NullMax"; - case binary_operator::NULL_MIN: return "NullMin"; - default: return ""; - } - // clang-format on - }(); - - if (operator_name == "") { return "None"; } - - return "cudf::binops::jit::" + - (type == OperatorType::Direct ? 
operator_name : 'R' + operator_name); -} - -} // namespace jit -} // namespace binops -} // namespace cudf From 1b9d624af8b3b5d097be3e478d5f878fa26e7632 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 25 Nov 2021 23:53:17 +0530 Subject: [PATCH 37/72] remove jit benchmark --- cpp/benchmarks/CMakeLists.txt | 1 - .../binaryop/binaryop_benchmark.cpp | 8 +- .../binaryop/jit_binaryop_benchmark.cpp | 99 ------------------- 3 files changed, 4 insertions(+), 104 deletions(-) delete mode 100644 cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fa1e61e26fd..72b247ae748 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -208,7 +208,6 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( BINARYOP_BENCH binaryop/binaryop_benchmark.cpp binaryop/compiled_binaryop_benchmark.cpp - binaryop/jit_binaryop_benchmark.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp index 9de1112a9db..314d657679b 100644 --- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp @@ -74,14 +74,14 @@ static void BM_binaryop_transform(benchmark::State& state) auto const op = cudf::binary_operator::ADD; auto result_data_type = cudf::data_type(cudf::type_to_id()); if (reuse_columns) { - auto result = cudf::jit::binary_operation(columns.at(0), columns.at(0), op, result_data_type); + auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - result = cudf::jit::binary_operation(result->view(), columns.at(0), op, result_data_type); + result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type); } } else { - auto result = cudf::jit::binary_operation(columns.at(0), columns.at(1), op, result_data_type); + auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type); std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) { - result = cudf::jit::binary_operation(result->view(), col, op, result_data_type); + result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } } diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp deleted file mode 100644 index 7fda4a50ea1..00000000000 --- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include - -#include - -#include - -template -class JIT_BINARYOP : public cudf::benchmark { -}; - -template -void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) -{ - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - - auto data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); - cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); - - auto lhs = cudf::column_view(input1); - auto rhs = cudf::column_view(input2); - auto output_dtype = cudf::data_type(cudf::type_to_id()); - - // Call once for hot cache. - cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); - - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); - } -} - -// TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F( \ - JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ - (::benchmark::State & st) \ - { \ - BM_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ - -using namespace cudf; -using namespace numeric; - -// clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); From e49a3430ec0a75379fcf83a5041f0ae81a1ffdba Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 26 Nov 2021 00:40:40 +0530 Subject: [PATCH 38/72] skip generic op udf (jit ptx) in pytest CUDA<11.5 --- python/cudf/cudf/tests/test_udf_binops.py | 9 +++++++++ 1 file changed, 9 
insertions(+) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 4d6188acf8c..935c3868a68 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -6,11 +6,20 @@ from numba.cuda import compile_ptx from numba.np import numpy_support +import rmm + import cudf from cudf import Series, _lib as libcudf from cudf.utils import dtypes as dtypeutils +_driver_version = rmm._cuda.gpu.driverGetVersion() +_runtime_version = rmm._cuda.gpu.runtimeGetVersion() +_CUDA_JIT128INT_SUPPORTED = (_driver_version >= 11050) and ( + _runtime_version >= 11050 +) + +@pytest.mark.skipif(not _CUDA_JIT128INT_SUPPORTED, reason="requires CUDA 11.5") @pytest.mark.parametrize( "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int8"})) ) From 8f640866b7512409af5e12b38bd7e8b3633b9a9e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 26 Nov 2021 02:07:56 +0530 Subject: [PATCH 39/72] add deleted UserDefinedOp --- cpp/src/binaryop/binaryop.cpp | 2 +- cpp/src/binaryop/jit/kernel.cu | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 3398592d5b4..a89e9795283 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -142,7 +142,7 @@ void binary_operation(mutable_column_view& out, .instantiate(output_type_name, // list of template arguments cudf::jit::get_type_name(lhs.type()), cudf::jit::get_type_name(rhs.type()), - std::string("UserDefinedOp")); + std::string("cudf::binops::jit::UserDefinedOp")); cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) .get_kernel( diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index 3130cf65bb3..4eb011a1846 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -28,6 +28,17 @@ namespace cudf { namespace binops { namespace jit { +struct UserDefinedOp { + template + static TypeOut operate(TypeLhs x, TypeRhs y) + { + TypeOut output; + using TypeCommon = typename common_type::type; + GENERIC_BINARY_OP(&output, static_cast(x), static_cast(y)); + return output; + } +}; + template __global__ void kernel_v_v(cudf::size_type size, TypeOut* out_data, From efb203bcd847d786faacc25da754251bf4c5eb2b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 26 Nov 2021 03:51:38 +0530 Subject: [PATCH 40/72] fix missing includes --- cpp/src/binaryop/jit/kernel.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index 4eb011a1846..c9cc61a4f34 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -24,6 +24,9 @@ #include #include +#include +#include + namespace cudf { namespace binops { namespace jit { @@ -33,7 +36,7 @@ struct UserDefinedOp { static TypeOut operate(TypeLhs x, TypeRhs y) { TypeOut output; - using TypeCommon = typename common_type::type; + using TypeCommon = typename cuda::std::common_type::type; GENERIC_BINARY_OP(&output, static_cast(x), static_cast(y)); return output; } From 011fb482fdba796bc4238add7f86d81829100bff Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 29 Nov 2021 14:06:28 +0530 Subject: [PATCH 41/72] fix segfault by nullptr check in cufile_shim dtor --- cpp/src/io/utilities/file_io_utilities.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 
387452e171a..f6175fae4ec 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -106,8 +106,8 @@ class cufile_shim {
   ~cufile_shim()
   {
-    driver_close();
-    dlclose(cf_lib);
+    if (driver_close) driver_close();
+    if (cf_lib) dlclose(cf_lib);
   }

   decltype(cuFileHandleRegister)* handle_register = nullptr;

From 9bdc28b9964b4bd75e3d387829747315bed7ac45 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan
Date: Mon, 29 Nov 2021 14:13:43 +0530
Subject: [PATCH 42/72] enable cuio tests again

---
 cpp/tests/CMakeLists.txt | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8ae31d7d74d..c1c209b2413 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -191,11 +191,15 @@ ConfigureTest(
 # * io tests --------------------------------------------------------------------------------------
 ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)
-# ConfigureTest(CSV_TEST io/csv_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp)
-# ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp)
-# ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST
-# io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3)
-# target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif()
+ConfigureTest(CSV_TEST io/csv_test.cpp)
+ConfigureTest(ORC_TEST io/orc_test.cpp)
+ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
+ConfigureTest(JSON_TEST io/json_test.cpp)
+ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
+ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
+if(CUDF_ENABLE_ARROW_S3)
+  target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
+endif()

 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------

From a3ba687b2e53a4fcc2ed9d9502733251ec12096b Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan
Date: Mon, 29 Nov 2021 21:31:55 +0530
Subject: [PATCH 43/72] address review comments

---
 cpp/src/binaryop/binaryop.cpp | 10 +++-------
 cpp/tests/binaryop/util/runtime_support.h | 5 +----
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index a89e9795283..7087b71a84e 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -138,16 +138,15 @@ void binary_operation(mutable_column_view& out,
   std::string cuda_source =
     cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);

-  std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+  std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")
                               .instantiate(output_type_name,  // list of template arguments
                                            cudf::jit::get_type_name(lhs.type()),
                                            cudf::jit::get_type_name(rhs.type()),
                                            std::string("cudf::binops::jit::UserDefinedOp"));

   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-    .get_kernel(
-      kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+    .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
     ->launch(out.size(),
             cudf::jit::get_data_ptr(out),
cudf::jit::get_data_ptr(lhs), @@ -221,7 +220,6 @@ namespace detail { // There are 3 overloads of each of the following functions: // - `make_fixed_width_column_for_output` -// - `fixed_point_binary_operation` // - `binary_operation` // The overloads are overloaded on the first two parameters of each function: @@ -309,9 +307,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); } }; -} // namespace detail -namespace detail { std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h index a7ee0c3a391..250d34a0879 100644 --- a/cpp/tests/binaryop/util/runtime_support.h +++ b/cpp/tests/binaryop/util/runtime_support.h @@ -1,8 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 0fa0cc48a6b3b93e79f918d419a012b75765561c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Nov 2021 07:50:12 -0700 Subject: [PATCH 44/72] Support `min` and `max` in inclusive scan for structs (#9725) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` inclusive scan. Exclusive scan support is not needed in the near future. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9725 --- cpp/include/cudf/detail/scan.hpp | 71 ++++---- cpp/src/groupby/sort/group_scan_util.cuh | 6 + cpp/src/reductions/scan/scan_inclusive.cu | 87 +++++++++- cpp/tests/reductions/reduction_tests.cpp | 2 +- cpp/tests/reductions/scan_tests.cpp | 196 ++++++++++++++++++++++ 5 files changed, 325 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 113c15f19a1..8e3db1c7b10 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -26,22 +26,25 @@ namespace detail { /** * @brief Computes the exclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * The identity value for the column type as per the aggregation type - * is used for the value of the first element in the output column. + * The identity value for the column type as per the aggregation type is used for the value of the + * first element in the output column. * - * @throws cudf::logic_error if column data_type is not an arithmetic type. + * Struct columns are allowed with aggregation types Min and Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. 
+ * @throws cudf::logic_error if column data_type is not an arithmetic or struct type, or if it is
+ * a struct type and `agg` is not Min or Max.
+ *
+ * @param input The input column view for the scan.
+ * @param agg unique_ptr to aggregation operator applied by the scan.
+ * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE.
+ * Include nulls if null_policy::INCLUDE. Any operation with a null results in
+ * a null.
 * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @returns Column with scan results
+ * @param mr Device memory resource used to allocate the returned scalar's device memory.
+ * @returns Column with scan results.
 */
std::unique_ptr scan_exclusive(column_view const& input,
 std::unique_ptr const& agg,
@@ -52,22 +55,22 @@ std::unique_ptr scan_exclusive(column_view const& input,
/**
 * @brief Computes the inclusive scan of a column.
 *
- * The null values are skipped for the operation, and if an input element
- * at `i` is null, then the output element at `i` will also be null.
+ * The null values are skipped for the operation, and if an input element at `i` is null, then the
+ * output element at `i` will also be null.
 *
- * String columns are allowed with aggregation types Min and Max.
+ * String and struct columns are allowed with aggregation types Min and Max.
 *
- * @throws cudf::logic_error if column data_type is not an arithmetic type
- * or string type but the `agg` is not Min or Max
+ * @throws cudf::logic_error if column data_type is not an arithmetic, string, or struct type, or
+ * if it is a string/struct type and `agg` is not Min or Max.
 *
- * @param input The input column view for the scan
- * @param agg unique_ptr to aggregation operator applied by the scan
- * @param null_handling Exclude null values when computing the result if
- * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE.
- * Any operation with a null results in a null.
+ * @param input The input column view for the scan.
+ * @param agg unique_ptr to aggregation operator applied by the scan.
+ * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE.
+ * Include nulls if null_policy::INCLUDE. Any operation with a null results in
+ * a null.
 * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @returns Column with scan results
+ * @param mr Device memory resource used to allocate the returned scalar's device memory.
+ * @returns Column with scan results.
 */
std::unique_ptr scan_inclusive(column_view const& input,
 std::unique_ptr const& agg,
@@ -76,24 +79,24 @@ std::unique_ptr scan_inclusive(column_view const& input,
 rmm::mr::device_memory_resource* mr);

/**
- * @brief Generate row ranks for a column
+ * @brief Generate row ranks for a column.
 *
- * @param order_by Input column to generate ranks for
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return rank values
+ * @param order_by Input column to generate ranks for.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return rank values.
 */
std::unique_ptr inclusive_rank_scan(column_view const& order_by,
 rmm::cuda_stream_view stream,
 rmm::mr::device_memory_resource* mr);

/**
- * @brief Generate row dense ranks for a column
+ * @brief Generate row dense ranks for a column.
 *
- * @param order_by Input column to generate ranks for
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return rank values
+ * @param order_by Input column to generate ranks for.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return rank values.
 */
std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by,
 rmm::cuda_stream_view stream,
 rmm::mr::device_memory_resource* mr);
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index b565e8dc6d8..ae3e3232e06 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -239,7 +239,13 @@ struct group_scan_functor()}, gather_map.size(), gather_map.data());

+  // // Gather the children elements of the prefix min/max struct elements first.
+  //
+  // Typically, we should use `get_sliced_child` for each child column to properly handle the
+  // input if it is a sliced view. However, since the input to this function is just generated
+  // from groupby internal APIs which is never a sliced view, we just use `child_begin` and
+  // `child_end` iterators for simplicity.
 auto scanned_children = cudf::detail::gather(
 table_view(std::vector{values.child_begin(), values.child_end()}),
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 02ecd6df4d9..70f5ca90539 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -14,13 +14,17 @@
 * limitations under the License.
 */

-#include "scan.cuh"
+#include
+#include

 #include
+#include
 #include
 #include
 #include
 #include
+#include
+#include

 #include
 #include

@@ -150,6 +154,72 @@ struct scan_functor {
 }
};

+template
+struct scan_functor {
+  static std::unique_ptr invoke(column_view const& input,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
+  {
+    // Op is used only to determine if we want to find the min or max element.
+    auto constexpr is_min_op = std::is_same_v;
+
+    // Build indices of the scan operation results (ARGMIN/ARGMAX).
+    // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the
+    // opposite for ARGMAX.
+    auto gather_map = rmm::device_uvector(input.size(), stream);
+    auto const do_scan = [&](auto const& binop) {
+      thrust::inclusive_scan(rmm::exec_policy(stream),
+                             thrust::counting_iterator(0),
+                             thrust::counting_iterator(input.size()),
+                             gather_map.begin(),
+                             binop);
+    };
+
+    auto constexpr null_precedence = is_min_op ? cudf::null_order::AFTER : cudf::null_order::BEFORE;
+    auto const flattened_input = cudf::structs::detail::flatten_nested_columns(
+      table_view{{input}}, {}, std::vector{null_precedence});
+    auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream);
+    auto const flattened_null_precedences =
+      is_min_op ?
cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } + + // Gather the children columns of the input column. Must use `get_sliced_child` to properly + // handle input in case it is a sliced view. + auto const input_children = [&] { + auto const it = cudf::detail::make_counting_transform_iterator( + 0, [structs_view = structs_column_view{input}, stream](auto const child_idx) { + return structs_view.get_sliced_child(child_idx); + }); + return std::vector(it, it + input.num_children()); + }(); + + // Gather the children elements of the prefix min/max struct elements for the output. + auto scanned_children = cudf::detail::gather(table_view{input_children}, + gather_map, + out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // Don't need to set a null mask because that will be handled at the caller. + return make_structs_column(input.size(), + std::move(scanned_children), + UNKNOWN_NULL_COUNT, + rmm::device_buffer{0, stream, mr}); + } +}; + /** * @brief Dispatcher for running a Scan operation on an input column * @@ -161,7 +231,11 @@ struct scan_dispatcher { template static constexpr bool is_supported() { - return std::is_invocable_v && !cudf::is_dictionary(); + if constexpr (std::is_same_v) { + return std::is_same_v || std::is_same_v; + } else { + return std::is_invocable_v && !cudf::is_dictionary(); + } } public: @@ -209,6 +283,15 @@ std::unique_ptr scan_inclusive( output->set_null_mask(mask_scan(input, scan_type::INCLUSIVE, stream, mr), UNKNOWN_NULL_COUNT); } + // If the input is a structs column, we also need to push down nulls from the parent output column + // into the children columns. 
+ if (input.type().id() == type_id::STRUCT && output->has_nulls()) { + for (size_type idx = 0; idx < output->num_children(); ++idx) { + structs::detail::superimpose_parent_nulls( + output->view().null_mask(), output->null_count(), output->child(idx), stream, mr); + } + } + return output; } } // namespace detail diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 2c9279260e7..d8ee8f9d08d 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d1e983460d5..0892436eb47 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -397,3 +398,198 @@ TYPED_TEST(ScanDurationTest, Sum) EXPECT_THROW(cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE), cudf::logic_error); } + +struct StructScanTest : public cudf::test::BaseFixture { +}; + +TEST_F(StructScanTest, StructScanMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", 
"₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "año", "año", "" /*NULL*/, "año", "año", "aab", "aab", "" /*NULL*/, "aab"}; + auto child2 = INTS_CW{1, 1, 1, 0 /*NULL*/, 1, 1, 7, 7, 0 /*NULL*/, 7}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "bit", "bit", "" /*NULL*/, "zit", "zit", "zit", "zit", "" /*NULL*/, "zit"}; + auto child2 = INTS_CW{1, 2, 2, 0 /*NULL*/, 5, 5, 5, 5, 0 /*NULL*/, 5}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "año", + "año", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 1, + 1, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "bit", + "bit", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 2, + 2, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} From 74ac6ed5e06be9a1ee37f3ceaa1d45b2224266f2 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Tue, 30 Nov 2021 22:58:19 +0800 Subject: [PATCH 45/72] fix make_empty_scalar_like (#9782) Signed-off-by: sperlingxx --- cpp/src/io/orc/writer_impl.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2bf020d08a2..9e493c192e4 100644 --- 
a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1496,15 +1496,23 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(col.child(lists_column_view::child_column_index), &orc_columns[new_col_idx], col_meta.child(lists_column_view::child_column_index)); - } else if (kind == TypeKind::STRUCT or kind == TypeKind::MAP) { - // MAP: skip to the list child - include grandchildren columns instead of children - auto const real_parent_col = - kind == TypeKind::MAP ? col.child(lists_column_view::child_column_index) : col; - for (auto child_idx = 0; child_idx != real_parent_col.num_children(); ++child_idx) { - append_orc_column(real_parent_col.child(child_idx), - &orc_columns[new_col_idx], - col_meta.child(child_idx)); + } else if (kind == TypeKind::STRUCT) { + for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) { + append_orc_column( + col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx)); } + } else if (kind == TypeKind::MAP) { + // MAP: skip to the list child - include grandchildren columns instead of children + auto const real_parent_col = col.child(lists_column_view::child_column_index); + auto const& real_parent_meta = col_meta.child(lists_column_view::child_column_index); + CUDF_EXPECTS(real_parent_meta.num_children() == 2, + "Map struct column should have exactly two children"); + // process MAP key + append_orc_column( + real_parent_col.child(0), &orc_columns[new_col_idx], real_parent_meta.child(0)); + // process MAP value + append_orc_column( + real_parent_col.child(1), &orc_columns[new_col_idx], real_parent_meta.child(1)); } }; From dca8a0a0356e90e2b9dfa2a2cedf38d0c90935cb Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 30 Nov 2021 10:40:18 -0600 Subject: [PATCH 46/72] Fix dtype-argument bug in dask_cudf read_csv (#9796) Closes #9719 `dask_cudf.read_csv` currently fails when both `usecols` and `dtype` are specified. This PR is a simple fix. In the near future, the `_internal_read_csv` implementation should also be modified to produce a `Blockwise` HLG Layer, but I will leave that for a separate PR. 
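As a quick illustration (not part of the patch itself), a minimal sketch of the call path this change fixes; the frame contents and file name are placeholders mirroring the new test below:

```python
import cudf
import dask_cudf

# Placeholder data written to a placeholder path.
df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"], "c": [10, 11, 12, 13]})
df.to_csv("usecols_data.csv", index=False)

# Previously this raised when `usecols` and `dtype` were combined; with the
# fix, meta is regenerated from the original kwargs and the read succeeds.
ddf = dask_cudf.read_csv("usecols_data.csv", usecols=["b", "c"], dtype={"b": str, "c": int})
print(ddf.compute())
```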
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9796 --- python/dask_cudf/dask_cudf/io/csv.py | 19 +++++++++++-------- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 5 +++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 132201a349e..ebb02e3b6d4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -110,9 +110,17 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): if chunksize is None: return read_csv_without_chunksize(path, **kwargs) + # Let dask.dataframe generate meta dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - usecols = kwargs.pop("usecols", None) - meta = dask_reader(filenames[0], **kwargs)._meta + kwargs1 = kwargs.copy() + usecols = kwargs1.pop("usecols", None) + dtype = kwargs1.pop("dtype", None) + meta = dask_reader(filenames[0], **kwargs1)._meta + names = meta.columns + if usecols or dtype: + # Regenerate meta with original kwargs if + # `usecols` or `dtype` was specified + meta = dask_reader(filenames[0], **kwargs)._meta dsk = {} i = 0 @@ -127,18 +135,13 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): chunksize, ) # specify which chunk of the file we care about if start != 0: - kwargs2[ - "names" - ] = meta.columns # no header in the middle of the file + kwargs2["names"] = names # no header in the middle of the file kwargs2["header"] = None - kwargs2["usecols"] = usecols dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) - if usecols is not None: - meta = meta[usecols] return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 98061f6c624..32960a90bd7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -136,7 +136,8 @@ def test_read_csv_chunksize_none(tmp_path, compression, size): dd.assert_eq(df, df2) -def test_csv_reader_usecols(tmp_path): +@pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None]) +def test_csv_reader_usecols(tmp_path, dtype): df = cudf.DataFrame( { "a": [1, 2, 3, 4] * 100, @@ -147,6 +148,6 @@ def test_csv_reader_usecols(tmp_path): csv_path = str(tmp_path / "usecols_data.csv") df.to_csv(csv_path, index=False) ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5) - ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"]) + ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype) dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False) From 1db05c9d889d04df113986eeee0356778ce8b003 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 11:45:54 -0600 Subject: [PATCH 47/72] Use Java classloader to find test resources (#9760) Updates the Java tests to use the classloader to locate test files rather than reaching directly into the source directory. 
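The lookup pattern, shown as a standalone sketch (the class and resource names here are illustrative; the actual helper added below is `TestUtils.getResourceAsFile`):

```java
import java.io.File;
import java.net.URL;

public class ResourceLookupExample {
  public static void main(String[] args) throws Exception {
    // Resolve a file from the test classpath (e.g. copied from src/test/resources)
    // instead of hard-coding a path into the source tree.
    URL url = ResourceLookupExample.class.getClassLoader().getResource("simple.csv");
    if (url == null) {
      throw new IllegalArgumentException("Unable to locate resource: simple.csv");
    }
    File file = new File(url.toURI());
    System.out.println("Resolved test resource: " + file.getAbsolutePath());
  }
}
```

Resolving through the classloader keeps the tests working when they run against a packaged build where the resources no longer sit at their original source-tree locations.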
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9760 --- .../src/test/java/ai/rapids/cudf/TableTest.java | 14 +++++++------- .../src/test/java/ai/rapids/cudf/TestUtils.java | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4512a08430c..b4247e9bb7c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -70,11 +70,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class TableTest extends CudfTestBase { - private static final File TEST_PARQUET_FILE = new File("src/test/resources/acq.parquet"); - private static final File TEST_ORC_FILE = new File("src/test/resources/TestOrcFile.orc"); - private static final File TEST_ORC_TIMESTAMP_DATE_FILE = new File( - "src/test/resources/timestamp-date-test.orc"); - private static final File TEST_DECIMAL_PARQUET_FILE = new File("src/test/resources/decimal.parquet"); + private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet"); + private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc"); + private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); + private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); + private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -548,7 +548,7 @@ void testReadCSVPrune() { .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.2, 119.8) .build(); - Table table = Table.readCSV(schema, opts, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, opts, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } @@ -675,7 +675,7 @@ void testReadCSV() { .column(120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L) .column("one", "two", "three", "four", "five", "six", "seven\ud801\uddb8", "eight\uBF68", "nine\u03E8", "ten") .build(); - Table table = Table.readCSV(schema, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } diff --git a/java/src/test/java/ai/rapids/cudf/TestUtils.java b/java/src/test/java/ai/rapids/cudf/TestUtils.java index 5a799c666c2..a1acab5883b 100644 --- a/java/src/test/java/ai/rapids/cudf/TestUtils.java +++ b/java/src/test/java/ai/rapids/cudf/TestUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,9 @@

 package ai.rapids.cudf;

+import java.io.File;
+import java.net.URISyntaxException;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
@@ -211,4 +214,16 @@ static Double[] getDoubles(final long seed, final int size, int specialValues) {
 });
 return result;
 }
+
+  public static File getResourceAsFile(String resourceName) {
+    URL url = TestUtils.class.getClassLoader().getResource(resourceName);
+    if (url == null) {
+      throw new IllegalArgumentException("Unable to locate resource: " + resourceName);
+    }
+    try {
+      return new File(url.toURI());
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(e);
+    }
+  }
 }

From 1697f63b9e6e80695cb157f479fada72d053fa1a Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 30 Nov 2021 23:39:13 +0530
Subject: [PATCH 48/72] Run compute-sanitizer in nightly build (#9641)

Addresses part of https://github.com/rapidsai/cudf/issues/904
- This PR enables running `compute-sanitizer --tool memcheck` on the libcudf unit tests when the env variable `COMPUTE_SANITIZER_ENABLE=true` is set. This env variable will be set only in nightly builds of cudf. (To be enabled in PR https://github.com/rapidsai/gpuci-scripts/pull/675)
- This PR also adds a script to parse the compute-sanitizer log into a JUnit XML file that Jenkins can process. Only failures are reported; if there are no errors, no tests appear under the memcheck results.

Note: only `memcheck` is enabled now. Other compute-sanitizer checks can be enabled later when required.

Authors:
- Karthikeyan (https://github.com/karthikeyann)

Approvers:
- MithunR (https://github.com/mythrocks)
- AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/9641
---
ci/gpu/build.sh | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 664e774c68a..8f83c169330 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
##############################################
# cuDF GPU build and test script for CI      #
##############################################
@@ -176,6 +176,28 @@ else
 ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
 done

+  ################################################################################
+  # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds)
+  ################################################################################
+  if [[ "$BUILD_MODE" == "branch" && "$BUILD_TYPE" == "gpu" ]]; then
+    if [[ "$COMPUTE_SANITIZER_ENABLE" == "true" ]]; then
+      gpuci_logger "Memcheck on GoogleTests with rmm_mode=cuda"
+      export GTEST_CUDF_RMM_MODE=cuda
+      COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
+      mkdir -p "$WORKSPACE/test-results/"
+      for gt in gtests/*; do
+        test_name=$(basename ${gt})
+        if [[ "$test_name" == "ERROR_TEST" ]]; then
+          continue
+        fi
+        echo "Running GoogleTest $test_name"
+        ${COMPUTE_SANITIZER_CMD} ${gt} | tee "$WORKSPACE/test-results/${test_name}.cs.log"
+      done
+      unset GTEST_CUDF_RMM_MODE
+      # test-results/*.cs.log are processed in gpuci
+    fi
+  fi
+
 CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"`
 CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension
 CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install

From 69d576543b5414372f36d02a189a7217d3bb8006 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 30 Nov 2021 14:40:34 -0500
Subject: [PATCH 49/72] Update check for inf/nan strings in libcudf float conversion to ignore case (#9694)

Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126

Add support to ignore case for strings `INF`, `INFINITY` and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float` for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof

Also, remove the expensive `replace` call in cudf's Python layer before calling this conversion.

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Robert Maynard (https://github.com/robertmaynard)
- Nghia Truong (https://github.com/ttnghia)
- Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/9694
---
cpp/include/cudf/strings/string.cuh | 64 +++++++++++++---
cpp/src/strings/convert/convert_floats.cu | 13 ++--
cpp/tests/strings/floats_tests.cpp | 51 ++++---------
.../java/ai/rapids/cudf/ColumnVectorTest.java | 23 +++---
python/cudf/cudf/core/column/string.py | 73 -------------------
5 files changed, 85 insertions(+), 139 deletions(-)

diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh
index 82da5ad8f10..d85d19d7f10 100644
--- a/cpp/include/cudf/strings/string.cuh
+++ b/cpp/include/cudf/strings/string.cuh
@@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str)
 thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
}

+/**
+ * @brief Returns true if input contains the not-a-number string.
+ *
+ * The following are valid for this function: "NAN" and "NaN"
+ * @param d_str input string
+ * @return true if input is a valid NaN string.
+ */
+inline __device__ bool is_nan_str(string_view const& d_str)
+{
+  auto const ptr = d_str.data();
+  return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') &&
+         (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n');
+}
+
+/**
+ * @brief Returns true if input contains the infinity string.
+ *
+ * The following are valid for this function: "INF", "INFINITY", and "Inf"
+ * @param d_str input string
+ * @return true if input is a valid Inf string.
+ */
+inline __device__ bool is_inf_str(string_view const& d_str)
+{
+  auto const ptr  = d_str.data();
+  auto const size = d_str.size_bytes();
+
+  if (size != 3 && size != 8) return false;
+
+  auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') &&
+                            (ptr[2] == 'F' || ptr[2] == 'f');
+
+  return prefix_valid &&
+         ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') &&
+                          (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') &&
+                          (ptr[7] == 'Y' || ptr[7] == 'y')));
+}
+
 /**
 * @brief Returns `true` if all characters in the string
 * are valid for conversion to a float type.
@@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str)
 * An empty string returns `false`.
 * No bounds checking is performed to verify if the value would fit
 * within a specific float type.
- * The following strings are also allowed "NaN", "Inf" and, "-Inf"
- * and will return true.
+ * The following strings are also allowed and will return true:
+ * "NaN", "NAN", "Inf", "INF", "INFINITY"
 *
 * @param d_str String to check.
 * @return true if string has valid float characters
 */
inline __device__ bool is_float(string_view const& d_str)
{
 if (d_str.empty()) return false;
-  // strings allowed by the converter
-  if (d_str.compare("NaN", 3) == 0) return true;
-  if (d_str.compare("Inf", 3) == 0) return true;
-  if (d_str.compare("-Inf", 4) == 0) return true;
 bool decimal_found = false;
 bool exponent_found = false;
 size_type bytes = d_str.size_bytes();
 const char* data = d_str.data();
 // sign character allowed at the beginning of the string
-  size_type chidx = (*data == '-' || *data == '+') ? 1 : 0;
-  bool result = chidx < bytes;
+  size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0;
+
+  bool result = ch_idx < bytes;
+  // check for nan and infinity strings
+  if (result && data[ch_idx] > '9') {
+    auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx);
+    if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true;
+  }
+
 // check for float chars [0-9] and a single decimal '.'
// and scientific notation [eE][+-][0-9] - for (; chidx < bytes; ++chidx) { - auto chr = data[chidx]; + for (; ch_idx < bytes; ++ch_idx) { + auto chr = data[ch_idx]; if (chr >= '0' && chr <= '9') continue; if (!decimal_found && chr == '.') { decimal_found = true; // no more decimals continue; } if (!exponent_found && (chr == 'e' || chr == 'E')) { - if (chidx + 1 < bytes) chr = data[chidx + 1]; - if (chr == '-' || chr == '+') ++chidx; + if (ch_idx + 1 < bytes) chr = data[ch_idx + 1]; + if (chr == '-' || chr == '+') ++ch_idx; decimal_found = true; // no decimal allowed in exponent exponent_found = true; // no more exponents continue; diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 366d4fe7d42..70b5f528213 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -45,7 +45,7 @@ namespace { * @brief This function converts the given string into a * floating point double value. * - * This will also map strings containing "NaN", "Inf" and "-Inf" + * This will also map strings containing "NaN", "Inf", etc. * to the appropriate float values. * * This function will also handle scientific notation format. @@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str) const char* in_ptr = d_str.data(); const char* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; - // special strings - if (d_str.compare("NaN", 3) == 0) return std::numeric_limits::quiet_NaN(); - if (d_str.compare("Inf", 3) == 0) return std::numeric_limits::infinity(); - if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits::infinity(); double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { sign = (*in_ptr == '-' ? -1 : 1); ++in_ptr; } + // special strings: NaN, Inf + if ((in_ptr < end) && *in_ptr > '9') { + auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); + if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + } + // Parse and store the mantissa as much as we can, // until we are about to exceed the limit of uint64_t constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 126bffa1e49..e6f4f6bb8d9 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"}); results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(StringsConvertTest, ToFloats32) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.2e4", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782123456789", - "12e+309", - "3.4028236E38"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", + "-0.12", ".25", "-.002", "", + "-0.0", "1.2e4", "NAN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782123456789", + "12e+309", "3.4028236E38", "INF", 
"Infinity"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32) TEST_F(StringsConvertTest, ToFloats64) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.28e256", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782", - "12e+309", - "1.7976931348623159E308"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", + "-Inf", "-INFINITY"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index a582541a0d4..cf602c26717 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4919,11 +4919,12 @@ void testIsFloat() { try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); ColumnVector floats = floatStringCV.asFloats(); - ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY, - Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, - -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f, + Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, + 1.2e-24f, 0f, 0f, null, 423f); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isFloat); assertColumnsAreEqual(expectedFloats, floats); } @@ -4944,12 +4945,12 @@ void testIsDouble() { try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings); ColumnVector isDouble = doubleStringCV.isFloat(); ColumnVector doubles = doubleStringCV.asDoubles(); - ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d, - Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE, - Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d, - 0d, null, 423d); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN, + Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, + -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, + 1.2e-234d, 0d, 0d, null, 423d); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isDouble); assertColumnsAreEqual(expectedDoubles, doubles); } diff --git a/python/cudf/cudf/core/column/string.py 
b/python/cudf/cudf/core/column/string.py index a167383c65c..2a91abc5701 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn): cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } -_NAN_INF_VARIATIONS = [ - "nan", - "NAN", - "Nan", - "naN", - "nAN", - "NAn", - "nAn", - "-inf", - "-INF", - "-InF", - "-inF", - "-iNF", - "-INf", - "-iNf", - "+inf", - "+INF", - "+InF", - "+inF", - "+iNF", - "+INf", - "+Inf", - "+iNf", - "inf", - "INF", - "InF", - "inF", - "iNF", - "INf", - "iNf", -] -_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [ - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", -] - def _is_supported_regex_flags(flags): return flags == 0 or ( @@ -5309,16 +5246,6 @@ def as_numerical_column( "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - # TODO: Replace this `replace` call with a - # case-insensitive method once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/5217 - old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS) - new_values = cudf.core.column.as_column( - _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS - ) - string_col = libcudf.replace.replace( - string_col, old_values, new_values - ) if not libstrings.is_float(string_col).all(): raise ValueError( "Could not convert strings to float " From 00a8845780ae9289f483f1113e5c62d4acd7dfe7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 14:02:24 -0600 Subject: [PATCH 50/72] Refactor TableTest assertion methods to a separate utility class (#9762) TableTest has a number of dependencies, e.g.: Parquet, Hadoop, etc., that make it less ideal to be used in an external project. This moves the column and table assertion methods to a separate AssertUtils utility class that avoids the extra dependencies. 
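A minimal sketch of what the extraction enables, assuming the cudf test artifact containing `AssertUtils` is on the classpath:

```java
import ai.rapids.cudf.AssertUtils;
import ai.rapids.cudf.ColumnVector;

public class AssertUtilsExample {
  public static void main(String[] args) {
    try (ColumnVector expected = ColumnVector.fromInts(1, 2, 3);
         ColumnVector actual = ColumnVector.fromInts(1, 2, 3)) {
      // Compares type, row count, null count, and per-row values without
      // pulling in TableTest's Parquet/Hadoop dependencies.
      AssertUtils.assertColumnsAreEqual(expected, actual);
    }
  }
}
```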
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9762 --- .../ai/rapids/cudf/ArrowColumnVectorTest.java | 3 +- .../test/java/ai/rapids/cudf/AssertUtils.java | 272 ++++++++++++++++++ .../java/ai/rapids/cudf/BinaryOpTest.java | 2 +- .../ai/rapids/cudf/ByteColumnVectorTest.java | 6 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +-- .../test/java/ai/rapids/cudf/IfElseTest.java | 2 +- .../ai/rapids/cudf/IntColumnVectorTest.java | 4 +- .../test/java/ai/rapids/cudf/ScalarTest.java | 2 +- .../test/java/ai/rapids/cudf/TableTest.java | 251 +--------------- .../cudf/TimestampColumnVectorTest.java | 2 +- .../test/java/ai/rapids/cudf/UnaryOpTest.java | 2 +- .../cudf/ast/CompiledExpressionTest.java | 2 +- 12 files changed, 309 insertions(+), 277 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/AssertUtils.java diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java index d5d4059d18d..2a11b24b3a8 100644 --- a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import ai.rapids.cudf.HostColumnVector.BasicType; import ai.rapids.cudf.HostColumnVector.ListType; import ai.rapids.cudf.HostColumnVector.StructType; @@ -40,7 +39,7 @@ import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/AssertUtils.java b/java/src/test/java/ai/rapids/cudf/AssertUtils.java new file mode 100644 index 00000000000..184e7dd0c57 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/AssertUtils.java @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** Utility methods for asserting in unit tests */ +public class AssertUtils { + + /** + * Checks and asserts that passed in columns match + * @param expect The expected result column + * @param cv The input column + */ + public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { + assertColumnsAreEqual(expect, cv, "unnamed"); + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + */ + public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The expected result host column + * @param cv The input host column + * @param colName The name of the host column + */ + public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param cv The input Struct column + */ + public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param rowOffset The row number to look from + * @param length The number of rows to consider + * @param cv The input Struct column + * @param colName The name of the column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The expected result host column + * @param rowOffset start row index + * @param length number of rows from 
starting offset + * @param cv The input host column + * @param colName The name of the host column + * @param enableNullCountCheck Whether to check for nulls in the host column + */ + public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); + assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); + assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); + if (enableNullCountCheck) { + assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); + } else { + // TODO add in a proper check when null counts are supported by serializing a partitioned column + } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } + DType type = expected.getType(); + for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { + long tableRow = expectedRow - rowOffset; + assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), + "NULL for Column " + colName + " Row " + tableRow); + if (!expected.isNull(expectedRow)) { + switch (type.typeId) { + case BOOL8: // fall through + case INT8: // fall through + case UINT8: + assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT16: // fall through + case UINT16: + assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT32: // fall through + case UINT32: // fall through + case TIMESTAMP_DAYS: + case DURATION_DAYS: + case DECIMAL32: + assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT64: // fall through + case UINT64: // fall through + case DURATION_MICROSECONDS: // fall through + case DURATION_MILLISECONDS: // fall through + case DURATION_NANOSECONDS: // fall through + case DURATION_SECONDS: // fall through + case TIMESTAMP_MICROSECONDS: // fall through + case TIMESTAMP_MILLISECONDS: // fall through + case TIMESTAMP_NANOSECONDS: // fall through + case TIMESTAMP_SECONDS: + case DECIMAL64: + assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case DECIMAL128: + assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case FLOAT32: + CudfTestBase.assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case FLOAT64: + CudfTestBase.assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case STRING: + assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case LIST: + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - + expectedOffsets.getInt(expectedRow * 4); + int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - + cvOffsets.getInt(tableRow * 4); + 
assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + + colName + " Row " + tableRow); + break; + case STRUCT: + // parent column only has validity which was checked above + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + if (type.isNestedType()) { + switch (type.typeId) { + case LIST: + int expectedChildRowOffset = 0; + int numChildRows = 0; + if (length > 0) { + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); + numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - + expectedChildRowOffset; + } + assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, + numChildRows, cv.getNestedChildren().get(0), colName + " list child", + enableNullCountCheck, enableNullabilityCheck); + break; + case STRUCT: + List expectedChildren = expected.getNestedChildren(); + List cvChildren = cv.getNestedChildren(); + for (int i = 0; i < expectedChildren.size(); i++) { + HostColumnVectorCore expectedChild = expectedChildren.get(i); + HostColumnVectorCore cvChild = cvChildren.get(i); + String childName = colName + " child " + i; + assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); + assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, + colName, enableNullCountCheck, enableNullabilityCheck); + } + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + /** + * Checks and asserts that the two tables from a given rowindex match based on a provided schema. + * @param expected the expected result table + * @param rowOffset the row number to start checking from + * @param length the number of rows to check + * @param table the input table to compare against expected + * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask + */ + public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, + boolean enableNullCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); + assertEquals(length, table.getRowCount(), "ROW COUNT"); + for (int col = 0; col < expected.getNumberOfColumns(); col++) { + ColumnVector expect = expected.getColumn(col); + ColumnVector cv = table.getColumn(col); + String name = String.valueOf(col); + if (rowOffset != 0 || length != expected.getRowCount()) { + name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); + } + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that the two tables match + * @param expected the expected result table + * @param table the input table to compare against expected + */ + public static void assertTablesAreEqual(Table expected, Table table) { + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); + } + + public static void assertTableTypes(DType[] expectedTypes, Table t) { + int len = t.getNumberOfColumns(); + assertEquals(expectedTypes.length, len); + for (int i = 0; i < len; i++) { + ColumnVector vec = t.getColumn(i); + DType type = vec.getType(); + assertEquals(expectedTypes[i], type, "Types don't match at " + i); + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java 
b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index 894861b8c44..0ca997d3c80 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -27,7 +27,7 @@ import java.util.Arrays; import java.util.stream.IntStream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static ai.rapids.cudf.TestUtils.*; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index 878fa7e4516..a26dbec4907 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -127,9 +127,9 @@ public void testCastToByte() { ColumnVector expected1 = ColumnVector.fromBytes((byte)4, (byte)3, (byte)8); ColumnVector expected2 = ColumnVector.fromBytes((byte)100); ColumnVector expected3 = ColumnVector.fromBytes((byte)-23)) { - TableTest.assertColumnsAreEqual(expected1, byteColumnVector1); - TableTest.assertColumnsAreEqual(expected2, byteColumnVector2); - TableTest.assertColumnsAreEqual(expected3, byteColumnVector3); + AssertUtils.assertColumnsAreEqual(expected1, byteColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, byteColumnVector2); + AssertUtils.assertColumnsAreEqual(expected3, byteColumnVector3); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index cf602c26717..fa9052029cc 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -34,8 +34,10 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertStructColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.QuantileMethod.*; -import static ai.rapids.cudf.TableTest.*; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; @@ -86,8 +88,8 @@ void testTransformVector() { ColumnVector cv1 = cv.transform(ptx, true); ColumnVector cv2 = cv.transform(cuda, false); ColumnVector expected = ColumnVector.fromBoxedInts(2*2-2, 3*3-3, null, 4*4-4)) { - TableTest.assertColumnsAreEqual(expected, cv1); - TableTest.assertColumnsAreEqual(expected, cv2); + assertColumnsAreEqual(expected, cv1); + assertColumnsAreEqual(expected, cv2); } } @@ -252,7 +254,7 @@ void testStringCreation() { try (ColumnVector cv = ColumnVector.fromStrings("d", "sd", "sde", null, "END"); HostColumnVector host = cv.copyToHost(); ColumnVector backAgain = host.copyToDevice()) { - TableTest.assertColumnsAreEqual(cv, backAgain); + assertColumnsAreEqual(cv, backAgain); } } @@ -265,7 +267,7 @@ void testUTF8StringCreation() { null, "END".getBytes(StandardCharsets.UTF_8)); ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) { - TableTest.assertColumnsAreEqual(expected, cv); + assertColumnsAreEqual(expected, cv); } } @@ -299,7 +301,7 @@ void testConcatNoNulls() { ColumnVector v2 = ColumnVector.fromInts(8, 9); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromInts(1, 2, 3, 4, 5, 6, 7, 8, 9)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ 
-310,7 +312,7 @@ void testConcatWithNulls() { ColumnVector v2 = ColumnVector.fromBoxedDoubles(null, 9.0); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromBoxedDoubles(1., 2., 3., 4., 5., 6., 7., null, 9.)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ -1882,13 +1884,13 @@ void testSubvector() { try (ColumnVector vec = ColumnVector.fromBoxedInts(1, 2, 3, null, 5); ColumnVector expected = ColumnVector.fromBoxedInts(2, 3, null, 5); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } try (ColumnVector vec = ColumnVector.fromStrings("1", "2", "3", null, "5"); ColumnVector expected = ColumnVector.fromStrings("2", "3", null, "5"); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } } @@ -2014,7 +2016,7 @@ void testTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.strip(); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2024,7 +2026,7 @@ void testTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.strip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23", null, "23", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2033,7 +2035,7 @@ void testLeftTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.lstrip(); ColumnVector expected = ColumnVector.fromStrings("123", "123 ", null, "123 ", "123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2043,7 +2045,7 @@ void testLeftTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.lstrip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23 ", null, "231", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2052,7 +2054,7 @@ void testRightTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.rstrip(); ColumnVector expected = ColumnVector.fromStrings(" 123", "123", null, " 123", "\t\t123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2062,7 +2064,7 @@ void testRightTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.rstrip(one); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2108,7 +2110,7 @@ void testCountElements() { Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4)); ColumnVector lengths = cv.countElements(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2117,7 +2119,7 @@ void testStringLengths() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234"); ColumnVector lengths = 
cv.getCharLengths(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2126,7 +2128,7 @@ void testGetByteCount() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", "123", null, "1234"); ColumnVector byteLengthVector = cv.getByteCount(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 3, null, 4)) { - TableTest.assertColumnsAreEqual(expected, byteLengthVector); + assertColumnsAreEqual(expected, byteLengthVector); } } diff --git a/java/src/test/java/ai/rapids/cudf/IfElseTest.java b/java/src/test/java/ai/rapids/cudf/IfElseTest.java index 86ddcc23416..a078befdf40 100644 --- a/java/src/test/java/ai/rapids/cudf/IfElseTest.java +++ b/java/src/test/java/ai/rapids/cudf/IfElseTest.java @@ -25,7 +25,7 @@ import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertThrows; public class IfElseTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index dd03c4de69e..2fb8164534b 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -117,8 +117,8 @@ public void testCastToInt() { ColumnVector expected1 = ColumnVector.fromInts(4, 3, 8); ColumnVector intColumnVector2 = shortColumnVector.asInts(); ColumnVector expected2 = ColumnVector.fromInts(100)) { - TableTest.assertColumnsAreEqual(expected1, intColumnVector1); - TableTest.assertColumnsAreEqual(expected2, intColumnVector2); + AssertUtils.assertColumnsAreEqual(expected1, intColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, intColumnVector2); } } diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 0889363c2d0..86c340bb321 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -29,7 +29,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.*; public class ScalarTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b4247e9bb7c..fa221e19387 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -57,6 +57,11 @@ import java.util.stream.Collectors; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTableTypes; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -94,242 +99,6 @@ public class TableTest extends CudfTestBase { "8|118.2|128\n" + "9|119.8|129").getBytes(StandardCharsets.UTF_8); - /** - * Checks and asserts that passed in columns match 
- * @param expect The expected result column - * @param cv The input column - */ - public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { - assertColumnsAreEqual(expect, cv, "unnamed"); - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - */ - public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param cv The input host column - * @param colName The name of the host column - */ - public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param cv The input Struct column - */ - public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param rowOffset The row number to look from - * @param length The number of rows to consider - * @param cv The input Struct column - * @param colName The name of the column - * @param enableNullCountCheck Whether to check for nulls in the Struct column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param rowOffset start row index - * @param length number of rows from starting offset - * @param cv The input host column - * @param colName The name of the host column - * @param enableNullCountCheck Whether to check for nulls in the host column - */ - public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, 
boolean enableNullCountCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); - assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); - assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCountCheck) { - assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); - } else { - // TODO add in a proper check when null counts are supported by serializing a partitioned column - } - if (enableNullabilityCheck) { - assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); - } - DType type = expected.getType(); - for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { - long tableRow = expectedRow - rowOffset; - assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), - "NULL for Column " + colName + " Row " + tableRow); - if (!expected.isNull(expectedRow)) { - switch (type.typeId) { - case BOOL8: // fall through - case INT8: // fall through - case UINT8: - assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT16: // fall through - case UINT16: - assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT32: // fall through - case UINT32: // fall through - case TIMESTAMP_DAYS: - case DURATION_DAYS: - case DECIMAL32: - assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT64: // fall through - case UINT64: // fall through - case DURATION_MICROSECONDS: // fall through - case DURATION_MILLISECONDS: // fall through - case DURATION_NANOSECONDS: // fall through - case DURATION_SECONDS: // fall through - case TIMESTAMP_MICROSECONDS: // fall through - case TIMESTAMP_MILLISECONDS: // fall through - case TIMESTAMP_NANOSECONDS: // fall through - case TIMESTAMP_SECONDS: - case DECIMAL64: - assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case DECIMAL128: - assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case FLOAT32: - assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case FLOAT64: - assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case STRING: - assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case LIST: - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - - expectedOffsets.getInt(expectedRow * 4); - int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - - cvOffsets.getInt(tableRow * 4); - assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + - colName + " Row " + tableRow); - break; - case STRUCT: - // parent column only has validity which was checked above - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - if (type.isNestedType()) { - switch (type.typeId) { - case LIST: - int 
expectedChildRowOffset = 0; - int numChildRows = 0; - if (length > 0) { - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); - numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - - expectedChildRowOffset; - } - assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, - numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCountCheck, enableNullabilityCheck); - break; - case STRUCT: - List expectedChildren = expected.getNestedChildren(); - List cvChildren = cv.getNestedChildren(); - for (int i = 0; i < expectedChildren.size(); i++) { - HostColumnVectorCore expectedChild = expectedChildren.get(i); - HostColumnVectorCore cvChild = cvChildren.get(i); - String childName = colName + " child " + i; - assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); - assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCountCheck, enableNullabilityCheck); - } - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - /** - * Checks and asserts that the two tables from a given rowindex match based on a provided schema. - * @param expected the expected result table - * @param rowOffset the row number to start checking from - * @param length the number of rows to check - * @param table the input table to compare against expected - * @param enableNullCheck whether to check for nulls or not - * @param enableNullabilityCheck whether the table have a validity mask - */ - public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); - assertEquals(length, table.getRowCount(), "ROW COUNT"); - for (int col = 0; col < expected.getNumberOfColumns(); col++) { - ColumnVector expect = expected.getColumn(col); - ColumnVector cv = table.getColumn(col); - String name = String.valueOf(col); - if (rowOffset != 0 || length != expected.getRowCount()) { - name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); - } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that the two tables match - * @param expected the expected result table - * @param table the input table to compare against expected - */ - public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); - } - void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { assertEquals(expectedTable.length, table.getNumberOfColumns()); int numCols = table.getNumberOfColumns(); @@ -358,16 +127,6 @@ void assertTablesHaveSameValues(HashMap[] expectedTable, Table } } - public static void assertTableTypes(DType[] expectedTypes, Table t) { - int len = t.getNumberOfColumns(); - assertEquals(expectedTypes.length, len); - for (int i = 0; i < len; i++) { - ColumnVector vec = t.getColumn(i); - DType type = vec.getType(); - assertEquals(expectedTypes[i], type, "Types don't match at " + i); - } - } - @Test void testMergeSimple() { try (Table table1 = new Table.TestBuilder() diff --git a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java 
b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java index 8bf1370a0f7..9a929cec98d 100644 --- a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java @@ -22,7 +22,7 @@ import java.util.function.Function; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; public class TimestampColumnVectorTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java index 76970e8bf76..7fcb7cbd85b 100644 --- a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java @@ -22,7 +22,7 @@ import ai.rapids.cudf.HostColumnVector.Builder; import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class UnaryOpTest extends CudfTestBase { private static final Double[] DOUBLES_1 = new Double[]{1.0, 10.0, -100.1, 5.3, 50.0, 100.0, null, Double.NaN, Double.POSITIVE_INFINITY, 1/9.0, Double.NEGATIVE_INFINITY, 500.0, -500.0}; diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java index 2fb6792b409..e50da0a4d4d 100644 --- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java @@ -36,7 +36,7 @@ import java.util.function.Function; import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class CompiledExpressionTest extends CudfTestBase { @Test From 554ac817498e64ba1c7ef054873fab7dc658d25c Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:50:56 -0600 Subject: [PATCH 51/72] Load native dependencies when Java ColumnView is loaded (#9800) The Java ColumnView class has native methods but does not ensure the corresponding native libraries that implement those methods are loaded. This adds a static code block to the ColumnView class to load the native libraries when the ColumnView class is loaded. 
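The fix relies on a JVM guarantee: a class's static initializer runs exactly once, when the class is first loaded, before any of its methods (including native ones) can be invoked. A minimal sketch of the pattern (the class and native method below are hypothetical, and `System.loadLibrary` stands in for `NativeDepsLoader.loadNativeDeps()`):

```java
// Hypothetical class illustrating the static-initializer pattern used by the fix.
public class NativeBackedView {
  static {
    // Runs once, when the class is first loaded, so every native method on
    // this class can assume the library is already available.
    System.loadLibrary("cudfjni");
  }

  // Without the static block, invoking this before some other class had
  // loaded the library would throw java.lang.UnsatisfiedLinkError.
  private static native long viewHandleOf(long columnHandle);
}
```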
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kuhu Shukla (https://github.com/kuhushukla) URL: https://github.com/rapidsai/cudf/pull/9800 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 729444f460c..6d0d24baf99 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -30,6 +30,10 @@ */ public class ColumnView implements AutoCloseable, BinaryOperable { + static { + NativeDepsLoader.loadNativeDeps(); + } + public static final long UNKNOWN_NULL_COUNT = -1; protected long viewHandle; From 20d6723fcb5eaffb6398e5cf6c14de8d774ca917 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:51:12 -0600 Subject: [PATCH 52/72] Copy Java native dependencies directly into classpath (#9787) Eliminates the intermediate copy of the native libraries for the Java bindings into target/native-deps, instead copying libcudf.so and libcudfjni.so directly into the classpath resources. This eliminates the need to search target/native-deps at runtime when the native libraries are not in the classpath in the case of running tests before the jar is built. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9787 --- java/pom.xml | 7 ++----- .../main/java/ai/rapids/cudf/NativeDepsLoader.java | 11 ++--------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 87d43ec1272..c5a3bc64fad 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -297,9 +297,6 @@ LICENSE - - ${project.build.directory}/native-deps/ - @@ -499,14 +496,14 @@ copy-native-libs - validate + generate-resources copy-resources true ${skipNativeCopy} - ${project.build.directory}/native-deps/${os.arch}/${os.name} + ${project.build.outputDirectory}/${os.arch}/${os.name} ${native.build.path} diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java index 8780ecc3aa3..9663fbcafb4 100755 --- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java +++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java @@ -81,9 +81,7 @@ public static synchronized void loadNativeDeps() { /** * Allows other libraries to reuse the same native deps loading logic. Libraries will be searched - * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. It - * will also look for the libraries under ./target/native-deps/${os.arch}/${os.name} to help - * facilitate testing while building. + * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. *
* Because this just loads the libraries and loading the libraries themselves needs to be a * singleton operation it is recommended that any library using this provide their own wrapper @@ -203,12 +201,7 @@ private static File createFile(String os, String arch, String baseName) throws I File loc; URL resource = loader.getResource(path); if (resource == null) { - // It looks like we are not running from the jar, or there are issues with the jar - File f = new File("./target/native-deps/" + path); - if (!f.exists()) { - throw new FileNotFoundException("Could not locate native dependency " + path); - } - resource = f.toURI().toURL(); + throw new FileNotFoundException("Could not locate native dependency " + path); } try (InputStream in = resource.openStream()) { loc = File.createTempFile(baseName, ".so"); From 991136c78be01d4de20387086a185cfd5a21713b Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 30 Nov 2021 15:31:53 -0800 Subject: [PATCH 53/72] Add Pearson correlation for sort groupby (python) (#9166) Fixes: https://github.com/rapidsai/cudf/issues/8691 Authors: - Sheilah Kirui (https://github.com/skirui-source) - Karthikeyan (https://github.com/karthikeyann) - Ashwin Srinath (https://github.com/shwina) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Michael Wang (https://github.com/isVoid) - Mayank Anand (https://github.com/mayankanand007) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9166 --- docs/cudf/source/api_docs/groupby.rst | 1 + docs/cudf/source/basics/groupby.rst | 10 ++ python/cudf/cudf/_lib/aggregation.pyx | 55 +++++++++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 15 ++- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/core/groupby/groupby.py | 121 +++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 115 ++++++++++++++++++++ 7 files changed, 314 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index cf08d1d791b..575d7442cdf 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -59,6 +59,7 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var + GroupBy.corr The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 04c4d42fa2a..f3269768025 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -127,6 +127,13 @@ Aggregations on groups is supported via the ``agg`` method: a 1 4 1 2.0 2 5 2 4.5 + >>> df.groupby("a").corr(method="pearson") + b c + a + 1 b 1.000000 0.866025 + c 0.866025 1.000000 + 2 b 1.000000 1.000000 + c 1.000000 1.000000 The following table summarizes the available aggregations and the types that support them: @@ -169,6 +176,9 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | unique | ✅ | ✅ | ✅ | ✅ | | | | | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | corr | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + 
GroupBy apply ------------- diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4f703724cef..68f7101b6ee 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,6 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. -from enum import Enum +from enum import Enum, IntEnum import numba import numpy as np @@ -30,6 +30,7 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type import cudf @@ -57,6 +58,22 @@ class AggregationKind(Enum): UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA + CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + + +class CorrelationType(IntEnum): + PEARSON = ( + + libcudf_aggregation.correlation_type.PEARSON + ) + KENDALL = ( + + libcudf_aggregation.correlation_type.KENDALL + ) + SPEARMAN = ( + + libcudf_aggregation.correlation_type.SPEARMAN + ) cdef class Aggregation: @@ -321,6 +338,22 @@ cdef class Aggregation: )) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef Aggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation.make_correlation_aggregation[aggregation]( + c_method, min_periods + )) + return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -692,6 +725,24 @@ cdef class GroupbyAggregation: ) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef GroupbyAggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_correlation_aggregation[groupby_aggregation]( + c_method, min_periods + )) + return agg + + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..3982b4fecbb 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) +ctypedef int32_t underlying_type_t_correlation_type cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -38,6 +39,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: @@ -53,6 +56,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' + ctypedef enum correlation_type: + PEARSON 'cudf::correlation_type::PEARSON' + KENDALL 'cudf::correlation_type::KENDALL' + SPEARMAN 'cudf::correlation_type::SPEARMAN' + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -106,3 +114,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]( + correlation_type type, size_type min_periods) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0968d22d465..314542c9549 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = set() +_STRUCT_AGGS = {"CORRELATION"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7f9f61ed3fd..f1d622362e2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import collections +import itertools import pickle import warnings @@ -13,7 +14,8 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import arange +from cudf.core.column.column import arange, as_column +from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -69,6 +71,8 @@ def __init__( """ self.obj = obj self._as_index = as_index + self._by = by + self._level = level self._sort = sort self._dropna = dropna @@ -777,6 +781,121 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") + def corr(self, method="pearson", min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ---------- + DataFrame + Correlation matrix. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... 
"id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ + + if not method.lower() in ("pearson",): + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + + # create expanded dataframe consisting all combinations of the + # struct columns-pairs to be correlated + # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + _cols = self.grouping.values.columns.tolist() + len_cols = len(_cols) + + new_df_data = {} + for x, y in itertools.combinations_with_replacement(_cols, 2): + new_df_data[(x, y)] = cudf.DataFrame._from_data( + {"x": self.obj._data[x], "y": self.obj._data[y]} + ).to_struct() + new_gb = cudf.DataFrame._from_data(new_df_data).groupby( + by=self.grouping.keys + ) + + try: + gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Correlation accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(_cols) + for i, x in enumerate(_cols) + ] + cols_split = [ + cols_list[i : i + len_cols] + for i in range(0, len(cols_list), len_cols) + ] + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: gb_corr.loc[:, i].interleave_columns() + for i, x in zip(cols_split, _cols) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_corr.index.repeat(len_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_corr): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + cudf.Series(_cols).tile(len(gb_corr.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d07caef11d5..d555b5c4033 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8924,3 +8924,118 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id", "val1", "val2"], + ), + ( + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": [None, None, None, None, None, None], + }, + ["id"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, None, 2, 9, None], + }, + ["id"], + ), + ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), + ], +) +@pytest.mark.parametrize( + "min_per", [0, 1, 2, 3, 4], +) +def test_pearson_corr_passing(data, gkey, min_per): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("method", ["kendall", "spearman"]) +def test_pearson_corr_unsupported_methods(method): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Only pearson correlation is currently supported", + ): + gdf.groupby("id").corr(method) + + +def test_pearson_corr_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").corr("pearson") + expected = pdf.groupby("id").corr("pearson") + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ], +) +@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) +def test_pearson_corr_invalid_column_types(data, gkey): + with pytest.raises( + TypeError, match="Correlation accepts only numerical column-pairs", + ): + cudf.DataFrame(data).groupby(gkey).corr("pearson") + + +def test_pearson_corr_multiindex_dataframe(): + gdf = cudf.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} + ).set_index(["a", "b"]) + + actual = gdf.groupby(level="a").corr("pearson") + expected = gdf.to_pandas().groupby(level="a").corr("pearson") + + assert_eq(expected, actual) From 1eabcb73b7df235de9985e207e2087af9dfb0e14 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 1 Dec 2021 17:03:36 +0530 Subject: [PATCH 54/72] Fix some doxygen warnings and add missing documentation (#9770) fix to ignore `__device__ void` return type warnings. 
add missing documentation on some functions. Correct doxygen doc-style comments. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9770 --- cpp/doxygen/Doxyfile | 7 ++++--- cpp/include/cudf/lists/combine.hpp | 2 +- cpp/include/cudf/scalar/scalar_device_view.cuh | 16 ++++++++++++++++ .../cudf/strings/convert/convert_lists.hpp | 2 +- cpp/include/cudf/table/row_operators.cuh | 3 ++- cpp/include/cudf_test/base_fixture.hpp | 3 +++ cpp/include/cudf_test/column_wrapper.hpp | 3 +++ cpp/include/cudf_test/file_utilities.hpp | 9 +++++++++ cpp/include/cudf_test/table_utilities.hpp | 2 +- 9 files changed, 40 insertions(+), 7 deletions(-) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 55e5119040e..6a556bb4b34 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -2089,7 +2089,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2097,7 +2097,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2129,7 +2129,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = __device__= \ + __host__= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index a9407ed57ca..61a81e8a745 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -26,7 +26,7 @@ namespace lists { * @file */ -/* +/** * @brief Flag to specify whether a null list element will be ignored from concatenation, or the * entire concatenation result involving null list elements will be a null element.
*/ diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 884b412d3e2..56afa150dfc 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -91,6 +91,12 @@ class fixed_width_scalar_device_view_base : public detail::scalar_device_view_ba return *data(); } + /** + * @brief Stores the value in scalar + * + * @tparam T The desired type + * @param value The value to store in scalar + */ template __device__ void set_value(T value) { @@ -159,6 +165,11 @@ class fixed_width_scalar_device_view : public detail::fixed_width_scalar_device_ return fixed_width_scalar_device_view_base::value(); } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(T value) { fixed_width_scalar_device_view_base::set_value(value); } /** @@ -218,6 +229,11 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base { { } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(rep_type value) { *_data = value; } /** diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index ec22186ea99..279bf44e7fc 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,7 +50,7 @@ namespace strings { * * @param input Lists column to format. * @param na_rep Replacment string for null elements. - * @param separator Strings to use for enclosing list components and separating elements. + * @param separators Strings to use for enclosing list components and separating elements. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index c719c564a87..70ccac2f75d 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -67,7 +67,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs) } } // namespace detail -/* +/** * @brief A specialization for floating-point `Element` type relational comparison * to derive the order of the elements with respect to `lhs`. Specialization is to * handle `nan` in the order shown below. @@ -187,6 +187,7 @@ class element_equality_comparator { * * @param lhs_element_index The index of the first element * @param rhs_element_index The index of the second element + * @return True if both lhs and rhs element are both nulls and `nulls_are_equal` is true, or equal * */ template ()>* = nullptr> T generate() @@ -211,6 +213,7 @@ class TempDirTestEnvironment : public ::testing::Environment { /** * @brief Get a temporary filepath to use for the specified filename * + * @param filename name of the file to be placed in temporary directory. 
* @return std::string The temporary filepath */ std::string get_temp_filepath(std::string filename) { return tmpdir.path() + filename; } diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index f291b04776a..cd2ac9f3ec1 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -79,6 +79,7 @@ class column_wrapper { /** * @brief Releases internal unique_ptr to wrapped column + * @return unique_ptr to wrapped column */ std::unique_ptr release() { return std::move(wrapped); } @@ -1040,11 +1041,13 @@ class dictionary_column_wrapper : public detail::column_wrapper { /** * @brief Access keys column view + * @return column_view to keys column */ column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); } /** * @brief Access indices column view + * @return column_view to indices column */ column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); } diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 90bf0cd99dc..8e242e5a4f3 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -24,6 +24,10 @@ #include +/** + * @brief RAII class for creating a temporary directory. + * + */ class temp_directory { std::string _path; @@ -49,5 +53,10 @@ class temp_directory { nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); } + /** + * @brief Returns the path of the temporary directory + * + * @return string path of the temporary directory + */ const std::string& path() const { return _path; } }; diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp index 831c9f5ac14..f2427c5b8c6 100644 --- a/cpp/include/cudf_test/table_utilities.hpp +++ b/cpp/include/cudf_test/table_utilities.hpp @@ -39,7 +39,7 @@ void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs); */ void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs); -/* +/** * @brief Verifies the equivalency of two tables. * * Treats null elements as equivalent. Columns that have nullability but no nulls, From 1ceb8ab01120ffe463600db14e6893e196cbb991 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 1 Dec 2021 10:10:10 -0500 Subject: [PATCH 55/72] Improve build time of libcudf iterator tests (#9788) While working on #9641 I noticed that building the iterator gtests takes a lot of time in CI. Here is a link to the individual build times for libcudf including the gtests: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5173/testReport/(root)/BuildTime/ (you can sort by Duration by clicking on the table column header). Here is a table of the top 20 compile time offenders as recorded on my local machine.
Note that like the CI build output, 6 of the top 20 are just building the `ITERATOR_TEST`:

| rank | time (ms) | file |
| ---:| ---:|:--- |
| 1 | 814334 | /cudf.dir/src/search/search.cu.o |
| 2 | 755375 | /cudf.dir/src/sort/sort_column.cu.o |
| 3 | 686235 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_numeric.cu.o |
| 4 | 670587 | /cudf.dir/src/groupby/sort/group_nunique.cu.o |
| 5 | 585524 | /cudf.dir/src/reductions/scan/scan_inclusive.cu.o |
| 6 | 582677 | /ITERATOR_TEST.dir/iterator/pair_iterator_test_numeric.cu.o |
| 7 | 568418 | /ITERATOR_TEST.dir/iterator/scalar_iterator_test.cu.o |
| 8 | 563196 | /cudf.dir/src/sort/sort.cu.o |
| 9 | 548816 | /ITERATOR_TEST.dir/iterator/value_iterator_test_numeric.cu.o |
| 10 | 535315 | /cudf.dir/src/groupby/sort/sort_helper.cu.o |
| 11 | 531384 | /cudf.dir/src/sort/is_sorted.cu.o |
| 12 | 530382 | /ITERATOR_TEST.dir/iterator/value_iterator_test_chrono.cu.o |
| 13 | 525187 | /cudf.dir/src/join/semi_join.cu.o |
| 14 | 523726 | /cudf.dir/src/rolling/rolling.cu.o |
| 15 | 517909 | /cudf.dir/src/reductions/product.cu.o |
| 16 | 513119 | /cudf.dir/src/stream_compaction/distinct_count.cu.o |
| 17 | 512569 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_chrono.cu.o |
| 18 | 508978 | /cudf.dir/src/reductions/sum_of_squares.cu.o |
| 19 | 508460 | /cudf.dir/src/lists/drop_list_duplicates.cu.o |
| 20 | 505247 | /cudf.dir/src/reductions/sum.cu.o |

I made some simple changes to the iterator code logic to use different thrust functions along with a temporary device vector. This approach improved the compile time of the `ITERATOR_TEST` by about 3x. Here are the results of compiling the above 6 files with the changes in this PR:

| new rank | new time (ms) | file |
| ---:| ---:|:--- |
| 59 | 232691 (2.9x) | optional_iterator_test_numeric.cu.o |
| 26 | 416951 (1.4x) | pair_iterator_test_numeric.cu.o |
| 92 | 165947 (3.4x) | scalar_iterator_test.cu.o |
| 65 | 216364 (2.5x) | value_iterator_test_numeric.cu.o |
| 77 | 186583 (2.8x) | value_iterator_test_chrono.cu.o |
| 111 | 137789 (3.7x) | optional_iterator_test_chrono.cu.o |

Total overall build time improved locally by ~3m (10%) using `ninja -j48 install` on a Dell 5820.
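Reduced to a standalone sketch, the swap looks like the following (function names and the `int` element type are illustrative; the real change is in `IteratorTest::iterator_test_thrust` in the diff below):

```cpp
#include <thrust/device_vector.h>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

// Before: a single thrust::equal call, whose fused compare-and-reduce kernel
// is what made these translation units slow to compile.
bool ranges_equal_one_call(thrust::device_vector<int> const& lhs,
                           thrust::device_vector<int> const& rhs)
{
  return thrust::equal(thrust::device, lhs.begin(), lhs.end(), rhs.begin());
}

// After: an element-wise compare into a temporary vector followed by a
// separate all_of reduction. Same result, simpler template instantiations.
bool ranges_equal_two_calls(thrust::device_vector<int> const& lhs,
                            thrust::device_vector<int> const& rhs)
{
  thrust::device_vector<bool> matches(lhs.size());
  thrust::transform(thrust::device, lhs.begin(), lhs.end(), rhs.begin(),
                    matches.begin(), thrust::equal_to<int>{});
  return thrust::all_of(thrust::device, matches.begin(), matches.end(),
                        thrust::identity<bool>{});
}
```

Here are the build time results of a CI build with these changes: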
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5190/testReport/(root)/BuildTime/ Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9788 --- cpp/tests/iterator/iterator_tests.cuh | 17 +++++++-- .../optional_iterator_test_numeric.cu | 37 +++++++++---------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 4ec347c4bc1..07eb595449c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -18,8 +18,8 @@ #include #include -#include // include iterator header -#include //for meanvar +#include +#include // for meanvar #include #include @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -83,7 +84,17 @@ struct IteratorTest : public cudf::test::BaseFixture { EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items); auto dev_expected = cudf::detail::make_device_uvector_sync(expected); - bool result = thrust::equal(thrust::device, d_in, d_in_last, dev_expected.begin()); + // using a temporary vector and calling transform and all_of separately is + // equivalent to thrust::equal but compiles ~3x faster + auto dev_results = rmm::device_uvector(num_items, rmm::cuda_stream_default); + thrust::transform(thrust::device, + d_in, + d_in_last, + dev_expected.begin(), + dev_results.begin(), + thrust::equal_to{}); + auto result = thrust::all_of( + thrust::device, dev_results.begin(), dev_results.end(), thrust::identity{}); EXPECT_TRUE(result) << "thrust test"; } diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 6d51f4a5c14..a8c135a726f 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -50,21 +50,15 @@ struct transformer_optional_meanvar { } }; -struct sum_if_not_null { - template - CUDA_HOST_DEVICE_CALLABLE thrust::optional operator()(const thrust::optional& lhs, - const thrust::optional& rhs) - { - return lhs.value_or(T{0}) + rhs.value_or(T{0}); - } +template +struct optional_to_meanvar { + CUDA_HOST_DEVICE_CALLABLE T operator()(const thrust::optional& v) { return v.value_or(T{0}); } }; // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. // Workaround: exclude this test only at device debug mode. #if !defined(__CUDACC_DEBUG__) -// This test computes `count`, `sum`, `sum_of_squares` at a single reduction call. 
-// It would be useful for `var`, `std` operation TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) { using T = TypeParam; @@ -104,22 +98,27 @@ TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) expected_value.value_squared = std::accumulate( replaced_array.begin(), replaced_array.end(), T{0}, [](T acc, T i) { return acc + i * i; }); - // std::cout << "expected = " << expected_value << std::endl; - // GPU test auto it_dev = d_col->optional_begin(cudf::contains_nulls::YES{}); auto it_dev_squared = thrust::make_transform_iterator(it_dev, transformer); - auto result = thrust::reduce(it_dev_squared, - it_dev_squared + d_col->size(), - thrust::optional{T_output{}}, - sum_if_not_null{}); + + // this can be computed with a single reduce and without a temporary output vector + // but the approach increases the compile time by ~2x + auto results = rmm::device_uvector(d_col->size(), rmm::cuda_stream_default); + thrust::transform(thrust::device, + it_dev_squared, + it_dev_squared + d_col->size(), + results.begin(), + optional_to_meanvar{}); + auto result = thrust::reduce(thrust::device, results.begin(), results.end(), T_output{}); + if (not std::is_floating_point()) { - EXPECT_EQ(expected_value, *result) << "optional iterator reduction sum"; + EXPECT_EQ(expected_value, result) << "optional iterator reduction sum"; } else { - EXPECT_NEAR(expected_value.value, result->value, 1e-3) << "optional iterator reduction sum"; - EXPECT_NEAR(expected_value.value_squared, result->value_squared, 1e-3) + EXPECT_NEAR(expected_value.value, result.value, 1e-3) << "optional iterator reduction sum"; + EXPECT_NEAR(expected_value.value_squared, result.value_squared, 1e-3) << "optional iterator reduction sum squared"; - EXPECT_EQ(expected_value.count, result->count) << "optional iterator reduction count"; + EXPECT_EQ(expected_value.count, result.count) << "optional iterator reduction count"; } } #endif From 11c3dfef2e7fe6fd67ff93bdf36a47c0a5b2eb37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 1 Dec 2021 10:28:24 -0600 Subject: [PATCH 56/72] Remove unused masked udf cython/c++ code (#9792) This PR removes the c++ side of the original masked UDF code introduced in https://github.com/rapidsai/cudf/pull/8213. These kernels had some limitations and are now superseded by the numba-generated versions we moved to in https://github.com/rapidsai/cudf/pull/9174. As far as I can tell, cuDF python was the only thing consuming this API for the short time it has existed. However I am marking this breaking just in case. 
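For context, the numba-based replacement compiles null-aware kernels directly from Python UDFs. A minimal sketch of that user-facing path (assuming cudf's `DataFrame.apply` row-UDF support from pull 9174; this is illustrative, not the removed C++ API):

```python
import cudf

df = cudf.DataFrame({"a": [1, None, 3], "b": [10, 20, 30]})

# Numba compiles the row UDF into a masked kernel: the null in "a" propagates
# to the output with no hand-written mask handling.
out = df.apply(lambda row: row["a"] + row["b"], axis=1)
print(out)  # 11, <NA>, 33
```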
Authors: - https://github.com/brandon-b-miller Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9792 --- .../Modules/JitifyPreprocessKernels.cmake | 4 +- cpp/include/cudf/transform.hpp | 6 -- cpp/src/transform/jit/masked_udf_kernel.cu | 85 --------------- cpp/src/transform/transform.cpp | 102 ------------------ python/cudf/cudf/_lib/cpp/transform.pxd | 6 -- python/cudf/cudf/_lib/transform.pyx | 24 ----- 6 files changed, 2 insertions(+), 225 deletions(-) delete mode 100644 cpp/src/transform/jit/masked_udf_kernel.cu diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index c2ad25760b8..6ab1293ab6f 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -51,8 +51,8 @@ function(jit_preprocess_files) endfunction() jit_preprocess_files( - SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu - transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu + SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu + rolling/jit/kernel.cu ) add_custom_target( diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 55e7bc84dbe..45e8ff1310c 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -54,12 +54,6 @@ std::unique_ptr transform( bool is_ptx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr generalized_masked_op( - table_view const& data_view, - std::string const& binary_udf, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Creates a null_mask from `input` by converting `NaN` to null and * preserving existing null values and also returns new null_count. diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu deleted file mode 100644 index 319ad730c53..00000000000 --- a/cpp/src/transform/jit/masked_udf_kernel.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace cudf { -namespace transformation { -namespace jit { - -template -struct Masked { - T value; - bool valid; -}; - -template -__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset) -{ - bool valid = in_mask ? 
cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::make_tuple(in_ptr[id], valid); -} - -template -__device__ auto make_args(cudf::size_type id, - InType in_ptr, - MaskType in_mask, // in practice, always cudf::bitmask_type const* - OffsetType in_offset, // in practice, always cudf::size_type - Arguments... args) -{ - bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...)); -} - -template -__global__ void generic_udf_kernel(cudf::size_type size, - TypeOut* out_data, - bool* out_mask, - Arguments... args) -{ - int const tid = threadIdx.x; - int const blkid = blockIdx.x; - int const blksz = blockDim.x; - int const gridsz = gridDim.x; - int const start = tid + blkid * blksz; - int const step = blksz * gridsz; - - Masked output; - for (cudf::size_type i = start; i < size; i += step) { - auto func_args = cuda::std::tuple_cat( - cuda::std::make_tuple(&output.value), - make_args(i, args...) // passed int64*, bool*, int64, int64*, bool*, int64 - ); - cuda::std::apply(GENERIC_OP, func_args); - out_data[i] = output.value; - out_mask[i] = output.valid; - } -} - -} // namespace jit -} // namespace transformation -} // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 5230b853a79..0cca6699586 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -19,12 +19,10 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -65,80 +63,6 @@ void unary_operation(mutable_column_view output, cudf::jit::get_data_ptr(input)); } -std::vector make_template_types(column_view outcol_view, table_view const& data_view) -{ - std::string mskptr_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())) + "*"; - std::string offset_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())); - - std::vector template_types; - template_types.reserve((3 * data_view.num_columns()) + 1); - - template_types.push_back(cudf::jit::get_type_name(outcol_view.type())); - for (auto const& col : data_view) { - template_types.push_back(cudf::jit::get_type_name(col.type()) + "*"); - template_types.push_back(mskptr_type); - template_types.push_back(offset_type); - } - return template_types; -} - -void generalized_operation(table_view const& data_view, - std::string const& udf, - data_type output_type, - mutable_column_view outcol_view, - mutable_column_view outmsk_view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const template_types = make_template_types(outcol_view, data_view); - - std::string generic_kernel_name = - jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel") - .instantiate(template_types); - - std::string generic_cuda_source = cudf::jit::parse_single_function_ptx( - udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0}); - - std::vector kernel_args; - kernel_args.reserve((data_view.num_columns() * 3) + 3); - - cudf::size_type size = outcol_view.size(); - const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view); - const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view); - kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr}); - - std::vector data_ptrs; - std::vector mask_ptrs; - std::vector offsets; - - data_ptrs.reserve(data_view.num_columns()); - mask_ptrs.reserve(data_view.num_columns()); - offsets.reserve(data_view.num_columns()); - - auto const iters = 
thrust::make_zip_iterator( - thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin())); - - std::for_each(iters, iters + data_view.num_columns(), [&](auto const& tuple_vals) { - kernel_args.push_back(&thrust::get<0>(tuple_vals)); - kernel_args.push_back(&thrust::get<1>(tuple_vals)); - kernel_args.push_back(&thrust::get<2>(tuple_vals)); - }); - - std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) { - return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset()); - }); - - cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit) - .get_kernel(generic_kernel_name, - {}, - {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, - {"-arch=sm_."}) - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) - ->launch(kernel_args.data()); -} - } // namespace jit } // namespace transformation @@ -165,24 +89,6 @@ std::unique_ptr transform(column_view const& input, return output; } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - std::unique_ptr output = make_fixed_width_column(output_type, data_view.num_rows()); - std::unique_ptr output_mask = - make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows()); - - transformation::jit::generalized_operation( - data_view, udf, output_type, *output, *output_mask, stream, mr); - - auto final_output_mask = cudf::bools_to_mask(*output_mask); - output.get()->set_null_mask(std::move(*(final_output_mask.first))); - return output; -} - } // namespace detail std::unique_ptr transform(column_view const& input, @@ -195,12 +101,4 @@ std::unique_ptr transform(column_view const& input, return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr); } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - return detail::generalized_masked_op(data_view, udf, output_type, rmm::cuda_stream_default, mr); -} - } // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 3153427ce3c..590a371ff52 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -34,12 +34,6 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: bool is_ptx ) except + - cdef unique_ptr[column] generalized_masked_op( - const table_view& data_view, - string udf, - data_type output_type, - ) except + - cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input ) except + diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index a0eb7c68183..96d25cb92c9 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -123,30 +123,6 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def masked_udf(incols, op, output_type): - cdef table_view data_view = table_view_from_table( - incols, ignore_index=True) - cdef string c_str = op.encode("UTF-8") - cdef type_id c_tid - cdef data_type c_dtype - - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - output_type - ] - ) - c_dtype = data_type(c_tid) - - with nogil: - c_output = move(libcudf_transform.generalized_masked_op( - data_view, - c_str, - c_dtype, - )) - - return Column.from_unique_ptr(move(c_output)) - - def table_encode(input): cdef table_view 
c_input = table_view_from_table( input, ignore_index=True) From 1904d1a9ff54343471998523816c9e0a00f46797 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 1 Dec 2021 13:00:16 -0600 Subject: [PATCH 57/72] Fix overflow for min calculation in strings::from_timestamps (#9793) This fixes #9790 When converting a timestamp to a String it is possible for the %M min calculation to overflow an int32_t part way through casting. This moves that result to be an int64_t which avoids the overflow issues. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9793 --- cpp/src/strings/convert/convert_datetime.cu | 4 ++-- cpp/tests/strings/datetime_tests.cpp | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 51a6a796ba3..8d0c5704a7b 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -707,9 +707,9 @@ struct from_timestamp_base { * scale( 61,60) -> 1 * @endcode */ - __device__ int32_t scale_time(int64_t time, int64_t base) const + __device__ int64_t scale_time(int64_t time, int64_t base) const { - return static_cast((time - ((time < 0) * (base - 1L))) / base); + return (time - ((time < 0) * (base - 1L))) / base; }; __device__ time_components get_time_components(int64_t tstamp) const diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index 4543607614f..9a01d5dd041 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -311,13 +311,14 @@ TEST_F(StringsDatetimeTest, FromTimestampAmPm) TEST_F(StringsDatetimeTest, FromTimestampMillisecond) { cudf::test::fixed_width_column_wrapper timestamps_ms{ - 1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047}; + 1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047, 128849018880000}; auto results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%3f"); cudf::test::strings_column_wrapper expected_ms{"2018-07-04 12:00:00.123", "2020-02-29 00:01:01.007", "2015-12-29 23:02:02.421", "2011-10-11 03:03:03.999", - "1776-07-04 11:59:59.953"}; + "1776-07-04 11:59:59.953", + "6053-01-23 02:08:00.000"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms); results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%f"); @@ -325,7 +326,8 @@ TEST_F(StringsDatetimeTest, FromTimestampMillisecond) "2020-02-29 00:01:01.007000", "2015-12-29 23:02:02.421000", "2011-10-11 03:03:03.999000", - "1776-07-04 11:59:59.953000"}; + "1776-07-04 11:59:59.953000", + "6053-01-23 02:08:00.000000"}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms_6f); cudf::test::fixed_width_column_wrapper timestamps_ns{ From 836f800e61acafa0fa6b3c7d9826904f0ba2ad06 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:14 -0500 Subject: [PATCH 58/72] Use CTAD with Thrust function objects (#9768) While reviewing another PR, I noticed unnecessary usage of explicit template parameters with Thrust function objects and decided to open a small PR to clean this up (CTAD showed up in C++17). 
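A minimal sketch of the cleanup, using the standard library's `std::plus` as a stand-in for `thrust::plus` (both are class templates with a defaulted argument, so deduction behaves the same way):

```cpp
#include <functional>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3};
  // Pre-C++17 style: the template argument is spelled out explicitly.
  auto a = std::accumulate(v.begin(), v.end(), 0, std::plus<int>{});
  // With CTAD, an empty braced initializer deduces the defaulted
  // (transparent) specialization, so std::plus{} means std::plus<void> here.
  auto b = std::accumulate(v.begin(), v.end(), 0, std::plus{});
  return (a == 6 && b == 6) ? 0 : 1;  // both sums are 6
}
```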
CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9768 --- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf_test/column_wrapper.hpp | 7 ++----- cpp/src/copying/concatenate.cu | 2 +- cpp/src/groupby/sort/group_merge_m2.cu | 4 ++-- cpp/src/groupby/sort/group_rank_scan.cu | 2 +- cpp/src/groupby/sort/group_scan_util.cuh | 6 +++--- .../sort/group_single_pass_reduction_util.cuh | 16 ++++++++-------- cpp/src/groupby/sort/group_tdigest.cu | 10 +++++----- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_utils.cu | 2 +- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/contains.cu | 7 ++----- cpp/src/lists/interleave_columns.cu | 8 ++++---- cpp/src/quantiles/tdigest/tdigest.cu | 7 ++----- cpp/src/reductions/scan/scan_inclusive.cu | 9 ++++----- cpp/src/rolling/grouped_rolling.cu | 6 +++--- cpp/src/rolling/rolling_collect_list.cu | 2 +- cpp/src/sort/rank.cu | 10 +++++----- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/findall.cu | 7 ++----- cpp/src/strings/repeat_strings.cu | 2 +- cpp/src/strings/split/split.cu | 14 ++++---------- cpp/tests/iterator/iterator_tests.cuh | 11 +++-------- .../apply_boolean_mask_tests.cpp | 4 ++-- cpp/tests/strings/fixed_point_tests.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 6 ++---- 26 files changed, 63 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ec4a88a0e46..eb7258830ce 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -315,7 +315,7 @@ std::unique_ptr gather( d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, size_t{0}, - thrust::plus{}); + thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), "total size of output strings is too large for a cudf column"); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cd2ac9f3ec1..ccfdde2270c 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1502,11 +1502,8 @@ class lists_column_wrapper : public detail::column_wrapper { // concatenate them together, skipping children that are null. std::vector children; - thrust::copy_if(std::cbegin(cols), - std::cend(cols), - valids, // stencil - std::back_inserter(children), - thrust::identity{}); + thrust::copy_if( + std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{}); auto data = children.empty() ? 
cudf::empty_like(expected_hierarchy) : concatenate(children); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f4b6a8bf5fd..34c0cea683e 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi device_views.cend(), std::next(offsets.begin()), [](auto const& col) { return col.size(); }, - thrust::plus{}); + thrust::plus{}); auto d_offsets = make_device_uvector_async(offsets, stream); auto const output_size = offsets.back(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 4e2a5b68abc..bde7c985df1 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -173,8 +173,8 @@ std::unique_ptr group_merge_m2(column_view const& values, // Generate bitmask for the output. // Only mean and M2 values can be nullable. Count column must be non-nullable. - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { result_means->set_null_mask(null_mask, null_count); // copy null_mask result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 935ef9554a9..f36bdc0a660 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -79,7 +79,7 @@ std::unique_ptr rank_generator(column_view const& order_by, group_labels.end(), mutable_ranks.begin(), mutable_ranks.begin(), - thrust::equal_to{}, + thrust::equal_to{}, scan_op); return ranks; diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index ae3e3232e06..e25fdd6fc27 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -115,7 +115,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -160,7 +160,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; @@ -214,7 +214,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index decb127b264..95a36f40e57 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -191,7 +191,7 @@ struct group_reduction_functor{}, + thrust::equal_to{}, binop); }; @@ -215,10 +215,10 @@ struct group_reduction_functor validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } return result; @@ -264,7 +264,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -283,10 +283,10 @@ struct group_reduction_functor< auto validity = 
rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { auto const binop = diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 146a6a8c31c..551eb128231 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -625,7 +625,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + thrust::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -850,8 +850,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + thrust::equal_to{}, // key equality check + thrust::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -864,8 +864,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + thrust::equal_to{}, // key equality check + thrust::maximum{}); // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. 
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e4bd1938ecc..c5b680f129e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -266,7 +266,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, left_join_complement_size = thrust::count_if(rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), - thrust::identity()); + thrust::identity()); } return join_size + left_join_complement_size; } diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 4aca4b4a9cf..9e98f87e7f0 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -136,7 +136,7 @@ get_left_join_indices_complement(std::unique_ptr> thrust::make_counting_iterator(end_counter), invalid_index_map->begin(), right_indices_complement->begin(), - thrust::identity()) - + thrust::identity{}) - right_indices_complement->begin(); right_indices_complement->resize(indices_count, stream); } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4bef312b396..2ddede97ce4 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -225,7 +225,7 @@ std::unique_ptr concatenate_lists_nullifying_rows(column_view const& inp auto list_entries = gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_rows, std::move(list_offsets), diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index bdbc9ae013c..b48982d205a 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -74,11 +74,8 @@ struct lookup_functor { if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) { return {rmm::device_buffer{0, stream, mr}, size_type{0}}; } else { - return cudf::detail::valid_if(result_validity.begin(), - result_validity.end(), - thrust::identity{}, - stream, - mr); + return cudf::detail::valid_if( + result_validity.begin(), result_validity.end(), thrust::identity{}, stream, mr); } } diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index b9b73d98ed2..220cb25a942 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -228,8 +228,8 @@ struct interleave_list_entries_impl{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); return make_strings_column(num_output_entries, std::move(offsets_column), @@ -306,7 +306,7 @@ struct interleave_list_entries_impl( if (data_has_null_mask) { auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { output->set_null_mask(null_mask, null_count); } } @@ -405,7 +405,7 @@ std::unique_ptr interleave_columns(table_view const& input, } auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_output_lists, std::move(list_offsets), 
std::move(list_entries), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 57c221b15ed..18e7d02d086 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -348,11 +348,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, if (null_count == 0) { return std::pair{rmm::device_buffer{}, null_count}; } - return cudf::detail::valid_if(tdigest_is_empty, - tdigest_is_empty + tdv.size(), - thrust::logical_not{}, - stream, - mr); + return cudf::detail::valid_if( + tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); return cudf::make_lists_column( diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 70f5ca90539..b0e761c4c3b 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -50,11 +50,10 @@ rmm::device_buffer mask_scan(column_view const& input_view, auto valid_itr = detail::make_validity_iterator(*d_input); auto first_null_position = [&] { - size_type const first_null = thrust::find_if_not(rmm::exec_policy(stream), - valid_itr, - valid_itr + input_view.size(), - thrust::identity{}) - - valid_itr; + size_type const first_null = + thrust::find_if_not( + rmm::exec_policy(stream), valid_itr, valid_itr + input_view.size(), thrust::identity{}) - + valid_itr; size_type const exclusive_offset = (inclusive == scan_type::EXCLUSIVE) ? 1 : 0; return std::min(input_view.size(), first_null + exclusive_offset); }(); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 509f67bb5c6..5a7f15148d8 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -142,8 +142,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, preceding_window] __device__(size_type idx) { auto group_label = d_group_labels[idx]; auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. + return thrust::minimum{}(preceding_window, + idx - group_start + 1); // Preceding includes current row. }; auto following_calculator = [d_group_offsets = group_offsets.data(), @@ -152,7 +152,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto group_label = d_group_labels[idx]; auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets // is capped with `input.size()`. 
- return thrust::minimum{}(following_window, (group_end - 1) - idx); + return thrust::minimum{}(following_window, (group_end - 1) - idx); }; if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu index ecef90dc8e1..30c39bde7d2 100644 --- a/cpp/src/rolling/rolling_collect_list.cu +++ b/cpp/src/rolling/rolling_collect_list.cu @@ -75,7 +75,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + thrust::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index c8a908e44cd..e9589e6c4b3 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -117,7 +117,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + thrust::equal_to{}, tie_breaker); auto sorted_tied_rank = thrust::make_transform_iterator( dense_rank_sorted.begin(), @@ -171,8 +171,8 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, - thrust::identity{}, + thrust::minimum{}, + thrust::identity{}, stream); } @@ -189,8 +189,8 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, - thrust::identity{}, + thrust::maximum{}, + thrust::identity{}, stream); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index db8b37a9592..3822fa8bf5a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -96,7 +96,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s device_views_ptr + views.size(), std::next(d_partition_offsets.begin()), chars_size_transform{}, - thrust::plus{}); + thrust::plus{}); auto const output_chars_size = d_partition_offsets.back_element(stream); stream.synchronize(); // ensure copy of output_chars_size is complete before returning diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 3ab5b55020c..8d96f0de415 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -153,11 +153,8 @@ std::unique_ptr
findall_re( std::vector> results; - size_type const columns = thrust::reduce(rmm::exec_policy(stream), - find_counts.begin(), - find_counts.end(), - 0, - thrust::maximum{}); + size_type const columns = thrust::reduce( + rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 458f3ed885c..7820e0064a6 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -369,7 +369,7 @@ std::pair, int64_t> repeat_strings_output_sizes( thrust::make_counting_iterator(strings_count), fn, int64_t{0}, - thrust::plus{}); + thrust::plus{}); return std::make_pair(std::move(output_sizes), total_bytes); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 5113b418501..c6e52a79059 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -490,11 +490,8 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + auto const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -748,11 +745,8 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string - size_type const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + size_type const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 07eb595449c..d93c1275122 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -51,13 +51,8 @@ struct IteratorTest : public cudf::test::BaseFixture { // Get temporary storage size size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - d_in, - dev_result.begin(), - num_items, - thrust::minimum{}, - init); + cub::DeviceReduce::Reduce( + nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init); // Allocate temporary storage rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default); @@ -68,7 +63,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + thrust::minimum{}, init); evaluate(expected, dev_result, "cub test"); diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 813cceb0861..c80a8fba55c 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -204,13 +204,13 @@ TEST_F(ApplyBooleanMask, FixedPointLargeColumnTest) dec32_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec32_data), - thrust::identity()); + thrust::identity{}); thrust::copy_if(thrust::seq, dec64_data.cbegin(), dec64_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec64_data), - thrust::identity()); + thrust::identity{}); decimal32_wrapper expect_col32( expect_dec32_data.begin(), expect_dec32_data.end(), numeric::scale_type{-3}); diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index ce4280e0733..5872a9e5bb7 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -329,4 +329,4 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}}; EXPECT_EQ(static_cast(c), "1.70141183460469231731687303715884105727"); -} \ No newline at end of file +} diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 7fb7326f221..43d63c9fd22 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -239,10 +239,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); auto ints_view = ints->mutable_view(); - thrust::tabulate(thrust::device, - ints_view.begin(), - ints_view.end(), - thrust::identity()); + thrust::tabulate( + thrust::device, ints_view.begin(), ints_view.end(), thrust::identity{}); // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; auto list_offsets = 
make_numeric_column(data_type{type_id::INT32}, num_rows + 1); From 677e63236a81ea3c402df993845a1fdc98072c9e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:25 -0500 Subject: [PATCH 59/72] Avoid overflow for `fixed_point` `cudf::cast` and performance optimization (#9772) This resolves https://github.com/rapidsai/cudf/issues/9000. When using `cudf::cast` for a wider decimal type to a narrower decimal type, you can overflow. This PR modifies the code path for this specific use case so that the "rescale" happens for the type cast. A small perf improvement was added when you have identical scales to avoid rescaling. CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9772 --- cpp/src/unary/cast_ops.cu | 49 +++++++++++++++++++++------------- cpp/tests/unary/cast_tests.cpp | 13 +++++++++ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e852b00796a..131fde11cf8 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -305,28 +305,39 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr) { using namespace numeric; - - auto const size = input.size(); - auto temporary = - std::make_unique(cudf::data_type{type.id(), input.type().scale()}, - size, - rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream), - input.null_count()); - using SourceDeviceT = device_storage_type_t; using TargetDeviceT = device_storage_type_t; - mutable_column_view output_mutable = *temporary; - - thrust::transform(rmm::exec_policy(stream), - input.begin(), - input.end(), - output_mutable.begin(), - device_cast{}); - - // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); + auto casted = [&]() { + auto const size = input.size(); + auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, + size, + rmm::device_buffer{size * cudf::size_of(type), stream}, + copy_bitmask(input, stream), + input.null_count()); + + mutable_column_view output_mutable = *output; + + thrust::transform(rmm::exec_policy(stream), + input.begin(), + input.end(), + output_mutable.begin(), + device_cast{}); + + return output; + }; + + if (input.type().scale() == type.scale()) return casted(); + + if constexpr (sizeof(SourceDeviceT) < sizeof(TargetDeviceT)) { + // device_cast BEFORE rescale when SourceDeviceT is < TargetDeviceT + auto temporary = casted(); + return detail::rescale(*temporary, scale_type{type.scale()}, stream, mr); + } else { + // device_cast AFTER rescale when SourceDeviceT is > TargetDeviceT to avoid overflow + auto temporary = detail::rescale(input, scale_type{type.scale()}, stream, mr); + return detail::cast(*temporary, type, stream, mr); + } } template view()); } + +TEST_F(FixedPointTestSingleType, Int32ToInt64Convert) +{ + using namespace numeric; + using fp_wrapperA = cudf::test::fixed_point_column_wrapper; + using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperB{{141230900000L}, scale_type{-10}}; + auto const expected = fp_wrapperA{{14123}, 
scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} From 7d8a8e53f495279ae129fa46948c07230d6e77b4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 1 Dec 2021 13:53:05 -0800 Subject: [PATCH 60/72] Allow cast decimal128 to string and add tests (#9756) Small PR that enables Decimal128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9756 --- java/src/main/native/src/ColumnViewJni.cpp | 3 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4efac307627..02d5dc4569c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -916,7 +916,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::INT64: case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; case cudf::type_id::DECIMAL32: - case cudf::type_id::DECIMAL64: result = cudf::strings::from_fixed_point(*column); break; + case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fa9052029cc..31a52eb2ec0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3372,6 +3372,22 @@ void testFixedWidthCast() { } } + @Test + void testCastBigDecimalToString() { + BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002") + }; + + try (ColumnVector cv = ColumnVector.fromDecimals(bigValues); + ColumnVector values = cv.castTo(DType.STRING); + ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002")) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastStringToBigDecimal() { String[] bigValues = {"923121331938210123.321", From 5491cc789bbfbaad7099124dcfe004719e7f013c Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 2 Dec 2021 03:30:50 +0530 Subject: [PATCH 61/72] Fix memory error due to lambda return type deduction limitation (#9778) Fixes #9703 replace device lambda with device functor with return type. (due to [14. 
extended-lambda-restrictions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions)) ~add `__host__` to lambda for nvcc return type deduction to work properly.~ ~replaced `auto` (generic lambda) with `size_type`.~ Fixes the shared-memory write error reported in #9703. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9778 --- cpp/src/sort/rank.cu | 13 +++++++++---- cpp/tests/sort/rank_test.cpp | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index e9589e6c4b3..de0a44e3234 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -194,6 +194,12 @@ void rank_max(cudf::device_span group_keys, stream); } +// Returns index, count +template +struct index_counter { + __device__ T operator()(size_type i) { return T{i, 1}; } +}; + void rank_average(cudf::device_span group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, @@ -208,10 +214,9 @@ void rank_average(cudf::device_span group_keys, using MinCount = thrust::pair; tie_break_ranks_transform( group_keys, - cudf::detail::make_counting_transform_iterator(1, - [] __device__(auto i) { - return MinCount{i, 1}; - }), + // Use device functor with return type. Cannot use device lambda due to limitation. + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions + cudf::detail::make_counting_transform_iterator(1, index_counter{}), sorted_order_view, rank_mutable_view.begin(), [] __device__(auto rank_count1, auto rank_count2) { diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index 94e389fc7ce..926ad1e203e 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -410,5 +410,19 @@ TYPED_TEST(Rank, min_desc_bottom_pct) this->run_all_tests(rank_method::MIN, desc_bottom, col1_rank, col2_rank, col3_rank, true); } +struct RankLarge : public BaseFixture { +}; + +TEST_F(RankLarge, average_large) +{ + // testcase of https://github.com/rapidsai/cudf/issues/9703 + auto iter = thrust::counting_iterator(0); + fixed_width_column_wrapper col1(iter, iter + 10558); + auto result = + cudf::rank(col1, rank_method::AVERAGE, {}, null_policy::EXCLUDE, null_order::AFTER, false); + fixed_width_column_wrapper expected(iter + 1, iter + 10559); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + } // namespace test } // namespace cudf From c10966cc3847ca9837ddc7ce5df9c4d9b7c743d8 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 2 Dec 2021 18:48:03 +0800 Subject: [PATCH 62/72] Fix make_empty_scalar_like on list_type (#9759) Fixes #9758. In `make_empty_scalar_like`, we create the list scalar from the list column itself, which is wrong. The correct way is to use the child of the list column.
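A rough sketch of the intended behavior, assuming a cudf development environment (the `empty_list_scalar` helper is made up for illustration; it mirrors the fixed code path in the diff below):

```cpp
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <memory>

// For a LIST<T> column, an empty (invalid) list scalar must wrap an empty
// column of the element type T, not an empty LIST<T> column.
std::unique_ptr<cudf::scalar> empty_list_scalar(cudf::column_view const& list_col)
{
  auto const child    = cudf::lists_column_view(list_col).child();  // type T
  auto empty_elements = cudf::empty_like(child);                    // empty column of T
  auto result         = cudf::make_list_scalar(empty_elements->view());
  result->set_valid_async(false);
  return result;
}
```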
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9759 --- cpp/src/scalar/scalar_factories.cpp | 7 +++++-- cpp/tests/reductions/reduction_tests.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index d2876435780..c18b57d220f 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -21,6 +21,7 @@ #include #include +#include #include namespace cudf { @@ -184,10 +185,12 @@ std::unique_ptr make_empty_scalar_like(column_view const& column, { std::unique_ptr result; switch (column.type().id()) { - case type_id::LIST: - result = make_list_scalar(empty_like(column)->view(), stream, mr); + case type_id::LIST: { + auto const empty_child = empty_like(lists_column_view(column).child()); + result = make_list_scalar(empty_child->view(), stream, mr); result->set_valid_async(false, stream); break; + } case type_id::STRUCT: // The input column must have at least 1 row to extract a scalar (row) from it. result = detail::get_element(column, 0, stream, mr); diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index d8ee8f9d08d..e138cd6f68e 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1961,7 +1961,11 @@ struct ListReductionTest : public cudf::test::BaseFixture { cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST)); auto list_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, list_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); } + if (is_valid) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); + } else { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_value, list_result->view()); + } }; if (succeeded_condition) { @@ -2047,7 +2051,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) // test against empty input this->reduction_test(LCW{}, - ElementCol{{0}, {0}}, // expected_value, + ElementCol{}, // expected_value, true, false, cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); From 582cc6e466c7d941e1b34893fd56fbd42fe90d68 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 2 Dec 2021 21:12:01 +0800 Subject: [PATCH 63/72] Add sample JNI API (#9728) Add sample JNI Signed-off-by: Chong Gao Authors: - Chong Gao (https://github.com/res-life) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9728 --- java/src/main/java/ai/rapids/cudf/Table.java | 30 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 15 ++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 21 +++++++++++++ 3 files changed, 66 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b0791fb440f..b11808ed023 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -678,6 +678,8 @@ private static native ContiguousTable[] contiguousSplitGroups(long inputTable, boolean[] keysDescending, boolean[] keysNullSmallest); + private static native long[] sample(long tableHandle, long n, boolean replacement, long seed); + ///////////////////////////////////////////////////////////////////////////// // TABLE 
CREATION APIs ///////////////////////////////////////////////////////////////////////////// @@ -2801,6 +2803,34 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data return result; } + + /** + * Gather `n` samples from table randomly + * Note: does not preserve the ordering + * Example: + * input: {col1: {1, 2, 3, 4, 5}, col2: {6, 7, 8, 9, 10}} + * n: 3 + * replacement: false + * + * output: {col1: {3, 1, 4}, col2: {8, 6, 9}} + * + * replacement: true + * + * output: {col1: {3, 1, 1}, col2: {8, 6, 6}} + * + * throws "logic_error" if `n` > table rows and `replacement` == FALSE. + * throws "logic_error" if `n` < 0. + * + * @param n non-negative number of samples expected from table + * @param replacement Allow or disallow sampling of the same row more than once. + * @param seed Seed value to initiate random number generator. + * + * @return Table containing samples + */ + public Table sample(long n, boolean replacement, long seed) { + return new Table(sample(nativeHandle, n, replacement, seed)); + } + ///////////////////////////////////////////////////////////////////////////// // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a78d40a58f7..f3377bb002d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -3147,4 +3148,18 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input, + jlong n, jboolean replacement, + jlong seed) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input = reinterpret_cast(j_input); + auto sample_with_replacement = + replacement ? 
cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; + std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fa221e19387..0b2f56895e9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7584,4 +7584,25 @@ void testExplodeOuterPosition() { } } } + + @Test + void testSample() { + try (Table t = new Table.TestBuilder().column("s1", "s2", "s3", "s4", "s5").build()) { + try (Table ret = t.sample(3, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(5, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5", "s2", "s1").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(8, true, 0); + Table expected = new Table.TestBuilder() + .column("s1", "s1", "s4", "s5", "s5", "s1", "s3", "s2").build()) { + assertTablesAreEqual(expected, ret); + } + } + } } From 1077daeaad8ff710de6f4fbb99f2e7371b4af8de Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:51:04 -0600 Subject: [PATCH 64/72] Fix caching in `Series.applymap` (#9821) The cache key we were generating for these functions didn't take into account the constants that could be different in the bytecode. Hence certain functions were causing cache hits when they actually differ by a constant value somewhere in the logic. Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9821 --- python/cudf/cudf/tests/test_udf_masked_ops.py | 19 +++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index dc126546f15..c9c2c440632 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -593,3 +593,22 @@ def func(row, c, k): return y run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + expect = data ** 2 + got = data.applymap(lambda x: x ** 2) + + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data ** 3 + got = data.applymap(lambda x: x ** 3) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 5fa091a0081..f0533dcaa72 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -216,12 +216,14 @@ def make_cache_key(udf, sig): recompiling the same function for the same set of types """ codebytes = udf.__code__.co_code + constants = udf.__code__.co_consts if udf.__closure__ is not None: cvars = tuple([x.cell_contents for x in udf.__closure__]) cvarbytes = 
dumps(cvars) else: cvarbytes = b"" - return codebytes, cvarbytes, sig + + return constants, codebytes, cvarbytes, sig def compile_udf(udf, type_signature): From 50acf076d4a35bc57dc00a416f0d9507b1992c0f Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 2 Dec 2021 14:07:31 -0800 Subject: [PATCH 65/72] Fix stream usage in `segmented_gather()` (#9679) `detail::segmented_gather()` inadvertently uses `cuda_default_stream` in some parts of its implementation, while using the user-specified stream in others. This applies to the calls to `copy_range_in_place()`, `allocate_like()`, and `make_lists_column()`. ~This might produce race conditions, which might explain NVIDIA/spark-rapids/issues/4060. It's a rare failure that's quite hard to reproduce.~ This might lead to over-synchronization, though bad output is unlikely. The commit here should sort this out, by switching to the `detail` APIs corresponding to the calls above. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9679 --- cpp/src/lists/copying/segmented_gather.cu | 21 ++++++++++++--------- cpp/src/lists/extract.cu | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 8cbcddc1c58..41187b96cdb 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include -#include #include #include #include @@ -88,14 +88,15 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child = std::move(child_table->release().front()); // Create list offsets from gather_map. - auto output_offset = cudf::allocate_like( - gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, mr); + auto output_offset = cudf::detail::allocate_like( + gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, stream, mr); auto output_offset_view = output_offset->mutable_view(); - cudf::copy_range_in_place(gather_map.offsets(), - output_offset_view, - gather_map.offset(), - gather_map.offset() + output_offset_view.size(), - 0); + cudf::detail::copy_range_in_place(gather_map.offsets(), + output_offset_view, + gather_map.offset(), + gather_map.offset() + output_offset_view.size(), + 0, + stream); // Assemble list column & return auto null_mask = cudf::detail::copy_bitmask(value_column.parent(), stream, mr); size_type null_count = value_column.null_count(); @@ -103,7 +104,9 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::move(output_offset), std::move(child), null_count, - std::move(null_mask)); + std::move(null_mask), + stream, + mr); } } // namespace detail diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 381864e1a68..7c6c612eb25 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -53,7 +53,7 @@ std::unique_ptr make_index_child(column_view const& indices, // `segmented_gather()` on a null index should produce a null row. 
if (not indices.nullable()) { return std::make_unique(indices, stream); } - auto const d_indices = column_device_view::create(indices); + auto const d_indices = column_device_view::create(indices, stream); // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them. auto const null_replaced_iter_begin = cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits::max()); From b848dd5c9cfef7e3523810d67296e037f31945c1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 2 Dec 2021 14:40:57 -0800 Subject: [PATCH 66/72] Fix ORC writer crash with empty input columns (#9808) Fixes https://github.com/rapidsai/cudf/issues/9783 Skip some parts of writing when the input table was zero rows. Add is_empty to `hostdevice_2dvector`. Add Python test with empty columns. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9808 --- cpp/src/io/orc/writer_impl.cu | 338 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 + python/cudf/cudf/tests/test_orc.py | 15 + 3 files changed, 188 insertions(+), 166 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e53fb3589bc..db02125ce77 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -579,12 +579,15 @@ orc_streams writer::impl::create_streams(host_span columns, } auto const direct_data_size = - std::accumulate(segmentation.stripes.front().cbegin(), - segmentation.stripes.back().cend(), - size_t{0}, - [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); + segmentation.num_stripes() == 0 + ? 
0 + : std::accumulate(segmentation.stripes.front().cbegin(), + segmentation.stripes.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { @@ -988,17 +991,19 @@ encoded_data encode_columns(orc_table_view const& orc_table, } chunk_streams.host_to_device(stream); - if (orc_table.num_string_columns() != 0) { - auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); - gpu::EncodeStripeDictionaries(d_stripe_dict, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); - } + if (orc_table.num_rows() > 0) { + if (orc_table.num_string_columns() != 0) { + auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); + gpu::EncodeStripeDictionaries(d_stripe_dict, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); + } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + } dictionaries.data.clear(); dictionaries.index.clear(); stream.synchronize(); @@ -1803,7 +1808,7 @@ void writer::impl::write(table_view const& table) auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream); hostdevice_2dvector dict( rowgroup_bounds.size().first, orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not dict.is_empty()) { init_dictionaries(orc_table, rowgroup_bounds, dictionaries.d_data_view, @@ -1819,7 +1824,7 @@ void writer::impl::write(table_view const& table) // Build stripe-level dictionaries hostdevice_2dvector stripe_dict( segmentation.num_stripes(), orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not stripe_dict.is_empty()) { build_dictionaries(orc_table, segmentation.stripes, dict, @@ -1842,165 +1847,166 @@ void writer::impl::write(table_view const& table) segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - // Gather column statistics - std::vector column_stats; - if (enable_statistics_ && table.num_columns() > 0 && num_rows > 0) { - column_stats = gather_statistic_blobs(orc_table, segmentation); - } + if (num_rows > 0) { + // Gather column statistics + auto const column_stats = enable_statistics_ && table.num_columns() > 0 + ? 
gather_statistic_blobs(orc_table, segmentation) + : std::vector{}; - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - size_t max_compressed_block_size = 0; - if (compression_kind_ != NONE) { - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); - } - auto stream_output = [&]() { - size_t max_stream_size = 0; - bool all_device_write = true; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + } + max_stream_size = std::max(max_stream_size, stream_size); + } - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * BLOCK_HEADER_SIZE; - num_compressed_blocks += num_blocks; - compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } - max_stream_size = std::max(max_stream_size, stream_size); - } + }(); - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); + if (compression_kind_ != NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind_, + compression_blocksize_, + max_compressed_block_size, + strm_descs, + enc_data.streams, + comp_in, + comp_out, + stream); + strm_descs.device_to_host(stream); + comp_out.device_to_host(stream, true); } - }(); - - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, 
stream); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - strm_descs, - enc_data.streams, - comp_in, - comp_out, - stream); - strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); - } - ProtobufWriter pbw_(&buffer_); - - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const& rowgroups_range = segmentation.stripes[stripe_id]; - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - rowgroups_range, - enc_data.streams, - strm_descs, - comp_out, - &stripe, - &streams, - &pbw_); - } + ProtobufWriter pbw_(&buffer_); + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto const& rowgroups_range = segmentation.stripes[stripe_id]; + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + rowgroups_range, + enc_data.streams, + strm_descs, + comp_out, + &stripe, + &streams, + &pbw_); + } - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back( - write_data_stream(strm_desc, - enc_data.streams[strm_desc.column_id][rowgroups_range.first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back( + write_data_stream(strm_desc, + enc_data.streams[strm_desc.column_id][rowgroups_range.first], + static_cast(compressed_data.data()), + stream_output.get(), + &stripe, + &streams)); + } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + (sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + buffer_.resize((compression_kind_ != NONE) ? 
3 : 0); + pbw_.write(sf); + stripe.footerLength = buffer_.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + buffer_[0] = static_cast(uncomp_sf_len >> 0); + buffer_[1] = static_cast(uncomp_sf_len >> 8); + buffer_[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(buffer_.data(), buffer_.size()); } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(sf); - stripe.footerLength = buffer_.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_sf_len >> 0); - buffer_[1] = static_cast(uncomp_sf_len >> 8); - buffer_[2] = static_cast(uncomp_sf_len >> 16); + for (auto const& task : write_tasks) { + task.wait(); } - out_sink_->host_write(buffer_.data(), buffer_.size()); - } - for (auto const& task : write_tasks) { - task.wait(); - } - if (column_stats.size() != 0) { - // File-level statistics - // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls - if (single_write_mode) { - // First entry contains total number of rows - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(num_rows); - ff.statistics.reserve(1 + orc_table.num_columns()); - ff.statistics.emplace_back(std::move(buffer_)); - // Add file stats, stored after stripe stats in `column_stats` - ff.statistics.insert( - ff.statistics.end(), - std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), - std::make_move_iterator(column_stats.end())); - } - // Stripe-level statistics - size_t first_stripe = md.stripeStats.size(); - md.stripeStats.resize(first_stripe + stripes.size()); - for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { - md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(stripes[stripe_id].numberOfRows); - md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); - for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { - size_t idx = stripes.size() * col_idx + stripe_id; - if (idx < column_stats.size()) { - md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = - std::move(column_stats[idx]); + if (not column_stats.empty()) { + // File-level statistics + // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls + if (single_write_mode) { + // First entry contains total number of rows + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(num_rows); + ff.statistics.reserve(1 + orc_table.num_columns()); + ff.statistics.emplace_back(std::move(buffer_)); + // Add file stats, stored after stripe stats in `column_stats` + ff.statistics.insert( + ff.statistics.end(), + std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), + std::make_move_iterator(column_stats.end())); + } + // Stripe-level statistics + size_t first_stripe = md.stripeStats.size(); + md.stripeStats.resize(first_stripe + stripes.size()); + for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { + md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(stripes[stripe_id].numberOfRows); + md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); + for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { + size_t 
idx = stripes.size() * col_idx + stripe_id;
+            if (idx < column_stats.size()) {
+              md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] =
+                std::move(column_stats[idx]);
+            }
           }
         }
       }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 283715478a0..a7f9aec7bb4 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -179,6 +179,7 @@ class hostdevice_2dvector {
   auto size() const noexcept { return _size; }
   auto count() const noexcept { return _size.first * _size.second; }
+  auto is_empty() const noexcept { return count() == 0; }

   T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); }
   T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); }
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 6b02874146e..dc176992434 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1526,3 +1526,18 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir):
     # Segfaults when RLE stream sizes don't account for varint length
     pa_out = pa.orc.ORCFile(reencoded).read()
     assert_eq(df.to_pandas(), pa_out)
+
+
+def test_empty_columns():
+    buffer = BytesIO()
+    # string and decimal columns have additional steps that need to be skipped
+    expected = cudf.DataFrame(
+        {
+            "string": cudf.Series([], dtype="str"),
+            "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)),
+        }
+    )
+    expected.to_orc(buffer, compression="snappy")
+
+    got_df = cudf.read_orc(buffer)
+    assert_eq(expected, got_df)

From 0c08543955a01470baa4fbdbab927298dcf6afd9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 3 Dec 2021 04:53:37 +0530
Subject: [PATCH 67/72] Update cmake and conda to 22.02 (#9746)

The changes related to the update to 22.02 were missed in one conda
environment recipe (the CUDA 11.5 one); this PR adds them. It also makes
the project version changes in CMake related to the update from 21.12
to 22.02.
Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9746
---
 ci/release/update-version.sh     | 6 +++---
 cpp/CMakeLists.txt               | 2 +-
 cpp/libcudf_kafka/CMakeLists.txt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index eeb76a15fcc..86432a92128 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -30,13 +30,13 @@ function sed_runner() {
 }

 # cpp update
-sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt

 # cpp libcudf_kafka update
-sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt

 # cpp cudf_jni update
-sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt

 # rapids-cmake version
 sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50bdc30b292..e2b317f2e03 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF)

 project(
   CUDF
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES C CXX CUDA
 )
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 435ff3b5987..d0874b57c2d 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -22,7 +22,7 @@ include(rapids-find)

 project(
   CUDA_KAFKA
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES CXX
 )

From ce64e53264d21c6e59fe98548796a7b6bae24c07 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Thu, 2 Dec 2021 20:19:12 -0600
Subject: [PATCH 68/72] Add directory-partitioned data support to cudf.read_parquet (#9720)

Closes #9684
Closes #9690

This PR refactors path handling in `cudf.read_parquet` and uses
`pyarrow.dataset` to add support for directory-partitioned datasets (with
full filtering support at row-group granularity). Since it is my
understanding that some users may wish for directory-partitioned columns
to be represented as a raw dtype (rather than always becoming categorical),
I also added an optional `categorical_partitions` argument (open to
suggestions on a better name). 
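For illustration, a minimal usage sketch of the new behavior; the dataset
path and partition layout below are hypothetical, not part of this PR:

```python
import cudf

# Hypothetical hive-partitioned layout on disk:
#   dataset/c=0/b=a/part.0.parquet, dataset/c=1/b=b/part.0.parquet, ...
df = cudf.read_parquet(
    "dataset",                      # a directory path is now accepted
    filters=[("c", "==", 1)],       # filtered at row-group granularity
    categorical_partitions=False,   # keep partition columns as raw dtypes
)
```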
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9720 --- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/io/parquet.py | 286 +++++++++++++++++++---- python/cudf/cudf/tests/test_parquet.py | 94 +++++++- python/cudf/cudf/tests/test_s3.py | 9 +- python/cudf/cudf/utils/ioutils.py | 26 ++- python/dask_cudf/dask_cudf/io/parquet.py | 7 +- 7 files changed, 355 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index a48cfd07d3f..1f876214b16 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -37,7 +37,7 @@ def read_json( for source in path_or_buf: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs ) source = ioutils.stringify_pathlike(source) source = fs.sep.join([source, "*.json"]) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index ecb1b0cd185..c1cce3f996f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -316,7 +316,7 @@ def read_orc( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9d665d9a0a5..04d64969a16 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -12,6 +12,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like +from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils @@ -80,7 +81,7 @@ def write_to_dataset( kwargs for to_parquet function. """ - fs = ioutils._ensure_filesystem(fs, root_path) + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) fs.mkdirs(root_path, exist_ok=True) metadata = [] @@ -163,11 +164,19 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -def _process_row_groups(paths, fs, filters=None, row_groups=None): +def _process_dataset( + paths, fs, filters=None, row_groups=None, categorical_partitions=True, +): + # Returns: + # file_list - Expanded/filtered list of paths + # row_groups - Filtered list of row-group selections + # partition_keys - list of partition keys for each file + # partition_categories - Categories for each partition # The general purpose of this function is to (1) expand # directory input into a list of paths (using the pyarrow - # dataset API), and (2) to apply row-group filters. 
+ # dataset API), (2) to apply row-group filters, and (3) + # to discover directory-partitioning information # Deal with case that the user passed in a directory name file_list = paths @@ -186,28 +195,107 @@ def _process_row_groups(paths, fs, filters=None, row_groups=None): if len(file_list) == 0: raise FileNotFoundError(f"{paths} could not be resolved to any files") - if filters is not None: - # Load IDs of filtered row groups for each file in dataset - filtered_rg_ids = defaultdict(list) - for fragment in dataset.get_fragments(filter=filters): - for rg_fragment in fragment.split_by_row_group(filters): - for rg_info in rg_fragment.row_groups: - filtered_rg_ids[rg_fragment.path].append(rg_info.id) - - # Initialize row_groups to be selected - if row_groups is None: - row_groups = [None for _ in dataset.files] - - # Store IDs of selected row groups for each file - for i, file in enumerate(dataset.files): - if row_groups[i] is None: - row_groups[i] = filtered_rg_ids[file] - else: - row_groups[i] = filter( - lambda id: id in row_groups[i], filtered_rg_ids[file] + # Deal with directory partitioning + # Get all partition keys (without filters) + partition_categories = defaultdict(list) + file_fragment = None + for file_fragment in dataset.get_fragments(): + keys = ds._get_partition_keys(file_fragment.partition_expression) + if not (keys or partition_categories): + # Bail - This is not a directory-partitioned dataset + break + for k, v in keys.items(): + if v not in partition_categories[k]: + partition_categories[k].append(v) + if not categorical_partitions: + # Bail - We don't need to discover all categories. + # We only need to save the partition keys from this + # first `file_fragment` + break + + if partition_categories and file_fragment is not None: + # Check/correct order of `categories` using last file_frag, + # because `_get_partition_keys` does NOT preserve the + # partition-hierarchy order of the keys. + cat_keys = [ + part.split("=")[0] + for part in file_fragment.path.split(fs.sep) + if "=" in part + ] + if set(partition_categories) == set(cat_keys): + partition_categories = { + k: partition_categories[k] + for k in cat_keys + if k in partition_categories + } + + # If we do not have partitioned data and + # are not filtering, we can return here + if filters is None and not partition_categories: + return file_list, row_groups, [], {} + + # Record initial row_groups input + row_groups_map = {} + if row_groups is not None: + # Make sure paths and row_groups map 1:1 + # and save the initial mapping + if len(paths) != len(file_list): + raise ValueError( + "Cannot specify a row_group selection for a directory path." 
+            )
+        row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)}
+
+    # Apply filters and discover partition columns
+    partition_keys = []
+    if partition_categories or filters is not None:
+        file_list = []
+        if filters is not None:
+            row_groups = []
+        for file_fragment in dataset.get_fragments(filter=filters):
+            path = file_fragment.path
+
+            # Extract hive-partition keys, and make sure they
+            # are ordered the same as they are in `partition_categories`
+            if partition_categories:
+                raw_keys = ds._get_partition_keys(
+                    file_fragment.partition_expression
+                )
+                partition_keys.append(
+                    [
+                        (name, raw_keys[name])
+                        for name in partition_categories.keys()
+                    ]
                )

-    return file_list, row_groups
+            # Apply row-group filtering
+            selection = row_groups_map.get(path, None)
+            if selection is not None or filters is not None:
+                filtered_row_groups = [
+                    rg_info.id
+                    for rg_fragment in file_fragment.split_by_row_group(
+                        filters, schema=dataset.schema,
+                    )
+                    for rg_info in rg_fragment.row_groups
+                ]
+            file_list.append(path)
+            if filters is not None:
+                if selection is None:
+                    row_groups.append(filtered_row_groups)
+                else:
+                    row_groups.append(
+                        [
+                            rg_id
+                            for rg_id in filtered_row_groups
+                            if rg_id in selection
+                        ]
+                    )
+
+    return (
+        file_list,
+        row_groups,
+        partition_keys,
+        partition_categories if categorical_partitions else {},
+    )

 def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs):
@@ -319,6 +407,7 @@ def read_parquet(
     strings_to_categorical=False,
     use_pandas_metadata=True,
     use_python_file_object=False,
+    categorical_partitions=True,
     *args,
     **kwargs,
 ):
@@ -345,17 +434,29 @@ def read_parquet(
     # Start by trying to construct a filesystem object, so we
     # can apply filters on remote file-systems
     fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs)
-    filepath_or_buffer = paths if paths else filepath_or_buffer
-    if fs is None and filters is not None:
-        raise ValueError("cudf cannot apply filters to open file objects.")

-    # Apply filters now (before converting non-local paths to buffers).
-    # Note that `_process_row_groups` will also expand `filepath_or_buffer`
-    # into a full list of files if it is a directory.
-    if fs is not None:
-        filepath_or_buffer, row_groups = _process_row_groups(
-            filepath_or_buffer, fs, filters=filters, row_groups=row_groups,
+    # Use pyarrow dataset to detect/process directory-partitioned
+    # data and apply filters. Note that we can only support partitioned
+    # data and filtering if the input is a single directory or list of
+    # paths.
+    partition_keys = []
+    partition_categories = {}
+    if fs and paths:
+        (
+            paths,
+            row_groups,
+            partition_keys,
+            partition_categories,
+        ) = _process_dataset(
+            paths,
+            fs,
+            filters=filters,
+            row_groups=row_groups,
+            categorical_partitions=categorical_partitions,
        )
+    elif filters is not None:
+        raise ValueError("cudf cannot apply filters to open file objects.")
+    filepath_or_buffer = paths if paths else filepath_or_buffer

     # Check if we should calculate the specific byte-ranges
     # needed for each parquet file. 
We always do this when we
@@ -380,15 +481,6 @@ def read_parquet(
     filepaths_or_buffers = []
     for i, source in enumerate(filepath_or_buffer):
-        if ioutils.is_directory(source, **kwargs):
-            # Note: For now, we know `fs` is an fsspec filesystem
-            # object, but it may be an arrow object in the future
-            fsspec_fs = ioutils._ensure_filesystem(
-                passed_filesystem=fs, path=source
-            )
-            source = ioutils.stringify_pathlike(source)
-            source = fsspec_fs.sep.join([source, "*.parquet"])
-
         tmp_source, compression = ioutils.get_filepath_or_buffer(
             path_or_data=source,
             compression=None,
@@ -410,6 +502,117 @@
         else:
             filepaths_or_buffers.append(tmp_source)

+    # Warn user if they are not using cudf for IO
+    # (There is a good chance this was not the intention)
+    if engine != "cudf":
+        warnings.warn(
+            "Using CPU via PyArrow to read Parquet dataset. "
+            "This option is both inefficient and unstable!"
+        )
+        if filters is not None:
+            warnings.warn(
+                "Parquet row-group filtering is only supported with "
+                "'engine=cudf'. Use pandas or pyarrow API directly "
+                "for full CPU-based filtering functionality."
+            )
+
+    return _parquet_to_frame(
+        filepaths_or_buffers,
+        engine,
+        *args,
+        columns=columns,
+        row_groups=row_groups,
+        skiprows=skiprows,
+        num_rows=num_rows,
+        strings_to_categorical=strings_to_categorical,
+        use_pandas_metadata=use_pandas_metadata,
+        partition_keys=partition_keys,
+        partition_categories=partition_categories,
+        **kwargs,
+    )
+
+
+def _parquet_to_frame(
+    paths_or_buffers,
+    *args,
+    row_groups=None,
+    partition_keys=None,
+    partition_categories=None,
+    **kwargs,
+):
+
+    # If this is not a partitioned read, we only need
+    # one call to `_read_parquet`
+    if not partition_keys:
+        return _read_parquet(
+            paths_or_buffers, *args, row_groups=row_groups, **kwargs,
+        )
+
+    # For partitioned data, we need a distinct read for each
+    # unique set of partition keys. Therefore, we start by
+    # aggregating all paths with matching keys using a dict
+    plan = {}
+    for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)):
+        rgs = row_groups[i] if row_groups else None
+        tkeys = tuple(keys)
+        if tkeys in plan:
+            plan[tkeys][0].append(path)
+            if rgs is not None:
+                plan[tkeys][1].append(rgs)
+        else:
+            plan[tkeys] = ([path], None if rgs is None else [rgs])
+
+    dfs = []
+    for part_key, (key_paths, key_row_groups) in plan.items():
+        # Add new DataFrame to our list
+        dfs.append(
+            _read_parquet(
+                key_paths, *args, row_groups=key_row_groups, **kwargs,
+            )
+        )
+        # Add partition columns to the last DataFrame
+        for (name, value) in part_key:
+            if partition_categories and name in partition_categories:
+                # Build the categorical column from `codes`
+                codes = as_column(
+                    partition_categories[name].index(value),
+                    length=len(dfs[-1]),
+                )
+                dfs[-1][name] = build_categorical_column(
+                    categories=partition_categories[name],
+                    codes=codes,
+                    size=codes.size,
+                    offset=codes.offset,
+                    ordered=False,
+                )
+            else:
+                # Not building categorical columns, so
+                # `value` is already what we want
+                dfs[-1][name] = as_column(value, length=len(dfs[-1]))
+
+    # Concatenate dfs and return.
+    # Assume we can ignore the index if it has no name. 
+ return ( + cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + if len(dfs) > 1 + else dfs[0] + ) + + +def _read_parquet( + filepaths_or_buffers, + engine, + columns=None, + row_groups=None, + skiprows=None, + num_rows=None, + strings_to_categorical=None, + use_pandas_metadata=None, + *args, + **kwargs, +): + # Simple helper function to dispatch between + # cudf and pyarrow to read parquet data if engine == "cudf": return libparquet.read_parquet( filepaths_or_buffers, @@ -421,7 +624,6 @@ def read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - warnings.warn("Using CPU via PyArrow to read Parquet dataset.") return cudf.DataFrame.from_arrow( pq.ParquetDataset(filepaths_or_buffers).read_pandas( columns=columns, *args, **kwargs diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b6595be9566..516ee0d17d3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1578,7 +1578,7 @@ def test_parquet_writer_bytes_io(simple_gdf): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) -def test_parquet_write_partitioned(tmpdir_factory, cols, filename): +def test_parquet_partitioned(tmpdir_factory, cols, filename): # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1597,10 +1597,14 @@ def test_parquet_write_partitioned(tmpdir_factory, cols, filename): gdf_dir, index=False, partition_cols=cols, partition_file_name=filename ) - # Use pandas since dataset may be partitioned - expect = pd.read_parquet(pdf_dir) - got = pd.read_parquet(gdf_dir) - assert_eq(expect, got) + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct if filename: @@ -1629,9 +1633,9 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) - # cudf read_parquet cannot handle partitioned dataset - expect = pd.read_parquet(dir1) - got = pd.read_parquet(dir2) + # Read back with cudf + expect = cudf.read_parquet(dir1) + got = cudf.read_parquet(dir2) assert_eq(expect, got) gdf = cudf.DataFrame( @@ -1645,6 +1649,80 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) +@pytest.mark.parametrize( + "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], +) +@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) +@pytest.mark.parametrize("use_cat", [True, False]) +def test_read_parquet_partitioned_filtered( + tmpdir, pfilters, selection, use_cat +): + path = str(tmpdir) + size = 100 + df = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + df.to_parquet(path, partition_cols=["c", "b"]) + + if selection == "files": + # Pass in a list of paths + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = None + elif selection == "row-groups": + # Pass in a list of paths AND row-group ids + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = [[0] for p in read_path] + else: + # Pass in a directory path + # 
(row-group selection not allowed in this case) + read_path = path + row_groups = None + + # Filter on partitioned columns + expect = pd.read_parquet(read_path, filters=pfilters) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) + if use_cat: + assert got.dtypes["b"] == "category" + assert got.dtypes["c"] == "category" + else: + # Check that we didn't get categorical + # columns, but convert back to categorical + # for comparison with pandas + assert got.dtypes["b"] == "object" + assert got.dtypes["c"] == "int" + got["b"] = pd.Categorical( + got["b"].to_pandas(), categories=list("abcd") + ) + got["c"] = pd.Categorical( + got["c"].to_pandas(), categories=np.arange(4) + ) + assert_eq(expect, got) + + # Filter on non-partitioned column. + # Cannot compare to pandas, since the pyarrow + # backend will filter by row (and cudf can + # only filter by column, for now) + filters = [("a", "==", 10)] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and 10 in got["a"] + + # Filter on both kinds of columns + filters = [[("a", "==", 10)], [("c", "==", 1)]] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"]) + + def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): gdf_fname = tmpdir.join("gdf.parquet") test_path = "test/path" diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index dea876891f8..5738e1f0d00 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -346,12 +346,17 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) -def test_write_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("partition_cols", [None, ["String"]]) +def test_write_parquet(s3_base, s3so, pdf, partition_cols): fname = "test_parquet_writer.parquet" bname = "parquet" gdf = cudf.from_pandas(pdf) with s3_context(s3_base=s3_base, bucket=bname) as s3fs: - gdf.to_parquet("s3://{}/{}".format(bname, fname), storage_options=s3so) + gdf.to_parquet( + "s3://{}/{}".format(bname, fname), + partition_cols=partition_cols, + storage_options=s3so, + ) assert s3fs.exists("s3://{}/{}".format(bname, fname)) got = pd.read_parquet(s3fs.open("s3://{}/{}".format(bname, fname))) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0f9d9d53b23..e6c031acac7 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -154,6 +154,9 @@ strings_to_categorical : boolean, default False If True, return string columns as GDF_CATEGORY dtype; if False, return a as GDF_STRING dtype. +categorical_partitions : boolean, default True + Whether directory-partitioned columns should be interpreted as categorical + or raw dtypes. use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. 
@@ -1129,7 +1132,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs, _, paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) except ValueError as e: @@ -1153,9 +1156,9 @@ def is_directory(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options - ) + )[0] except ValueError as e: if str(e).startswith("Protocol not known"): return False @@ -1189,10 +1192,8 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): else: path_or_data = [path_or_data] - # Pyarrow did not support the protocol or storage options. - # Fall back to fsspec try: - fs, _, fs_paths = fsspec.get_fs_token_paths( + fs, _, fs_paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) return_paths = fs_paths @@ -1322,9 +1323,9 @@ def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs): if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options", {}) path_or_data = os.path.expanduser(path_or_data) - fs, _, _ = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode=mode or "w", storage_options=storage_options - ) + )[0] if not _is_local_filesystem(fs): filepath_or_buffer = fsspec.open( @@ -1513,11 +1514,12 @@ def _prepare_filters(filters): return filters -def _ensure_filesystem(passed_filesystem, path): +def _ensure_filesystem(passed_filesystem, path, **kwargs): if passed_filesystem is None: - return get_fs_token_paths(path[0] if isinstance(path, list) else path)[ - 0 - ] + return get_fs_token_paths( + path[0] if isinstance(path, list) else path, + storage_options=kwargs.get("storage_options", {}), + )[0] return passed_filesystem diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index b47a5e78095..a49d73493ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,11 +126,8 @@ def _read_paths( # Build the column from `codes` directly # (since the category is often a larger dtype) - codes = ( - as_column(partitions[i].keys.index(index2)) - .as_frame() - .repeat(len(df)) - ._data[None] + codes = as_column( + partitions[i].keys.index(index2), length=len(df), ) df[name] = build_categorical_column( categories=partitions[i].keys, From e82cc62e2ea61211c64ba4784cb131d5b535644c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 04:46:25 -0800 Subject: [PATCH 69/72] Fix join of MultiIndex to Index with one column and overlapping name. 
(#9830) This PR resolves #9823 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9830 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..2fcc976d8e1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1147,14 +1147,14 @@ def join( if isinstance(lhs, cudf.MultiIndex): if level is not None and isinstance(level, int): on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names + right_names = (on,) if on is not None else right_names on = right_names[0] if how == "outer": how = "left" elif how == "right": how = "inner" else: - # Both are nomal indices + # Both are normal indices right_names = left_names on = right_names[0] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0518cc2c9b9..d25c6130bfb 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2150,3 +2150,16 @@ def test_join_redundant_params(): lhs.merge(rhs, right_on="a", left_index=True, right_index=True) with pytest.raises(ValueError): lhs.merge(rhs, left_on="c", right_on="b") + + +def test_join_multiindex_index(): + # test joining a MultiIndex with an Index with overlapping name + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) + rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index + expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") + got = lhs.join(rhs, how="inner") + assert_join_results_equal(expect, got, how="inner") From 69e6dbbf447a951e4b08f15c737eedcbaf3291da Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 3 Dec 2021 10:18:04 -0500 Subject: [PATCH 70/72] Move the binary_ops common dispatcher logic to be executed on the CPU (#9816) * move NullEquals to separate file * To improve runtime performance move more binary_ops dispatch to host * make sure to forceinline the operator_dispatcher * Correct style issues found by ci * Expand the binary-op compiled benchmark suite * Ensure forceinline is on binary ops device dispatch functions * Correct style issues found by ci Co-authored-by: Karthikeyan Natarajan Co-authored-by: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> --- cpp/CMakeLists.txt | 1 + .../binaryop/compiled_binaryop_benchmark.cpp | 66 ++++++++++--------- .../cudf/utilities/type_dispatcher.hpp | 14 ++-- cpp/src/binaryop/compiled/NullEquals.cu | 26 ++++++++ cpp/src/binaryop/compiled/binary_ops.cu | 2 +- cpp/src/binaryop/compiled/binary_ops.cuh | 63 ++++++++++++------ cpp/src/binaryop/compiled/equality_ops.cu | 41 ++++++++---- 7 files changed, 141 insertions(+), 72 deletions(-) create mode 100644 cpp/src/binaryop/compiled/NullEquals.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 59dc3c74af2..37f93f1868b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,6 +185,7 @@ add_library( src/binaryop/compiled/LogicalOr.cu src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullEquals.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu diff --git 
a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index bc0818ace4b..8d04f8bdcb2 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -50,14 +50,14 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ +#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \ BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ (::benchmark::State & st) \ { \ BM_compiled_binaryop(st, cudf::binary_operator::binop); \ } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ ->Unit(benchmark::kMicrosecond) \ ->UseManualTime() \ ->Arg(10000) /* 10k */ \ @@ -70,30 +70,36 @@ using namespace cudf; using namespace numeric; // clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s); +BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms); +BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, 
int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms); +BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index a04b8309142..d7d38aba4f3 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -531,7 +531,7 @@ template struct double_type_dispatcher_second_type { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const + CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -541,9 +541,7 @@ template