From 4e34a20a31fae2546f9cfbaa520d7561b80563c7 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 1 Jul 2024 11:18:25 -0500
Subject: [PATCH 001/101] Backport: Fix segfault in conditional join (#16094)
 (#16100)

Backports #16094 to 24.06 for inclusion in a hotfix release.
---
 cpp/src/join/conditional_join.cu         | 13 +---
 cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++-------
 2 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f02dee5f7f5..97a06d5a923 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -48,8 +48,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
 {
   if (right.num_rows() == 0) {
     switch (join_type) {
-      case join_kind::LEFT_ANTI_JOIN:
-        return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+      case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first;
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
       default: CUDF_FAIL("Invalid join kind."); break;
@@ -96,10 +95,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }

-  if (left.num_rows() == 0) {
-    return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
-  }
-
   rmm::device_scalar<size_type> write_index(0, stream);

   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -149,8 +144,7 @@ conditional_join(table_view const& left,
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr);
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN:
@@ -169,8 +163,7 @@ conditional_join(table_view const& left,
           std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped =
-          get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 79968bcd7f4..7ab4a2ea465 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -20,6 +20,7 @@
 #include
 #include
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include
 #include
 #include
@@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
                     std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
-    for (size_t i = 0; i < result.first->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
-                              result.second->element(i, cudf::get_default_stream())});
-    }
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result = this->join(left, right, predicate);
+    auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
+    auto rhs_result =
+      cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
+    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
+    std::transform(lhs_result.begin(),
+                   lhs_result.end(),
+                   rhs_result.begin(),
+                   result_pairs.begin(),
+                   [](cudf::size_type lhs, cudf::size_type rhs) {
+                     return std::pair{lhs, rhs};
+                   });
     std::sort(result_pairs.begin(), result_pairs.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());

-    EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
+    EXPECT_TRUE(std::equal(
+      expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
   }

   /*
@@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };

+TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
 {
   this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
@@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };

+TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
 {
   auto [left, right] = gen_random_repeated_columns<TypeParam>();
@@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
              {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
 };

+TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}},
@@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
       this->parse_input(left_data, right_data);
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<cudf::size_type> resulting_indices;
-    for (size_t i = 0; i < result->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
-    }
-    std::sort(resulting_indices.begin(), resulting_indices.end());
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result = this->join(left, right, predicate);
+    auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
+    std::sort(result_indices.begin(), result_indices.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
-    EXPECT_TRUE(
-      std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
+    EXPECT_TRUE(std::equal(result_indices.begin(),
+                           result_indices.end(),
+                           expected_outputs.begin(),
+                           expected_outputs.end()));
   }

   void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
@@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {

 TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);

+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
@@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {

 TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);

+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
+};
+
 TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});

From e41242094092f9ed31fd4d04f8a30107c1ffb2ff Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 1 Jul 2024 11:24:52 -0700
Subject: [PATCH 002/101] Backport #16038 to 24.06 (#16101)

Backporting #16038 for a patch release.

---------

Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
---
 cpp/include/cudf/ast/detail/operators.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index b618f33a6e5..c483d459833 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -17,6 +17,7 @@

 #include
 #include
+#include
 #include
 #include

@@ -819,7 +820,17 @@ struct operator_functor {
 template <typename To>
 struct cast {
   static constexpr auto arity{1};
-  template <typename From>
+  template <typename From, std::enable_if_t<is_fixed_point<From>()>* = nullptr>
+  __device__ inline auto operator()(From f) -> To
+  {
+    if constexpr (cuda::std::is_floating_point_v<To>) {
+      return convert_fixed_to_floating<To>(f);
+    } else {
+      return static_cast<To>(f);
+    }
+  }
+
+  template <typename From, std::enable_if_t<!is_fixed_point<From>()>* = nullptr>
   __device__ inline auto operator()(From f) -> decltype(static_cast<To>(f))
   {
     return static_cast<To>(f);

From dfab1b589e5907b324dc1688f6dab862d194012c Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 1 Jul 2024 15:33:42 -0500
Subject: [PATCH 003/101] Backport: Use size_t to allow large conditional joins
 (#16127) (#16133)

Backports #16127 to 24.06 for inclusion in a hotfix release.
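For context, a rough sketch (not part of the patch) of the overflow this
guards against: the number of index pairs a conditional join can emit easily
exceeds a signed 32-bit `size_type`, even for modest inputs. The row counts
below are hypothetical:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  // A near-cross conditional join of two 50,000-row tables produces
  // 2,500,000,000 candidate pairs, which overflows a 32-bit counter
  // (INT32_MAX is 2,147,483,647) but fits comfortably in std::size_t.
  std::int64_t const left_rows  = 50'000;
  std::int64_t const right_rows = 50'000;
  std::cout << left_rows * right_rows << " > " << INT32_MAX << '\n';
  return 0;
}
```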
---------

Co-authored-by: Vyas Ramasubramani <vyasr@nvidia.com>
---
 cpp/src/join/conditional_join.cu          |   5 +-
 cpp/src/join/conditional_join_kernels.cuh | 124 ++++++++++++++++++++--
 cpp/src/join/join_common_utils.cuh        |  95 -----------------
 3 files changed, 117 insertions(+), 107 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 97a06d5a923..d4ef2747c9d 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -95,7 +95,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }

-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);

   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);

@@ -232,13 +232,14 @@ conditional_join(table_view const& left,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }

-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);

   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);

   auto const& join_output_l = left_indices->data();
   auto const& join_output_r = right_indices->data();
+
   if (has_nulls) {
     conditional_join<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, true>
      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 1e16c451f5a..62769862f54 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -29,6 +29,110 @@
 namespace cudf {
 namespace detail {

+/**
+ * @brief Adds a pair of indices to the shared memory cache
+ *
+ * @param[in] first The first index in the pair
+ * @param[in] second The second index in the pair
+ * @param[in,out] current_idx_shared Pointer to shared index that determines
+ * where in the shared memory cache the pair will be written
+ * @param[in] warp_id The ID of the warp of the calling the thread
+ * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
+ * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
+ */
+__inline__ __device__ void add_pair_to_cache(size_type const first,
+                                             size_type const second,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l,
+                                             size_type* joined_shared_r)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed);
+  // It's guaranteed to fit into the shared cache
+  joined_shared_l[my_current_idx] = first;
+  joined_shared_r[my_current_idx] = second;
+}
+
+__inline__ __device__ void add_left_to_cache(size_type const first,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed);
+  joined_shared_l[my_current_idx] = first;
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type join_shared_r[num_warps][output_cache_size],
+                                   size_type* join_output_l,
+                                   size_type* join_output_r)
+{
+  // count how many active threads participating here which could be less than warp_size
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  // No warp sync is necessary here because we are assuming that ShuffleIndex
+  // is internally using post-CUDA 9.0 synchronization-safe primitives
+  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
+  // be safe by the compiler because it is not required by the standard to
+  // converge divergent branches before executing.
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
+    }
+  }
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type* join_output_l)
+{
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+    }
+  }
+}
+
 /**
  * @brief Computes the output size of joining the left table to the right table.
  *
@@ -103,14 +207,14 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     }
   }

   using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter);

   // Add block counter to global counter
   if (threadIdx.x == 0) {
     cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*output_size};
-    ref.fetch_add(block_counter, cuda::std::memory_order_relaxed);
+    ref.fetch_add(block_counter, cuda::memory_order_relaxed);
   }
 }

@@ -143,13 +247,13 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
                                   join_kind join_type,
                                   cudf::size_type* join_output_l,
                                   cudf::size_type* join_output_r,
-                                  cudf::size_type* current_idx,
+                                  std::size_t* current_idx,
                                   cudf::ast::detail::expression_device_view device_expression_data,
-                                  cudf::size_type const max_size,
+                                  std::size_t const max_size,
                                   bool const swap_tables)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
   __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size];

@@ -183,7 +287,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
   if (outer_row_index < outer_num_rows) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
       auto const left_row_index = swap_tables ? inner_row_index : outer_row_index;
@@ -277,12 +381,12 @@ CUDF_KERNEL void conditional_join_anti_semi(
   table_device_view right_table,
   join_kind join_type,
   cudf::size_type* join_output_l,
-  cudf::size_type* current_idx,
+  std::size_t* current_idx,
   cudf::ast::detail::expression_device_view device_expression_data,
-  cudf::size_type const max_size)
+  std::size_t const max_size)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];

   extern __shared__ char raw_intermediate_storage[];
@@ -310,7 +414,7 @@ CUDF_KERNEL void conditional_join_anti_semi(
   for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
        outer_row_index += stride) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 31f267d5cfb..3d0f3e4340d 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -262,101 +262,6 @@ struct valid_range {
   }
 };

-/**
- * @brief Adds a pair of indices to the shared memory cache
- *
- * @param[in] first The first index in the pair
- * @param[in] second The second index in the pair
- * @param[in,out] current_idx_shared Pointer to shared index that determines
- * where in the shared memory cache the pair will be written
- * @param[in] warp_id The ID of the warp of the calling the thread
- * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
- * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
- */
-__inline__ __device__ void add_pair_to_cache(size_type const first,
-                                             size_type const second,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l,
-                                             size_type* joined_shared_r)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-  // its guaranteed to fit into the shared cache
-  joined_shared_l[my_current_idx] = first;
-  joined_shared_r[my_current_idx] = second;
-}
-
-__inline__ __device__ void add_left_to_cache(size_type const first,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-
-  joined_shared_l[my_current_idx] = first;
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type join_shared_r[num_warps][output_cache_size],
-                                   size_type* join_output_l,
-                                   size_type* join_output_r)
-{
-  // count how many active threads participating here which could be less than warp_size
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  // No warp sync is necessary here because we are assuming that ShuffleIndex
-  // is internally using post-CUDA 9.0 synchronization-safe primitives
-  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
-  // be safe by the compiler because it is not required by the standard to
-  // converge divergent branches before executing.
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
-    }
-  }
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type* join_output_l)
-{
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-    }
-  }
-}
-
 }  // namespace detail
 }  // namespace cudf

From 781794bb52448f617351ed96441a8e2fdb765dd7 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 1 Jul 2024 14:59:04 -0700
Subject: [PATCH 004/101] Backport #16045 to 24.06 (#16102)

Backporting #16045 for a patch release.

---------

Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
---
 cpp/tests/ast/transform_tests.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index ef1d09e5652..6b350c137d0 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
 }

+TEST_F(TransformTest, BasicAdditionDoubleCast)
+{
+  auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
+  std::vector<__int128_t> data1{10, 7, 20, 0};
+  auto c_1 = cudf::test::fixed_point_column_wrapper<__int128_t>(
+    data1.begin(), data1.end(), numeric::scale_type{0});
+  auto table      = cudf::table_view{{c_0, c_1}};
+  auto col_ref_0  = cudf::ast::column_reference(0);
+  auto col_ref_1  = cudf::ast::column_reference(1);
+  auto cast       = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast);
+  auto expected   = column_wrapper<double>{13, 27, 21, 50};
+  auto result     = cudf::compute_column(table, expression);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, Literal)
 {
   auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};

From 3aedeeaaaa08bb99695bbbc34098a5660e4c94e0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:40:23 -0500
Subject: [PATCH 005/101] `cudf-polars` string slicing (#16082)

This PR plumbs the libcudf/pylibcudf `slice_strings` function through to
cudf-polars.
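As a minimal illustration (the frame and column name here are invented for
the example), this is the kind of expression that now runs through the GPU
executor instead of falling back:

```python
import polars as pl

ldf = pl.LazyFrame({"a": ["dogs", "cats", "fish"]})
# str.slice with literal offset/length is translated to
# pylibcudf's slice_strings by cudf-polars.
print(ldf.select(pl.col("a").str.slice(1, 3)).collect())
```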
Depends on https://github.com/rapidsai/cudf/pull/15988

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16082
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 36 +++++++++++++++
 .../tests/expressions/test_stringfunction.py  | 46 +++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index fe859c8d958..cfc2947f8de 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -703,6 +703,7 @@ def _validate_input(self):
             pl_expr.StringFunction.EndsWith,
             pl_expr.StringFunction.StartsWith,
             pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.Slice,
         ):
             raise NotImplementedError(f"String function {self.name}")
         if self.name == pl_expr.StringFunction.Contains:
@@ -716,6 +717,11 @@ def _validate_input(self):
                     raise NotImplementedError(
                         "Regex contains only supports a scalar pattern"
                     )
+        elif self.name == pl_expr.StringFunction.Slice:
+            if not all(isinstance(child, Literal) for child in self.children[1:]):
+                raise NotImplementedError(
+                    "Slice only supports literal start and stop values"
+                )

     def do_evaluate(
         self,
@@ -744,6 +750,36 @@ def do_evaluate(
                 flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
             )
             return Column(plc.strings.contains.contains_re(column.obj, prog))
+        elif self.name == pl_expr.StringFunction.Slice:
+            child, expr_offset, expr_length = self.children
+            assert isinstance(expr_offset, Literal)
+            assert isinstance(expr_length, Literal)
+
+            column = child.evaluate(df, context=context, mapping=mapping)
+            # libcudf slices via [start,stop).
+            # polars slices with offset + length where start == offset
+            # stop = start + length. Negative values for start look backward
+            # from the last element of the string. If the end index would be
+            # below zero, an empty string is returned.
+            # Do this maths on the host
+            start = expr_offset.value.as_py()
+            length = expr_length.value.as_py()
+
+            if length == 0:
+                stop = start
+            else:
+                # No length indicates a scan to the end
+                # The libcudf equivalent is a null stop
+                stop = start + length if length else None
+                if length and start < 0 and length >= -start:
+                    stop = None
+            return Column(
+                plc.strings.slice.slice_strings(
+                    column.obj,
+                    plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
+                    plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
+                )
+            )
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 9729e765948..8cf65dd51ac 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -37,6 +37,30 @@ def ldf(with_nulls):
     return pl.LazyFrame({"a": a, "b": range(len(a))})


+slice_cases = [
+    (1, 3),
+    (0, 3),
+    (0, 0),
+    (-3, 1),
+    (-100, 5),
+    (1, 1),
+    (100, 100),
+    (-3, 4),
+    (-3, 3),
+]
+
+
+@pytest.fixture(params=slice_cases)
+def slice_column_data(ldf, request):
+    start, length = request.param
+    if length:
+        return ldf.with_columns(
+            pl.lit(start).alias("start"), pl.lit(length).alias("length")
+        )
+    else:
+        return ldf.with_columns(pl.lit(start).alias("start"))
+
+
 def test_supported_stringfunction_expression(ldf):
     query = ldf.select(
         pl.col("a").str.starts_with("Z"),
@@ -104,3 +128,25 @@ def test_contains_invalid(ldf):
         query.collect()
     with pytest.raises(pl.exceptions.ComputeError):
         query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
+
+
+@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100])
+def test_slice_scalars_offset(ldf, offset):
+    query = ldf.select(pl.col("a").str.slice(offset))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize("offset,length", slice_cases)
+def test_slice_scalars_length_and_offset(ldf, offset, length):
+    query = ldf.select(pl.col("a").str.slice(offset, length))
+    assert_gpu_result_equal(query)
+
+
+def test_slice_column(slice_column_data):
+    if "length" in slice_column_data.collect_schema():
+        query = slice_column_data.select(
+            pl.col("a").str.slice(pl.col("start"), pl.col("length"))
+        )
+    else:
+        query = slice_column_data.select(pl.col("a").str.slice(pl.col("start")))
+    assert_ir_translation_raises(query, NotImplementedError)

From 39de5a2527b297ba79c625993a49b28c3baf5b00 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Thu, 4 Jul 2024 06:49:06 +1000
Subject: [PATCH 006/101] Refactor from_arrow_device/host to use resource_ref
 (#16160)

Fixes #16159

Also fixes typos / leftovers in dictionary `add_keys` copydocs.
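For illustration only (this sketch is not part of the patch, and the caller
shown is hypothetical): `rmm::device_async_resource_ref` binds to a raw
`device_memory_resource*`, so existing call sites keep compiling after the
signature change:

```cpp
#include <cudf/interop.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

// The schema and array are assumed to come from an Arrow producer elsewhere.
std::unique_ptr<cudf::table> convert(ArrowSchema const* schema,
                                     ArrowDeviceArray const* input)
{
  // A device_memory_resource* converts implicitly to the new ref type.
  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
  return cudf::from_arrow_host(schema, input, cudf::get_default_stream(), mr);
}
```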
Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16160
---
 .../cudf/dictionary/detail/update_keys.hpp | 10 +++----
 cpp/include/cudf/interop.hpp               | 29 ++++++++++---------
 cpp/src/interop/from_arrow_device.cu       | 27 ++++++++---------
 cpp/src/interop/from_arrow_host.cu         | 19 ++++++------
 cpp/src/interop/from_arrow_stream.cu       |  6 ++--
 5 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index e8486a80afc..9cdda773dbb 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -29,7 +29,7 @@ namespace dictionary {
 namespace detail {
 /**
  * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -40,7 +40,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,

 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -51,7 +51,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col

 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
 */
@@ -61,7 +61,7 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction

 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
 *
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
@@ -72,7 +72,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column

 /**
  * @copydoc
- * cudf::dictionary::match_dictionaries(std::vector<dictionary_column_view>,mm::mr::device_memory_resource*)
+ * cudf::dictionary::match_dictionaries(std::vector<dictionary_column_view>,rmm::device_async_resource_ref)
 *
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 502ffb9ba4f..11f6ce2bad7 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -39,6 +39,7 @@

 #include
 #include
+#include <rmm/resource_ref.hpp>

 #include

@@ -372,8 +373,8 @@ std::unique_ptr<table> from_arrow(
 std::unique_ptr<table> from_arrow(
   ArrowSchema const* schema,
   ArrowArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
@@ -391,8 +392,8 @@ std::unique_ptr<table> from_arrow(
 std::unique_ptr<column> from_arrow_column(
   ArrowSchema const* schema,
   ArrowArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief Create `cudf::table` from given ArrowDeviceArray input
@@ -415,8 +416,8 @@ std::unique_ptr<column> from_arrow_column(
 std::unique_ptr<table> from_arrow_host(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief Create `cudf::table` from given ArrowArrayStream input
@@ -433,8 +434,8 @@ std::unique_ptr<table> from_arrow_host(
 */
 std::unique_ptr<table> from_arrow_stream(
   ArrowArrayStream* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief Create `cudf::column` from given ArrowDeviceArray input
@@ -456,8 +457,8 @@ std::unique_ptr<table> from_arrow_stream(
 std::unique_ptr<column> from_arrow_host_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
@@ -537,8 +538,8 @@ using unique_table_view_t =
 unique_table_view_t from_arrow_device(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /**
  * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter
@@ -580,8 +581,8 @@ using unique_column_view_t =
 unique_column_view_t from_arrow_device_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index 73c1a474310..e1d289e67a3 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include <rmm/resource_ref.hpp>

 #include
 #include
@@ -56,7 +57,7 @@ struct dispatch_from_arrow_device {
                                data_type,
                                bool,
                                rmm::cuda_stream_view,
-                               rmm::mr::device_memory_resource*)
+                               rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
   }
@@ -68,7 +69,7 @@ struct dispatch_from_arrow_device {
                                data_type type,
                                bool skip_mask,
                                rmm::cuda_stream_view,
-                               rmm::mr::device_memory_resource*)
+                               rmm::device_async_resource_ref mr)
   {
     size_type const num_rows = input->length;
     size_type const offset   = input->offset;
@@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);

 template <>
 dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* schema,
@@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* s
                                                               data_type type,
                                                               bool skip_mask,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr)
+                                                              rmm::device_async_resource_ref mr)
 {
   if (input->length == 0) {
     return std::make_tuple(
@@ -141,7 +142,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING,
               "Large strings are not yet supported in from_arrow_device",
@@ -182,7 +183,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   ArrowSchemaView keys_schema_view;
   NANOARROW_THROW_NOT_OK(
@@ -238,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::struct_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::vector<column_view> children;
   owned_columns_t out_owned_cols;
@@ -283,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::list_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   size_type const num_rows = input->length;
   size_type const offset   = input->offset;
@@ -324,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -342,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
               "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -397,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
               "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -429,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();

@@ -439,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index b7e07056686..b3087dedf98 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include <rmm/resource_ref.hpp>

 #include
 #include
@@ -49,7 +50,7 @@ namespace {

 struct dispatch_copy_from_arrow_host {
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;

   std::unique_ptr<rmm::device_buffer> get_mask_buffer(ArrowArray const* array)
   {
@@ -131,7 +132,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);

 template <>
 std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
@@ -388,7 +389,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -405,7 +406,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
               "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -441,7 +442,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
               "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -462,7 +463,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();

@@ -472,7 +473,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();

@@ -482,7 +483,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
                                   ArrowArray const* input,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();

@@ -497,7 +498,7 @@ std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_column(ArrowSchema const* schema,
                                           ArrowArray const* input,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu
index 0c85b561944..578105aa90a 100644
--- a/cpp/src/interop/from_arrow_stream.cu
+++ b/cpp/src/interop/from_arrow_stream.cu
@@ -41,7 +41,7 @@ namespace {

 std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   ArrowSchemaView schema_view;
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
@@ -81,7 +81,7 @@ std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument);
@@ -135,7 +135,7 @@ std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
 std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_arrow_stream(input, stream, mr);

From dab6a447ca418073ec50c4e95aee5f0448fc95c2 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:30:24 -0700
Subject: [PATCH 007/101] Add environment-agnostic `ci/run_cudf_polars_pytest.sh`
 (#16178)

Adds environment-agnostic `ci/run_cudf_polars_pytest.sh` script, similar to
the scripts added in https://github.com/rapidsai/cudf/pull/14992.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16178
---
 ci/run_cudf_polars_pytests.sh | 11 +++++++++++
 ci/test_cudf_polars.sh        |  6 ++----
 2 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100755 ci/run_cudf_polars_pytests.sh

diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
new file mode 100755
index 00000000000..78683b057a5
--- /dev/null
+++ b/ci/run_cudf_polars_pytests.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level.
+
+# Support invoking run_cudf_polars_pytests.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/
+
+pytest --cache-clear "$@" tests
diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh
index 95fb4b431bf..ca98c4dadb3 100755
--- a/ci/test_cudf_polars.sh
+++ b/ci/test_cudf_polars.sh
@@ -42,13 +42,11 @@ EXITCODE=0
 trap set_exitcode ERR
 set +e

-python -m pytest \
-       --cache-clear \
+./ci/run_cudf_polars_pytests.sh \
        --cov cudf_polars \
        --cov-fail-under=100 \
        --cov-config=python/cudf_polars/pyproject.toml \
-       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \
-       python/cudf_polars/tests
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml"

 trap ERR
 set -e

From 769e94ffcebaabe33ddec4ab8f178f6d1c7545aa Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:31:28 -0700
Subject: [PATCH 008/101] Make `test_python_cudf_pandas` generate
 `requirements.txt` (#16181)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16181
---
 dependencies.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index e3f8a72e76c..6d4ba0c38d1 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -755,7 +755,7 @@ dependencies:
       - {matrix: null, packages: *cupy_packages_cu11}
   test_python_pandas_cudf:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
        packages:
          # dependencies to run pandas tests
          # https://github.com/pandas-dev/pandas/blob/main/environment.yml
@@ -766,7 +766,7 @@ dependencies:
          - pytest-reportlog
   test_python_cudf_pandas:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
        packages:
          - ipython
          - openpyxl

From aa4033c5fe0be9e3d235d5722f1030c60b04e34d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 Jul 2024 10:10:02 +0100
Subject: [PATCH 009/101] Cast count aggs to correct dtype in translation
 (#16192)

Polars default dtypes for some aggregations, particularly count,
don't match ours, so insert casts.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16192
---
 python/cudf_polars/cudf_polars/dsl/translate.py | 17 +++++++++++++----
 python/cudf_polars/tests/test_groupby.py        |  5 +----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index a2fdb3c3d79..0019b3aa98a 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -432,8 +432,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
-    else:
-        return expr.Cast(dtype, inner)
+    elif isinstance(inner, expr.Cast):
+        # Translation of Len/Count-agg put in a cast, remove double
+        # casts if we have one.
+        (inner,) = inner.children
+    return expr.Cast(dtype, inner)


 @_translate_expr.register
@@ -443,12 +446,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr

 @_translate_expr.register
 def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Agg(
+    value = expr.Agg(
         dtype,
         node.name,
         node.options,
         *(translate_expr(visitor, n=n) for n in node.arguments),
     )
+    if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(value.dtype, value)
+    return value


 @_translate_expr.register
@@ -475,7 +481,10 @@ def _(

 @_translate_expr.register
 def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Len(dtype)
+    value = expr.Len(dtype)
+    if dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(dtype, value)
+    return value  # pragma: no cover; never reached since polars len has uint32 dtype
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index aefad59eb91..8a6732b7063 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -83,10 +83,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
 def test_groupby_len(df, keys):
     q = df.group_by(*keys).agg(pl.len())

-    # TODO: polars returns UInt32, libcudf returns Int32
-    with pytest.raises(AssertionError):
-        assert_gpu_result_equal(q, check_row_order=False)
-    assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False)
+    assert_gpu_result_equal(q, check_row_order=False)


 @pytest.mark.parametrize(

From 5f57bc9034311f5461981644dec86c9c2e3434c7 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 Jul 2024 11:55:36 +0100
Subject: [PATCH 010/101] Some small fixes in cudf-polars (#16191)

These catch a few more edge cases.
Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16191
---
 python/cudf_polars/cudf_polars/callback.py             | 13 +++++++++++--
 python/cudf_polars/cudf_polars/containers/dataframe.py |  6 +++++-
 python/cudf_polars/cudf_polars/dsl/ir.py               |  2 ++
 python/cudf_polars/tests/test_union.py                 |  9 +++++++++
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 979087d5273..764cdd3b3ca 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -34,7 +34,12 @@ def _callback(
     return ir.evaluate(cache={}).to_polars()


-def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None:
+def execute_with_cudf(
+    nt: NodeTraverser,
+    *,
+    raise_on_fail: bool = False,
+    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.

@@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None
         Should conversion raise an exception rather than continuing
         without setting a callback.

+    exception
+        Optional exception, or tuple of exceptions, to catch during
+        translation. Defaults to ``Exception``.
+
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
             nt.set_udf(partial(_callback, translate_ir(nt)))
-    except NotImplementedError:
+    except exception:
         if raise_on_fail:
             raise
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index ec8d00c3123..d86656578d7 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -5,6 +5,7 @@

 from __future__ import annotations

+import itertools
 from functools import cached_property
 from typing import TYPE_CHECKING, cast

@@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
         -----
         If column names overlap, newer names replace older ones.
         """
-        return type(self)([*self.columns, *columns])
+        columns = list(
+            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
+        )
+        return type(self)(columns)

     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 9b3096becd4..31a0be004ea 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -96,6 +96,8 @@ def broadcast(
     ``target_length`` is provided and not all columns are length-1
     (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
     """
+    if len(columns) == 0:
+        return []
     lengths: set[int] = {column.obj.size() for column in columns}
     if lengths == {1}:
         if target_length is None:
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index b021d832910..865b95a7d91 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -46,3 +46,12 @@ def test_concat_vertical():
     q = pl.concat([ldf, ldf2], how="vertical")

     assert_gpu_result_equal(q)
+
+
+def test_concat_diagonal_empty():
+    df1 = pl.LazyFrame()
+    df2 = pl.LazyFrame({"a": [1, 2]})
+
+    q = pl.concat([df1, df2], how="diagonal_relaxed")
+
+    assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})

From c1c62f1c02cf3929fb7536d67d14a24a9e2950ea Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 4 Jul 2024 04:31:06 -1000
Subject: [PATCH 011/101] Fix `memory_usage` when calculating nested list column
 (#16193)

The offset column of a nested empty list column may be empty as discussed in
https://github.com/rapidsai/cudf/issues/16164. `ListColumn.memory_usage`
assumed that this column was non-empty.

Unblocks https://github.com/rapidsai/cuspatial/pull/1400

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16193
---
 python/cudf/cudf/core/column/lists.py | 11 ++++++++---
 python/cudf/cudf/tests/test_list.py   | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index c548db67344..1992d471947 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -73,10 +73,15 @@ def memory_usage(self):
                 child0_size = (
                     current_base_child.size + 1 - current_offset
                 ) * current_base_child.base_children[0].dtype.itemsize
-                current_offset = current_base_child.base_children[
-                    0
-                ].element_indexing(current_offset)
                 n += child0_size
+                current_offset_col = current_base_child.base_children[0]
+                if not len(current_offset_col):
+                    # See https://github.com/rapidsai/cudf/issues/16164 why
+                    # offset column can be uninitialized
+                    break
+                current_offset = current_offset_col.element_indexing(
+                    current_offset
+                )
                 current_base_child = current_base_child.base_children[1]

             n += (
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f76143cb381..ec9d7995b05 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -12,6 +12,7 @@
 from cudf import NA
 from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
+from cudf.core.column.column import column_empty
 from cudf.testing import assert_eq
 from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES

@@ -926,3 +927,29 @@ def test_list_iterate_error():
 def test_list_struct_list_memory_usage():
     df = cudf.DataFrame({"a": [[{"b": [1]}]]})
     assert df.memory_usage().sum() == 16
+
+
+def test_empty_nested_list_uninitialized_offsets_memory_usage():
+    col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
+    nested_col = col.children[1]
+    empty_inner = type(nested_col)(
+        size=nested_col.size,
+        dtype=nested_col.dtype,
+        mask=nested_col.mask,
+        offset=nested_col.offset,
+        null_count=nested_col.null_count,
+        children=(
+            column_empty(0, nested_col.children[0].dtype),
+            nested_col.children[1],
+        ),
+    )
+    col_empty_offset = type(col)(
+        size=col.size,
+        dtype=col.dtype,
+        mask=col.mask,
+        offset=col.offset,
+        null_count=col.null_count,
+        children=(column_empty(0, col.children[0].dtype), empty_inner),
+    )
+    ser = cudf.Series._from_data({None: col_empty_offset})
+    assert ser.memory_usage() == 8

From f3a1216bb9bac07667b05cef01fe007fe6dc52ce Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 4 Jul 2024 12:49:10 -0400
Subject: [PATCH 012/101] Migrate lists/modifying to pylibcudf (#16185)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16185
---
 .../_lib/pylibcudf/libcudf/lists/reverse.pxd  | 14 ++++++++++
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 26 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 12 +++++++++
 4 files changed, 54 insertions(+)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
new file mode 100644
index 00000000000..0382a5d42c3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+
+
+cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] reverse(
+        const lists_column_view& lists_column,
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 2ccf0139e90..c9d0a84e8ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar)
 cpdef Column contains_nulls(Column)

 cpdef Column index_of(Column, ColumnOrScalar, bool)
+
+cpdef Column reverse(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index a94d940accd..651f1346f88 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    reverse as cpp_reverse,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
@@ -206,3 +207,28 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
         find_option,
     ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column reverse(Column input):
+    """Reverse the element order within each list of the input column.
+
+    For details, see :cpp:func:`reverse`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column with reversed lists.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    with nogil:
+        c_result = move(cpp_reverse.reverse(
+            list_view.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index c781126e388..58a1dcf8d56 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -134,3 +134,15 @@ def test_index_of_list_column(test_data, column):
     expect = pa.array(column[1], type=pa.int32())

     assert_column_eq(expect, res)
+
+
+def test_reverse(test_data):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.reverse(plc_column)
+
+    expect = pa.array([lst[::-1] for lst in list_column])
+
+    assert_column_eq(expect, res)

From ae422187743af5b9081028de7405b9ded73787b8 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 5 Jul 2024 12:27:50 +0100
Subject: [PATCH 013/101] Expose type traits to pylibcudf (#16197)

Rather than recreating the classification, OAOO by using the libcudf
definitions.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16197
---
 .../user_guide/api_docs/pylibcudf/index.rst    |   7 +-
 .../user_guide/api_docs/pylibcudf/traits.rst   |   6 +
 python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd   |   3 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py    |   4 +
 .../pylibcudf/libcudf/utilities/traits.pxd     |  27 ++++
 python/cudf/cudf/_lib/pylibcudf/traits.pxd     |  25 +++
 python/cudf/cudf/_lib/pylibcudf/traits.pyx     | 151 ++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/common/utils.py  |  39 -----
 .../cudf/cudf/pylibcudf_tests/test_copying.py  |  47 +++---
 .../cudf/cudf/pylibcudf_tests/test_traits.py   | 110 +++++++++++++
 python/cudf_polars/cudf_polars/dsl/expr.py     |   3 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py    |  13 --
 13 files changed, 361 insertions(+), 75 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_traits.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e9dad705cbf..bd6f0f77357 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
-    io/index.rst
     interop
     join
     lists
     merge
     quantiles
     reduce
+    replace
     reshape
     rolling
     round
     scalar
     search
-    stream_compaction
     sorting
-    replace
+    stream_compaction
     table
+    traits
     types
     unary

@@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf.
    :maxdepth: 2
    :caption: Subpackages

+   io/index.rst
    strings/index.rst
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
new file mode 100644
index 00000000000..294ca8dc78c
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
@@ -0,0 +1,6 @@
+======
+traits
+======
+
+.. automodule:: cudf._lib.pylibcudf.traits
+    :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0a198f431a7..d22096081af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -38,6 +38,7 @@ set(cython_sources
   stream_compaction.pyx
   sorting.pyx
   table.pyx
+  traits.pyx
   types.pyx
   unary.pyx
   utils.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 5131df9a5cd..d4d615cde34 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -23,6 +23,7 @@ from . cimport (
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -54,12 +55,14 @@ __all__ = [
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 43a9e2aca31..91f8acaf682 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -23,6 +23,7 @@
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -35,6 +36,7 @@
 __all__ = [
     "Column",
     "DataType",
+    "MaskState",
     "Scalar",
     "Table",
     "TypeId",
@@ -54,12 +56,14 @@
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
new file mode 100644
index 00000000000..0cc58af735b
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
+
+
+cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil:
+    cdef bool is_relationally_comparable(data_type)
+    cdef bool is_equality_comparable(data_type)
+    cdef bool is_numeric(data_type)
+    cdef bool is_index_type(data_type)
+    cdef bool is_unsigned(data_type)
+    cdef bool is_integral(data_type)
+    cdef bool is_integral_not_bool(data_type)
+    cdef bool is_floating_point(data_type)
+    cdef bool is_boolean(data_type)
+    cdef bool is_timestamp(data_type)
+    cdef bool is_fixed_point(data_type)
+    cdef bool is_duration(data_type)
+    cdef bool is_chrono(data_type)
+    cdef bool is_dictionary(data_type)
+    cdef bool is_fixed_width(data_type)
+    cdef bool is_compound(data_type)
+    cdef bool is_nested(data_type)
+    cdef bool is_bit_castable(data_type, data_type)
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
new file mode 100644
index 00000000000..668fa775202
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..d2370f8d641 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + For details, see :cpp:func:`is_relationally_comparable`. + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + For details, see :cpp:func:`is_equality_comparable`. + """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + For details, see :cpp:func:`is_numeric`. + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. + + For details, see :cpp:func:`is_index_type`. + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + For details, see :cpp:func:`is_unsigned`. + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + For details, see :cpp:func:`is_integral`. + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + For details, see :cpp:func:`is_integral_not_bool`. + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + For details, see :cpp:func:`is_floating_point`. + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + For details, see :cpp:func:`is_boolean`. + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + For details, see :cpp:func:`is_timestamp`. + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + For details, see :cpp:func:`is_fixed_point`. 
+ """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + For details, see :cpp:func:`is_duration`. + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. + + For details, see :cpp:func:`is_chrono`. + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + For details, see :cpp:func:`is_dictionary`. + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + For details, see :cpp:func:`is_fixed_width`. + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + For details, see :cpp:func:`is_compound`. + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + For details, see :cpp:func:`is_nested`. + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + For details, see :cpp:func:`is_bit_castable`. + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..d41e6c720bf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -102,49 +102,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if 
plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git 
a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + 
assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index cfc2947f8de..69bc85b109d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1218,7 +1218,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. From 37defc6b943094921200146c5f6042a91e68c75a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 5 Jul 2024 09:44:05 -0400 Subject: [PATCH 014/101] Use strings concatenate to support large strings in CSV writer (#16148) Changes the CSV writer logic to use `cudf::strings::concatenate` instead of `cudf::strings::join_strings` when output size exceeds `join_strings` limit. 
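To make the threshold concrete: `join_strings` returns a single row whose character data is indexed by cudf's 32-bit `size_type`, so the writer can only take that path while all row bytes plus one line terminator per row stay below `INT32_MAX`. A minimal sketch of that check (illustrative names only, not cudf API):

```python
# Illustrative restatement of the total_size computation in write_chunked;
# chars_bytes is the strings column's character count and newline_bytes the
# line-terminator length from the writer options.
INT32_MAX = 2**31 - 1

def can_use_join_strings(chars_bytes: int, num_rows: int, newline_bytes: int = 1) -> bool:
    total = chars_bytes + newline_bytes * num_rows
    return total < INT32_MAX
```

Beyond that size, the writer instead appends a column of line terminators (emptying the final entry so no trailing terminator is written) and concatenates row-wise, which supports large outputs.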
Closes #16137 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16148 --- cpp/src/io/csv/writer_impl.cu | 38 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7c4d5711281..63eb0b03c5f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator(), true, stream}; - auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, - newline, - string_scalar{"", false, stream}, - stream, - rmm::mr::get_current_device_resource()); - strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(stream); - char const* ptr_all_bytes = strings_column.chars_begin(stream); + // use strings concatenate to build the final CSV output in device memory + auto contents_w_nl = [&] { + auto const total_size = + str_column_view.chars_size(stream) + (newline.size() * str_column_view.size()); + auto const empty_str = string_scalar("", true, stream); + // use join_strings when the output will be less than 2GB + if (total_size < static_cast(std::numeric_limits::max())) { + return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr) + ->release(); + } + auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream); + // convert the last element into an empty string by resetting the last offset value + auto& offsets = nl_col->child(strings_column_view::offsets_column_index); + auto offsets_view = offsets.mutable_view(); + cudf::fill_in_place(offsets_view, + offsets.size() - 1, // set the last element with + offsets.size(), // the value from 2nd to last element + *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr), + stream); + auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()}); + return cudf::strings::detail::concatenate( + nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr) + ->release(); + }(); + auto const total_num_bytes = contents_w_nl.data->size(); + auto const ptr_all_bytes = static_cast(contents_w_nl.data->data()); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink, str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), options, stream, mr); + write_chunked( + out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource()); } } } From 7dd69452bb72ca8cc440af52cb6ca8386950c264 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 5 Jul 2024 07:58:30 -0700 Subject: [PATCH 015/101] CI: Build wheels for cudf-polars (#16156) Authors: - Thomas Li (https://github.com/lithomas1) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16156 --- .github/workflows/pr.yaml | 16 
+++++++++++++--- ci/build_wheel_cudf_polars.sh | 11 +++++++++++ ci/run_cudf_polars_pytests.sh | 2 +- ...df_polars.sh => test_wheel_cudf_polars.sh} | 19 +++++++------------ ci/test_wheel_dask_cudf.sh | 2 +- python/cudf_polars/pyproject.toml | 2 -- 6 files changed, 33 insertions(+), 19 deletions(-) create mode 100755 ci/build_wheel_cudf_polars.sh rename ci/{test_cudf_polars.sh => test_wheel_cudf_polars.sh} (70%) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a35802f2ab0..ceee9074b93 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,7 +25,8 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf - - test-cudf-polars + - wheel-build-cudf-polars + - wheel-tests-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -133,9 +134,18 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - test-cudf-polars: + wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_cudf_polars.sh" + wheel-tests-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -143,7 +153,7 @@ jobs: build_type: pull-request # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars - script: "ci/test_cudf_polars.sh" + script: "ci/test_wheel_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh new file mode 100755 index 00000000000..9c945e11c00 --- /dev/null +++ b/ci/build_wheel_cudf_polars.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir="python/cudf_polars" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index 78683b057a5..c10612a065a 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -8,4 +8,4 @@ set -euo pipefail # Support invoking run_cudf_polars_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ -pytest --cache-clear "$@" tests +python -m pytest --cache-clear "$@" tests diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh similarity index 70% rename from ci/test_cudf_polars.sh rename to ci/test_wheel_cudf_polars.sh index ca98c4dadb3..900acd5d473 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,19 +18,14 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "Install cudf wheel" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/cudf*.whl)[test] +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install ./local-cudf-dep/cudf*.whl rapids-logger "Install cudf_polars" -python -m pip install 'polars>=1.0' -python -m pip install --no-deps python/cudf_polars +python -m pip install $(echo ./dist/cudf_polars*.whl)[test] rapids-logger "Run cudf_polars tests" @@ -45,8 +40,8 @@ set +e ./ci/run_cudf_polars_pytests.sh \ --cov cudf_polars \ --cov-fail-under=100 \ - --cov-config=python/cudf_polars/pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" + --cov-config=./pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" trap ERR set -e diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 2b20b9d9ce4..c3800d3cc25 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE=" # Download the cudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +python -m pip install ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index bf4673fcc50..0b559f7a8e9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -182,5 +182,3 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" -# Pure python -disable-cuda = true From c978181a3a721ed75cf016c6f083648c65bd24cd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 
2024 16:11:07 +0100 Subject: [PATCH 016/101] Implement translation for some unary functions and a single datetime extraction (#16173) - Closes #16169 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16173 --- python/cudf_polars/cudf_polars/dsl/expr.py | 124 ++++++++++++++++++ python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- .../cudf_polars/cudf_polars/dsl/translate.py | 19 ++- .../tests/expressions/test_datetime_basic.py | 28 ++++ .../tests/expressions/test_round.py | 32 +++++ .../tests/expressions/test_unique.py | 24 ++++ python/cudf_polars/tests/test_groupby.py | 2 + 7 files changed, 228 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_round.py create mode 100644 python/cudf_polars/tests/expressions/test_unique.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 69bc85b109d..93cb9db7cbd 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -44,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -815,6 +816,129 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("round", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 31a0be004ea..6b552642e88 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -433,7 +433,7 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 0019b3aa98a..5a1e682abe7 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -361,8 +361,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import methodcaller + import pytest import polars as pl @@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. + # https://github.com/rapidsai/cudf/issues/16196 + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..3af3a0ce6d1 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 8a6732b7063..b84e2c16b43 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -47,6 +47,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) From a583c97ca977041e3cc3399739e29962982d6aad Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:45:38 -0400 Subject: [PATCH 017/101] Fix cudf::strings::replace_multiple hang on empty target (#16167) Fixes logic in `cudf::strings::replace_multiple` to ignore empty targets correctly in the `replace_multi_fn` functor. Also updated the doxygen and added a gtest for this case. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16167 --- cpp/include/cudf/strings/replace.hpp | 2 +- cpp/src/strings/replace/multi.cu | 9 ++++----- cpp/tests/strings/replace_tests.cpp | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. 
 *
- * This does not use regex to match targets in the string.
+ * This does not use regex to match targets in the string. Empty string targets are ignored.
 *
 * Null string entries will return null output string entries.
 *
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 43a3d69091a..2ca22f0e017 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -451,8 +451,8 @@ struct replace_multi_fn {
     while (spos < d_str.size_bytes()) {
       for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) {
         auto const d_tgt = d_targets.element<string_view>(tgt_idx);
-        if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&  // check fit
-            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))  // and match
+        if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&  // check fit
+            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))  // and match
         {
           auto const d_repl = (d_repls.size() == 1) ? d_repls.element<string_view>(0)
                                                     : d_repls.element<string_view>(tgt_idx);
@@ -468,9 +468,8 @@
       }
       ++spos;
     }
-    if (out_ptr)  // copy remainder
-    {
-      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
+    if (out_ptr) {
+      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);  // copy remainder
     } else {
       d_sizes[idx] = bytes;
     }
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index 3aa7467d156..6c4afbb435a 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
   }
 }
 
+TEST_F(StringsReplaceTest, EmptyTarget)
+{
+  auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"});
+  auto const sv    = cudf::strings_column_view(input);
+
+  auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"});
+  auto const tv      = cudf::strings_column_view(targets);
+
+  auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"});
+  auto const rv    = cudf::strings_column_view(repls);
+
+  // empty target should be ignored
+  auto results  = cudf::strings::replace_multiple(sv, tv, rv);
+  auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
 TEST_F(StringsReplaceTest, EmptyStringsColumn)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();

From f6b355d7761ee3ecc0b243f09dc0c1d3b214a7ad Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Fri, 5 Jul 2024 16:49:23 -0500
Subject: [PATCH 018/101] skip CMake 3.30.0 (#16202)

Contributes to https://github.com/rapidsai/build-planning/issues/80

Adds constraints to avoid pulling in CMake 3.30.0, for the reasons described in that issue.
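The same `!=3.30.0` exclusion is applied to every requirements surface touched below (conda environment files, recipe configs, `dependencies.yaml`, and the pyproject files). As a quick sanity check of the specifier semantics, using the `packaging` library (illustrative only, not part of this change):

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=3.26.4,!=3.30.0")

assert "3.29.0" in spec      # previously allowed versions remain allowed
assert "3.30.0" not in spec  # only the problematic release is skipped
assert "3.30.1" in spec      # later patch releases come back in
```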
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16202 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- conda/recipes/cudf/conda_build_config.yaml | 2 +- conda/recipes/cudf_kafka/conda_build_config.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/pyproject.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc9238ab80a..b8d73a01f96 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cubinlinker - cuda-nvtx=11.8 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 9fecd452248..c32d21c5d36 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cuda-cudart-dev - cuda-nvcc diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c01178bf732..4f99411e978 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" libarrow_version: - "==16.1.0" diff --git a/dependencies.yaml b/dependencies.yaml index 6d4ba0c38d1..27621ff9a3f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -243,7 +243,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - &ninja ninja build_all: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 20b731624df..dcb33b1fc1a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -121,7 +121,7 @@ skip = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 11e18cd4f32..badfdf06d15 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -101,7 +101,7 @@ regex = "(?P.*)" 
build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", From d9a3728d37e0223afd9cfa525bd7ac8b43b39e63 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:18:30 -0700 Subject: [PATCH 019/101] Define PTDS for the stream hook libs (#16182) We must define `CUDA_API_PER_THREAD_DEFAULT_STREAM` for the stream hook lib, since `cudaLaunchKernel` in CUDA 12.4+ is now a macro that expands to a different function when it's not defined. Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16182 --- cpp/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2811711d58c..7999ada9282 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -925,6 +925,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) add_library( ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp ) + if(CUDF_USE_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions( + ${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM + ) + endif() set_target_properties( ${_tgt} From 6169ee17d31669d8930576003bc3ebaadca8a1fa Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 8 Jul 2024 12:28:52 -0400 Subject: [PATCH 020/101] Add missing methods to lists/list_column_view.pxd in pylibcudf (#16175) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16175 --- .../cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index fd21e7b334b..8917a6ac899 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -10,7 +10,9 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): lists_column_view() except + + lists_column_view(const lists_column_view& lists_column) except + lists_column_view(const column_view& lists_column) except + + lists_column_view& operator=(const lists_column_view&) except + column_view parent() except + column_view offsets() except + column_view child() except + From 036e0ef5b99fd6ea09061af45854d28e44d21212 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:06:13 -0700 Subject: [PATCH 021/101] Migrate JSON reader to pylibcudf (#15966) Switches the JSON reader to use pylibcudf. 
xref #15162 Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15966 --- python/cudf/cudf/_lib/io/utils.pxd | 4 + python/cudf/cudf/_lib/io/utils.pyx | 27 ++ python/cudf/cudf/_lib/json.pyx | 127 ++++---- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 23 +- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 122 +++++++- python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 5 + python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 57 +++- .../_lib/pylibcudf/libcudf/CMakeLists.txt | 1 + .../_lib/pylibcudf/libcudf/io/CMakeLists.txt | 26 ++ .../cudf/_lib/pylibcudf/libcudf/io/json.pxd | 8 +- .../cudf/_lib/pylibcudf/libcudf/io/json.pyx | 0 .../cudf/_lib/pylibcudf/libcudf/io/types.pyx | 0 python/cudf/cudf/_lib/utils.pyx | 2 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 84 +++++- python/cudf/cudf/pylibcudf_tests/conftest.py | 5 + .../cudf/cudf/pylibcudf_tests/io/test_avro.py | 2 +- .../cudf/cudf/pylibcudf_tests/io/test_json.py | 275 +++++++++++++++++- python/cudf/cudf/tests/test_json.py | 7 +- 18 files changed, 674 insertions(+), 101 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 252d986843a..680a87c789e 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -16,6 +16,10 @@ cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* +cdef add_df_col_struct_names( + df, + child_names_dict +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 1d7c56888d9..58956b9e9b7 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -147,10 +147,37 @@ cdef cppclass iobase_data_sink(data_sink): return buf.tell() +cdef add_df_col_struct_names(df, child_names_dict): + for name, child_names in child_names_dict.items(): + col = df._data[name] + + df._data[name] = update_col_struct_field_names(col, child_names) + + +cdef update_col_struct_field_names(Column col, child_names): + if col.children: + children = list(col.children) + for i, (child, names) in enumerate(zip(children, child_names.values())): + children[i] = update_col_struct_field_names( + child, + names + ) + col.set_base_children(tuple(children)) + + if isinstance(col.dtype, StructDtype): + col = col._rename_fields( + child_names.keys() + ) + + return col + + cdef update_struct_field_names( table, vector[column_name_info]& schema_info ): + # Deprecated, remove in favor of add_col_struct_names + # when a reader is ported to pylibcudf for i, (name, col) in enumerate(table._data.items()): table._data[name] = update_column_struct_field_names( col, schema_info[i] diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 22e34feb547..9c646e3357b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -8,26 +8,16 @@ import cudf from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.utility cimport 
move -from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.io.utils cimport make_source_info, update_struct_field_names -from cudf._lib.pylibcudf.libcudf.io.json cimport ( - json_reader_options, - json_recovery_mode_t, - read_json as libcudf_read_json, - schema_element, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - table_with_metadata, -) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.io.utils cimport add_df_col_struct_names +from cudf._lib.pylibcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport data_from_pylibcudf_io import cudf._lib.pylibcudf as plc @@ -62,6 +52,7 @@ cpdef read_json(object filepaths_or_buffers, # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` + for idx in range(len(filepaths_or_buffers)): if isinstance(filepaths_or_buffers[idx], io.StringIO): filepaths_or_buffers[idx] = \ @@ -71,17 +62,7 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[data_type] c_dtypes_list - cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -94,56 +75,50 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - is_list_like_dtypes = False + + processed_dtypes = None + if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: + processed_dtypes = [] if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - c_dtypes_schema_map[str(k).encode()] = \ - _get_cudf_schema_element_from_dtype(v) + # Make sure keys are string + k = str(k) + lib_type, child_types = _get_cudf_schema_element_from_dtype(v) + processed_dtypes.append((k, lib_type, child_types)) elif isinstance(dtype, abc.Collection): - is_list_like_dtypes = True - c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype( - col_dtype)) + processed_dtypes.append( + # Ignore child columns since we cannot specify their dtypes + # when passing a list + _get_cudf_schema_element_from_dtype(col_dtype)[0] + ) else: raise TypeError("`dtype` must be 'list like' or 'dict'") - cdef json_reader_options opts = move( - json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) - .recovery_mode(_get_json_recovery_mode(on_bad_lines)) - .build() + table_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, 
+ lines, + byte_range_offset = byte_range[0] if byte_range is not None else 0, + byte_range_size = byte_range[1] if byte_range is not None else 0, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) ) - if is_list_like_dtypes: - opts.set_dtypes(c_dtypes_list) - else: - opts.set_dtypes(c_dtypes_schema_map) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - - # Read JSON - cdef cudf_io_types.table_with_metadata c_result - with nogil: - c_result = move(libcudf_read_json(opts)) - - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) - - update_struct_field_names(df, c_result.metadata.schema_info) + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + table_w_meta + ) + ) + # Post-processing to add in struct column names + add_df_col_struct_names(df, table_w_meta.child_names) return df @@ -192,28 +167,32 @@ def write_json( ) -cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: - cdef schema_element s_element - cdef data_type lib_type +cdef _get_cudf_schema_element_from_dtype(object dtype) except *: dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - lib_type = dtype_to_data_type(dtype) - s_element.type = lib_type + + lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + child_types = [] + if isinstance(dtype, cudf.StructDtype): for name, child_type in dtype.fields.items(): - s_element.child_types[name.encode()] = \ + child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(child_type) + child_types.append((name, child_lib_type, grandchild_types)) elif isinstance(dtype, cudf.ListDtype): - s_element.child_types["offsets".encode()] = \ - _get_cudf_schema_element_from_dtype(cudf.dtype("int32")) - s_element.child_types["element".encode()] = \ + child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(dtype.element_type) - return s_element + child_types = [ + ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("element", child_lib_type, grandchild_types) + ] + + return lib_type, child_types cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index a91d574131f..f7f733a493d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -1,11 +1,30 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- from libcpp cimport bool -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, + compression_type, +) +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t from cudf._lib.pylibcudf.libcudf.types cimport size_type +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool lines = *, + size_type byte_range_offset = *, + size_type byte_range_size = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, +) + + cpdef void write_json( SinkInfo sink_info, TableWithMetadata tbl, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 7530eba3803..354cb4981de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -1,16 +1,130 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp cimport bool from libcpp.limits cimport numeric_limits +from libcpp.map cimport map from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, +) from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_reader_options, + json_recovery_mode_t, json_writer_options, + read_json as cpp_read_json, + schema_element, write_json as cpp_write_json, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata -from cudf._lib.pylibcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef map[string, schema_element] _generate_schema_map(list dtypes): + cdef map[string, schema_element] schema_map + cdef schema_element s_elem + cdef string c_name + + for name, dtype, child_dtypes in dtypes: + if not (isinstance(name, str) and + isinstance(dtype, DataType) and + isinstance(child_dtypes, list)): + + raise ValueError("Must pass a list of a tuple containing " + "(column_name, column_dtype, list of child_dtypes)") + + c_name = name.encode() + + s_elem.type = (dtype).c_obj + s_elem.child_types = _generate_schema_map(child_dtypes) + + schema_map[c_name] = s_elem + return schema_map + + +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool lines = False, + size_type byte_range_offset = 0, + size_type byte_range_size = 0, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). 
+ compression_type: CompressionType, default CompressionType.AUTO + The compression format of the JSON source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + keep_quotes : bool, default False + Whether the reader should keep quotes of string values. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + + # Read JSON + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_json(opts)) + + return TableWithMetadata.from_libcudf(c_result) cpdef void write_json( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 88daf54f33b..ab223c16a72 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -28,6 +28,11 @@ cdef class TableWithMetadata: cdef vector[column_name_info] _make_column_info(self, list column_names) + cdef list _make_columns_list(self, dict child_dict) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index f94e20970a4..df0b729b711 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -22,6 +22,11 @@ import errno import io import os +from cudf._lib.pylibcudf.libcudf.io.json import \ + json_recovery_mode_t as JSONRecoveryMode # no-cython-lint +from cudf._lib.pylibcudf.libcudf.io.types import \ + compression_type as CompressionType # no-cython-lint + cdef class TableWithMetadata: """A container holding a table and its associated metadata @@ -69,16 +74,44 @@ cdef class TableWithMetadata: """ return self.tbl.columns() - @property - def column_names(self): + cdef list _make_columns_list(self, dict child_dict): + cdef list names = [] + for child in child_dict: + grandchildren = self._make_columns_list(child_dict[child]) + names.append((child, grandchildren)) + return names + + def column_names(self, include_children=False): """ Return a list containing the column names of the table """ cdef list names = [] + cdef str name + cdef dict child_names = self.child_names for col_info in self.metadata.schema_info: - # TODO: Handle nesting (columns with child columns) - assert col_info.children.size() == 0, "Child column names are not handled!" 
- names.append(col_info.name.decode()) + name = col_info.name.decode() + if include_children: + children = self._make_columns_list(child_names[name]) + names.append((name, children)) + else: + names.append(name) + return names + + @property + def child_names(self): + """ + Return a dictionary mapping the names of columns with children + to the names of their child columns + """ + return TableWithMetadata._parse_col_names(self.metadata.schema_info) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos): + cdef dict child_names = dict() + cdef dict names = dict() + for col_info in infos: + child_names = TableWithMetadata._parse_col_names(col_info.children) + names[col_info.name.decode()] = child_names return names @staticmethod @@ -137,6 +170,15 @@ cdef class SourceInfo: cdef vector[host_buffer] c_host_buffers cdef const unsigned char[::1] c_buffer cdef bint empty_buffer = False + cdef list new_sources = [] + + if isinstance(sources[0], io.StringIO): + for buffer in sources: + if not isinstance(buffer, io.StringIO): + raise ValueError("All sources must be of the same type!") + new_sources.append(buffer.read().encode()) + sources = new_sources + if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: @@ -156,7 +198,10 @@ cdef class SourceInfo: c_buffer.shape[0])) else: raise ValueError("Sources must be a list of str/paths, " - "bytes, io.BytesIO, or a Datasource") + "bytes, io.BytesIO, io.StringIO, or a Datasource") + + if empty_buffer is True: + c_host_buffers.push_back(host_buffer(NULL, 0)) self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 6c66d01ca57..699e85ce567 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -22,4 +22,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(io) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..6831063ecb9 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources json.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_ +) + +set(targets_using_arrow_headers cpp_io_json cpp_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 2e50cccd132..86621ae184f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -19,9 +19,9 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types - cdef enum json_recovery_mode_t: - FAIL "cudf::io::json_recovery_mode_t::FAIL" - RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cpdef enum class json_recovery_mode_t(int32_t): + FAIL + RECOVER_WITH_NULL cdef cppclass json_reader_options: json_reader_options() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index de6b9f690b6..f136cd997a7 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -322,7 +322,7 @@ cdef data_from_pylibcudf_io(tbl_with_meta): """ return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names, + column_names=tbl_with_meta.column_names(include_children=False), index_names=None ) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index d41e6c720bf..46603ff32b8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -8,13 +8,14 @@ import pytest from cudf._lib import pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType def metadata_from_arrow_type( pa_type: pa.Array, name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = plc.interop.ColumnMetadata(name) # None + metadata = plc.interop.ColumnMetadata(name) if pa.types.is_list(pa_type): child_meta = [plc.interop.ColumnMetadata("offsets")] for i in range(pa_type.num_fields): @@ -39,9 +40,25 @@ def metadata_from_arrow_type( def assert_column_eq( - lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column + lhs: pa.Array | plc.Column, + rhs: pa.Array | plc.Column, + check_field_nullability=True, ) -> None: - """Verify that a pylibcudf array and PyArrow array are equal.""" + """Verify that a pylibcudf array and PyArrow array are equal. + + Parameters + ---------- + lhs: Union[pa.Array, plc.Column] + The array with the expected values + rhs: Union[pa.Array, plc.Column] + The array to check + check_field_nullability: + For list/struct dtypes, whether to check if the nullable attributes + on child fields are equal. 
+ + Useful for checking roundtripping of lossy formats like JSON that may not + preserve this information. + """ # Nested types require children metadata to be passed to the conversion function. if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( rhs, plc.Column @@ -65,6 +82,33 @@ def assert_column_eq( if isinstance(rhs, pa.ChunkedArray): rhs = rhs.combine_chunks() + def _make_fields_nullable(typ): + new_fields = [] + for i in range(typ.num_fields): + child_field = typ.field(i) + if not child_field.nullable: + child_type = child_field.type + if isinstance(child_field.type, (pa.StructType, pa.ListType)): + child_type = _make_fields_nullable(child_type) + new_fields.append( + pa.field(child_field.name, child_type, nullable=True) + ) + else: + new_fields.append(child_field) + + if isinstance(typ, pa.StructType): + return pa.struct(new_fields) + elif isinstance(typ, pa.ListType): + return pa.list_(new_fields[0]) + return typ + + if not check_field_nullability: + rhs_type = _make_fields_nullable(rhs.type) + rhs = rhs.cast(rhs_type) + + lhs_type = _make_fields_nullable(lhs.type) + lhs = rhs.cast(lhs_type) + assert lhs.equals(rhs) @@ -78,20 +122,24 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: def assert_table_and_meta_eq( - plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table + pa_table: pa.Table, + plc_table_w_meta: plc.io.types.TableWithMetadata, + check_field_nullability=True, ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" plc_table = plc_table_w_meta.tbl plc_shape = (plc_table.num_rows(), plc_table.num_columns()) - assert plc_shape == pa_table.shape + assert ( + plc_shape == pa_table.shape + ), f"{plc_shape} is not equal to {pa_table.shape}" for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): - assert_column_eq(plc_col, pa_col) + assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality - assert plc_table_w_meta.column_names == pa_table.column_names + assert plc_table_w_meta.column_names() == pa_table.column_names def cudf_raises(expected_exception: BaseException, *args, **kwargs): @@ -182,4 +230,26 @@ def sink_to_str(sink): + DEFAULT_PA_STRUCT_TESTING_TYPES ) +# Map pylibcudf compression types to pandas ones +# Not all compression types map cleanly, read the comments to learn more! +# If a compression type is unsupported, it maps to False. 
+ +COMPRESSION_TYPE_TO_PANDAS = { + CompressionType.NONE: None, + # Users of this dict will have to special case + # AUTO + CompressionType.AUTO: None, + CompressionType.GZIP: "gzip", + CompressionType.BZIP2: "bz2", + CompressionType.ZIP: "zip", + CompressionType.XZ: "xz", + CompressionType.ZSTD: "zstd", + # Unsupported + CompressionType.ZLIB: False, + CompressionType.LZ4: False, + CompressionType.LZO: False, + # These only work for parquet + CompressionType.SNAPPY: "snappy", + CompressionType.BROTLI: "brotli", +} ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e4760ea7ac8..39832eb4bba 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -121,6 +121,11 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +@pytest.fixture(params=[opt for opt in plc.io.types.CompressionType]) +def compression_type(request): + return request.param + + @pytest.fixture( scope="session", params=[opt for opt in plc.types.Interpolation] ) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py index d6cd86768cd..061d6792ce3 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py @@ -120,4 +120,4 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): if columns != []: expected = expected.select(columns) - assert_table_and_meta_eq(res, expected) + assert_table_and_meta_eq(expected, res) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index d6b8bfa6976..c13eaf40625 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -1,11 +1,49 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import io +import pandas as pd import pyarrow as pa import pytest -from utils import sink_to_str +from utils import ( + COMPRESSION_TYPE_TO_PANDAS, + assert_table_and_meta_eq, + sink_to_str, +) import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + + +def make_json_source(path_or_buf, pa_table, **kwargs): + """ + Uses pandas to write a pyarrow Table to a JSON file. + + The caller is responsible for making sure that no arguments + unsupported by pandas are passed in. 
+ """ + df = pa_table.to_pandas() + if "compression" in kwargs: + kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ + kwargs["compression"] + ] + df.to_json(path_or_buf, orient="records", **kwargs) + if isinstance(path_or_buf, io.IOBase): + path_or_buf.seek(0) + return path_or_buf + + +def write_json_bytes(source, json_str): + """ + Write a JSON string to the source + """ + if not isinstance(source, io.IOBase): + with open(source, "w") as source_f: + source_f.write(json_str) + else: + if isinstance(source, io.BytesIO): + json_str = json_str.encode("utf-8") + source.write(json_str) + source.seek(0) @pytest.mark.parametrize("rows_per_chunk", [8, 100]) @@ -114,3 +152,238 @@ def test_write_json_bool_opts(true_value, false_value): pd_result = pd_result.replace("false", false_value) assert str_result == pd_result + + +@pytest.mark.parametrize("lines", [True, False]) +def test_read_json_basic( + table_data, source_or_sink, lines, compression_type, request +): + if compression_type in { + # Not supported by libcudf + CompressionType.SNAPPY, + CompressionType.XZ, + CompressionType.ZSTD, + # Not supported by pandas + # TODO: find a way to test these + CompressionType.BROTLI, + CompressionType.LZ4, + CompressionType.LZO, + CompressionType.ZLIB, + }: + pytest.skip("unsupported compression type by pandas/libcudf") + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + _, pa_table = table_data + + source = make_json_source( + source_or_sink, pa_table, lines=lines, compression=compression_type + ) + + request.applymarker( + pytest.mark.xfail( + condition=( + len(pa_table) > 0 + and compression_type + not in {CompressionType.NONE, CompressionType.AUTO} + ), + # note: wasn't able to narrow down the specific types that were failing + # seems to be a little non-deterministic, but always fails with + # cudaErrorInvalidValue invalid argument + reason="libcudf json reader crashes on compressed non empty table_data", + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), + compression=compression_type, + lines=lines, + ) + + # Adjustments to correct for the fact orient=records is lossy + # and doesn't + # 1) preserve colnames when zero rows in table + # 2) preserve struct nullability + # 3) differentiate int64/uint64 + if len(pa_table) == 0: + pa_table = pa.table([]) + + new_fields = [] + for i in range(len(pa_table.schema)): + curr_field = pa_table.schema.field(i) + if curr_field.type == pa.uint64(): + try: + curr_field = curr_field.with_type(pa.int64()) + except OverflowError: + # There will be no confusion, values are too large + # for int64 anyways + pass + new_fields.append(curr_field) + + pa_table = pa_table.cast(pa.schema(new_fields)) + + # Convert non-nullable struct fields to nullable fields + # since nullable=False cannot roundtrip through orient='records' + # JSON format + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +def test_read_json_dtypes(table_data, source_or_sink): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = table_data + source = make_json_source( + source_or_sink, + pa_table, + lines=True, + ) + + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + def get_child_types(typ): + typ_child_types = [] + for i in range(typ.num_fields): + curr_field = typ.field(i) + typ_child_types.append( + ( 
+ curr_field.name, + curr_field.type, + get_child_types(curr_field.type), + ) + ) + return typ_child_types + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + + new_schema = pa.schema(new_fields) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + ) + new_table = pa_table.cast(new_schema) + + # orient=records is lossy + # and doesn't preserve column names when there's zero rows in the table + if len(new_table) == 0: + new_table = pa.table([]) + + assert_table_and_meta_eq(new_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize("chunk_size", [10, 15, 20]) +def test_read_json_lines_byte_range(source_or_sink, chunk_size): + source = source_or_sink + if isinstance(source_or_sink, io.StringIO): + pytest.skip("byte_range doesn't work on StringIO") + + json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" + write_json_bytes(source, json_str) + + tbls_w_meta = [] + for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): + tbls_w_meta.append( + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + byte_range_offset=chunk_start, + byte_range_size=chunk_start + chunk_size, + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_json(source, orient="records", lines=True) + + # TODO: can do this operation using pylibcudf + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("keep_quotes", [True, False]) +def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): + source = source_or_sink + + json_bytes = '["a", "b", "c"]\n' + write_json_bytes(source, json_bytes) + + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ) + + template = "{0}" + if keep_quotes: + template = '"{0}"' + + exp = pa.Table.from_arrays( + [ + [template.format("a")], + [template.format("b")], + [template.format("c")], + ], + names=["0", "1", "2"], + ) + + assert_table_and_meta_eq(exp, tbl_w_meta) + + +@pytest.mark.parametrize( + "recovery_mode", [opt for opt in plc.io.types.JSONRecoveryMode] +) +def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): + source = source_or_sink + + json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + write_json_bytes(source, json_bytes) + + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: + with pytest.raises(RuntimeError): + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + else: + # Recover case (bad values replaced with nulls) + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + exp = pa.Table.from_arrays( + [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] + ) + assert_table_and_meta_eq(exp, tbl_w_meta) + + +# TODO: Add tests for these! 
+# Tests were not added in the initial PR porting the JSON reader to pylibcudf +# to save time (and since there are no existing tests for these in Python cuDF) +# mixed_types_as_string = mixed_types_as_string, +# prune_columns = prune_columns, diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..9222f6d23db 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1077,8 +1077,13 @@ def test_json_dtypes_nested_data(): ) pdf = pd.read_json( - StringIO(expected_json_str), orient="records", lines=True + StringIO(expected_json_str), + orient="records", + lines=True, ) + + assert_eq(df, pdf) + pdf.columns = pdf.columns.astype("str") pa_table_pdf = pa.Table.from_pandas( pdf, schema=df.to_arrow().schema, safe=False From 2664427d5eb427cb4c7682d51a37fde71f7c6c8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 8 Jul 2024 14:00:30 -0400 Subject: [PATCH 022/101] Add single offset to an empty ListArray in cudf::to_arrow (#16201) Closes #16164 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) URL: https://github.com/rapidsai/cudf/pull/16201 --- cpp/src/interop/to_arrow.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2b3aa2f08f1..62b85891adb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -376,7 +376,12 @@ std::shared_ptr dispatch_to_arrow::operator()( metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); if (child_arrays.empty()) { - return std::make_shared(arrow::list(arrow::null()), 0, nullptr, nullptr); + // Empty list will have only one value in offset of 4 bytes + auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); + memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); + + return std::make_shared( + arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; From e9cb7dd7d3d9b810c4575cbdbead8148d85e990f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jul 2024 08:36:34 -1000 Subject: [PATCH 023/101] Support at/iat indexers in cudf.pandas (#16177) closes #16112 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16177 --- python/cudf/cudf/core/dataframe.py | 12 ++++++++++-- python/cudf/cudf/pandas/_wrappers/pandas.py | 12 ++++++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 19 +++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b249410c2e4..3e5ff9c18b5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -462,6 +462,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].loc[key[0]] = value[i] +class _DataFrameAtIndexer(_DataFrameLocIndexer): + pass + + class _DataFrameIlocIndexer(_DataFrameIndexer): """ For selection by index. 
@@ -584,6 +588,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].iloc[key[0]] = value[i] +class _DataFrameiAtIndexer(_DataFrameIlocIndexer): + pass + + class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -2581,14 +2589,14 @@ def iat(self): """ Alias for ``DataFrame.iloc``; provided for compatibility with Pandas. """ - return self.iloc + return _DataFrameiAtIndexer(self) @property def at(self): """ Alias for ``DataFrame.loc``; provided for compatibility with Pandas. """ - return self.loc + return _DataFrameAtIndexer(self) @property # type: ignore @_external_only_api( diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index a64bf7772fe..dd6f6fe76ba 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -775,6 +775,18 @@ def Index__new__(cls, *args, **kwargs): pd.core.indexing._LocIndexer, ) +_AtIndexer = make_intermediate_proxy_type( + "_AtIndexer", + cudf.core.dataframe._DataFrameAtIndexer, + pd.core.indexing._AtIndexer, +) + +_iAtIndexer = make_intermediate_proxy_type( + "_iAtIndexer", + cudf.core.dataframe._DataFrameiAtIndexer, + pd.core.indexing._iAtIndexer, +) + FixedForwardWindowIndexer = make_final_proxy_type( "FixedForwardWindowIndexer", _Unusable, diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index f51ce103677..b0aeaba3916 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1566,3 +1566,22 @@ def test_arrow_string_arrays(): ) tm.assert_equal(cu_arr, pd_arr) + + +@pytest.mark.parametrize("indexer", ["at", "iat"]) +def test_at_iat(indexer): + df = xpd.DataFrame(range(3)) + result = getattr(df, indexer)[0, 0] + assert result == 0 + + getattr(df, indexer)[0, 0] = 1 + expected = pd.DataFrame([1, 1, 2]) + tm.assert_frame_equal(df, expected) + + +def test_at_setitem_empty(): + df = xpd.DataFrame({"name": []}, dtype="float64") + df.at[0, "name"] = 1.0 + df.at[0, "new"] = 2.0 + expected = pd.DataFrame({"name": [1.0], "new": [2.0]}) + tm.assert_frame_equal(df, expected) From cc8c86857df92801561d2fa3311d8da85895ff33 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 8 Jul 2024 16:54:45 -0500 Subject: [PATCH 024/101] Disable large string support for Java build (#16216) Disables libcudf large string support for the Java bindings build. The Java bindings need to be updated to handle large strings which is tracked by #16215. Closes #16199. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16216 --- java/README.md | 10 +++++++--- java/ci/build-in-docker.sh | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/java/README.md b/java/README.md index 2d8e2190fee..0d9e060b7cd 100644 --- a/java/README.md +++ b/java/README.md @@ -51,9 +51,13 @@ CUDA 11.0: ## Build From Source Build [libcudf](../cpp) first, and make sure the JDK is installed and available. Specify -the cmake option `-DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF` when building so -that Apache Arrow is linked statically to libcudf, as this will help create a jar that -does not require Arrow and its dependencies to be available in the runtime environment. 
+the following cmake options to the libcudf build: +``` +-DCUDF_LARGE_STRINGS_DISABLED=ON -DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF +``` +These options: +- Disable large string support, see https://github.com/rapidsai/cudf/issues/16215 +- Statically link Arrow to libcudf to remove Arrow as a runtime dependency. After building libcudf, the Java bindings can be built via Maven, e.g.: ``` diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 72b1742f7cb..5a429bdc739 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME \ -DUSE_NVTX=$ENABLE_NVTX \ + -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ From 58b7dc9f186c1860d4f9df80188bf21214381b1b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jul 2024 16:01:25 -1000 Subject: [PATCH 025/101] interpolate returns new column if no values are interpolated (#16158) While cleaning up the `interpolate` implementation, I noticed that a interpolation no-op did not return a new column. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16158 --- python/cudf/cudf/core/algorithms.py | 61 ++++++++-------------- python/cudf/cudf/core/indexed_frame.py | 14 +++-- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/tests/test_interpolate.py | 6 +++ 4 files changed, 39 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index e8b82ff60c2..6c69fbd2637 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,17 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING import cupy as cp import numpy as np from cudf.core.column import as_column -from cudf.core.copy_types import BooleanMask from cudf.core.index import RangeIndex, ensure_index -from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + from cudf.core.index import BaseIndex + def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): """Encode the input values as integer labels @@ -110,55 +115,31 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): return labels, cats.values if return_cupy_array else ensure_index(cats) -def _linear_interpolation(column, index=None): - """ - Interpolate over a float column. 
Implicitly assumes that values are - evenly spaced with respect to the x-axis, for example the data - [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way - between the two valid values, yielding [1.0, 2.0, 3.0] - """ - - index = RangeIndex(start=0, stop=len(column), step=1) - return _index_or_values_interpolation(column, index=index) - - -def _index_or_values_interpolation(column, index=None): +def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: """ Interpolate over a float column. assumes a linear interpolation strategy using the index of the data to denote spacing of the x values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] - would result in [1.0, 3.0, 4.0] + would result in [1.0, 3.0, 4.0]. """ # figure out where the nans are - mask = cp.isnan(column) + mask = column.isnull() # trivial cases, all nan or no nans - num_nan = mask.sum() - if num_nan == 0 or num_nan == len(column): - return column + if not mask.any() or mask.all(): + return column.copy() - to_interp = IndexedFrame(data={None: column}, index=index) - known_x_and_y = to_interp._apply_boolean_mask( - BooleanMask(~mask, len(to_interp)) - ) - - known_x = known_x_and_y.index.to_cupy() - known_y = known_x_and_y._data.columns[0].values + valid_locs = ~mask + if isinstance(index, RangeIndex): + # Each point is evenly spaced, index values don't matter + known_x = cp.flatnonzero(valid_locs.values) + else: + known_x = index._column.apply_boolean_mask(valid_locs).values # type: ignore[attr-defined] + known_y = column.apply_boolean_mask(valid_locs).values result = cp.interp(index.to_cupy(), known_x, known_y) # find the first nan - first_nan_idx = (mask == 0).argmax().item() + first_nan_idx = valid_locs.values.argmax().item() result[:first_nan_idx] = np.nan - return result - - -def get_column_interpolator(method): - interpolator = { - "linear": _linear_interpolation, - "index": _index_or_values_interpolation, - "values": _index_or_values_interpolation, - }.get(method, None) - if not interpolator: - raise ValueError(f"Interpolation method `{method}` not found") - return interpolator + return as_column(result) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ff10051c52d..63fa96d0db0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -26,6 +26,8 @@ import cudf import cudf._lib as libcudf +import cudf.core +import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -1987,6 +1989,8 @@ def interpolate( "Use obj.ffill() or obj.bfill() instead.", FutureWarning, ) + elif method not in {"linear", "values", "index"}: + raise ValueError(f"Interpolation method `{method}` not found") data = self @@ -2000,7 +2004,10 @@ def interpolate( ) ) - interpolator = cudf.core.algorithms.get_column_interpolator(method) + if method == "linear": + interp_index = RangeIndex(self._num_rows) + else: + interp_index = data.index columns = [] for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): @@ -2012,8 +2019,9 @@ def interpolate( if col.nullable: col = col.astype("float64").fillna(np.nan) - # Interpolation methods may or may not need the index - columns.append(interpolator(col, index=data.index)) + columns.append( + cudf.core.algorithms._interpolation(col, index=interp_index) + ) result = self._from_data_like_self( self._data._from_columns_like_self(columns) diff --git a/python/cudf/cudf/core/multiindex.py 
b/python/cudf/cudf/core/multiindex.py index 9cbe863142b..dbbd1eab6c8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,6 +23,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core.algorithms import factorize from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -1373,9 +1374,6 @@ def from_arrays( (2, 'blue')], names=['number', 'color']) """ - # Imported here due to circular import - from cudf.core.algorithms import factorize - error_msg = "Input must be a list / sequence of array-likes." if not is_list_like(arrays): raise TypeError(error_msg) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 4a0dc331e1a..a4f0b9fc97e 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -135,3 +135,9 @@ def test_interpolate_dataframe_error_cases(data, kwargs): lfunc_args_and_kwargs=([], kwargs), rfunc_args_and_kwargs=([], kwargs), ) + + +def test_interpolate_noop_new_column(): + ser = cudf.Series([1.0, 2.0, 3.0]) + result = ser.interpolate() + assert ser._column is not result._column From cf88f8e045b279cbe5caa2e19ffadc7c6400aa58 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jul 2024 16:04:51 -1000 Subject: [PATCH 026/101] Defer copying in Column.astype(copy=True) (#16095) Avoids: 1. Copying `self` when the `astype` would already produce a new column with its own data 2. Copying `self` when the `astype` would raise an Exception Also cleans up some `as_categorical_column` logic. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16095 --- python/cudf/cudf/core/column/categorical.py | 20 ++--- python/cudf/cudf/core/column/column.py | 91 ++++++++++----------- 2 files changed, 51 insertions(+), 60 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 231af30c06d..cec7d5e6663 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1113,24 +1113,18 @@ def is_monotonic_decreasing(self) -> bool: def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) if ( - isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ) - and (dtype.categories is None) - and (dtype.ordered is None) + isinstance(dtype, cudf.CategoricalDtype) + and dtype.categories is None + and dtype.ordered is None ): return self - - if isinstance(dtype, pd.CategoricalDtype): - dtype = CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) - - if not isinstance(dtype, CategoricalDtype): + elif not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. 
return _create_empty_categorical_column(self, dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e7a2863da8c..adc783c20c4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -962,59 +962,59 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if len(self) == 0: dtype = cudf.dtype(dtype) if self.dtype == dtype: - if copy: - return self.copy() - else: - return self + result = self else: - return column_empty(0, dtype=dtype, masked=self.nullable) - if copy: - col = self.copy() - else: - col = self - if dtype == "category": + result = column_empty(0, dtype=dtype, masked=self.nullable) + elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string - return col.as_categorical_column(dtype) + result = self.as_categorical_column(dtype) elif ( isinstance(dtype, str) and dtype == "interval" and isinstance(self.dtype, cudf.IntervalDtype) ): # astype("interval") (the string only) should no-op - return col - was_object = dtype == object or dtype == np.dtype(object) - dtype = cudf.dtype(dtype) - if self.dtype == dtype: - return col - elif isinstance(dtype, CategoricalDtype): - return col.as_categorical_column(dtype) - elif isinstance(dtype, IntervalDtype): - return col.as_interval_column(dtype) - elif isinstance(dtype, (ListDtype, StructDtype)): - if not col.dtype == dtype: - raise NotImplementedError( - f"Casting {self.dtype} columns not currently supported" - ) - return col - elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - return col.as_decimal_column(dtype) - elif dtype.kind == "M": - return col.as_datetime_column(dtype) - elif dtype.kind == "m": - return col.as_timedelta_column(dtype) - elif dtype.kind == "O": - if cudf.get_option("mode.pandas_compatible") and was_object: - raise ValueError( - f"Casting to {dtype} is not supported, use " - "`.astype('str')` instead." - ) - return col.as_string_column(dtype) + result = self else: - return col.as_numerical_column(dtype) + was_object = dtype == object or dtype == np.dtype(object) + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + result = self + elif isinstance(dtype, CategoricalDtype): + result = self.as_categorical_column(dtype) + elif isinstance(dtype, IntervalDtype): + result = self.as_interval_column(dtype) + elif isinstance(dtype, (ListDtype, StructDtype)): + if not self.dtype == dtype: + raise NotImplementedError( + f"Casting {self.dtype} columns not currently supported" + ) + result = self + elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): + result = self.as_decimal_column(dtype) + elif dtype.kind == "M": + result = self.as_datetime_column(dtype) + elif dtype.kind == "m": + result = self.as_timedelta_column(dtype) + elif dtype.kind == "O": + if cudf.get_option("mode.pandas_compatible") and was_object: + raise ValueError( + f"Casting to {dtype} is not supported, use " + "`.astype('str')` instead." 
+ ) + result = self.as_string_column(dtype) + else: + result = self.as_numerical_column(dtype) + + if copy and result is self: + return result.copy() + return result def as_categorical_column(self, dtype) -> ColumnBase: - if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) + if isinstance(dtype, cudf.CategoricalDtype): ordered = dtype.ordered else: ordered = False @@ -1023,14 +1023,11 @@ def as_categorical_column(self, dtype) -> ColumnBase: if ( isinstance(dtype, cudf.CategoricalDtype) and dtype._categories is not None - ) or ( - isinstance(dtype, pd.CategoricalDtype) - and dtype.categories is not None ): - labels = self._label_encoding(cats=as_column(dtype.categories)) - + cat_col = dtype._categories + labels = self._label_encoding(cats=cat_col) return build_categorical_column( - categories=as_column(dtype.categories), + categories=cat_col, codes=labels, mask=self.mask, ordered=dtype.ordered, From 65e4e99d702aedbbfd489840d112faecfaeb43b9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 8 Jul 2024 23:10:23 -0500 Subject: [PATCH 027/101] Remove CCCL patch for PR 211. (#16207) While upgrading CCCL, we ran into a test failure in cuSpatial. We added a patch to revert some changes from CCCL but the root cause was a bug in cuSpatial. I have fixed that bug here: https://github.com/rapidsai/cuspatial/pull/1402 Once that PR is merged, we can remove this CCCL patch. See also: - rapids-cmake patch removal: https://github.com/rapidsai/rapids-cmake/pull/640 - Original rapids-cmake patch: https://github.com/rapidsai/rapids-cmake/pull/511 - CCCL epic to remove RAPIDS patches: https://github.com/NVIDIA/cccl/issues/1939 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16207 --- cpp/cmake/thirdparty/patches/cccl_override.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index e61102dffac..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/revert_pr_211.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", From b693e79b1813276700f70c2cb251d6fef71851a1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 9 Jul 2024 13:22:35 +0100 Subject: [PATCH 028/101] Handler csv reader options in cudf-polars (#16211) Previously we were just relying on the default cudf read_csv options which doesn't do the right thing if the user has configured things. Now that polars passes through the information to us, we can handle things properly, and raise for unsupported cases. While here, update to new polars release and adapt tests to bug fixes that have been made upstream. 
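As an example, CSV parse options like these are now forwarded to
`cudf.read_csv` instead of being silently ignored (the file contents below are
illustrative only):

```
import polars as pl

with open("data.csv", "w") as f:  # tiny example input
    f.write("# generated\na;b\n1;NA\n2;3\n")

q = pl.scan_csv("data.csv", separator=";", null_values="NA", comment_prefix="#")
```

Options the cudf reader cannot honor (per-column null values, multi-character
comment prefixes, `has_header=False`, skipping rows after the header) now raise
`NotImplementedError` rather than producing incorrect results.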
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16211 --- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf_polars/cudf_polars/dsl/expr.py | 4 +- python/cudf_polars/cudf_polars/dsl/ir.py | 104 +++++++++++++++-- .../cudf_polars/cudf_polars/dsl/translate.py | 12 +- python/cudf_polars/tests/test_scan.py | 107 ++++++++++++++++-- 5 files changed, 206 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c706351a683..9fecff5f5f6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -450,7 +450,7 @@ def read_csv( col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) - if names is not None and isinstance(names[0], (int)): + if names is not None and len(names) and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] # Set index if the index_col parameter is passed diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 93cb9db7cbd..f83d9e82d30 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -32,7 +32,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - import polars.polars as plrs + import polars as pl import polars.type_aliases as pl_types from cudf_polars.containers import DataFrame @@ -377,7 +377,7 @@ class LiteralColumn(Expr): value: pa.Array[Any, Any] children: tuple[()] - def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None: + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: super().__init__(dtype) data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6b552642e88..b32fa9c273e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,9 +15,9 @@ import dataclasses import itertools -import json import types from functools import cache +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa @@ -185,8 +185,10 @@ class Scan(IR): typ: str """What type of file are we reading? Parquet, CSV, etc...""" - options: tuple[Any, ...] 
- """Type specific options, as json-encoded strings.""" + reader_options: dict[str, Any] + """Reader-specific options, as dictionary.""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -206,9 +208,33 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.cloud_options is not None and any( + self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + ): raise NotImplementedError( - f"Unhandled scan type: {self.typ}" - ) # pragma: no cover; polars raises on the rust side for now + "Read from cloud storage" + ) # pragma: no cover; no test yet + if self.typ == "csv": + if self.reader_options["skip_rows_after_header"] != 0: + raise NotImplementedError("Skipping rows after header in CSV reader") + parse_options = self.reader_options["parse_options"] + if ( + null_values := parse_options["null_values"] + ) is not None and "Named" in null_values: + raise NotImplementedError( + "Per column null value specification not supported for CSV reader" + ) + if ( + comment := parse_options["comment_prefix"] + ) is not None and "Multi" in comment: + raise NotImplementedError( + "Multi-character comment prefix not supported for CSV reader" + ) + if not self.reader_options["has_header"]: + # Need to do some file introspection to get the number + # of columns so that column projection works right. + raise NotImplementedError("Reading CSV without header") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -216,14 +242,70 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - opts, cloud_opts = map(json.loads, self.options) - df = DataFrame.from_cudf( - cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + dtype_map = { + name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] + for name, typ in self.schema.items() + } + parse_options = self.reader_options["parse_options"] + sep = chr(parse_options["separator"]) + quote = chr(parse_options["quote_char"]) + eol = chr(parse_options["eol_char"]) + if self.reader_options["schema"] is not None: + # Reader schema provides names + column_names = list(self.reader_options["schema"]["inner"].keys()) + else: + # file provides column names + column_names = None + usecols = with_columns + # TODO: support has_header=False + header = 0 + + # polars defaults to no null recognition + null_values = [""] + if parse_options["null_values"] is not None: + ((typ, nulls),) = parse_options["null_values"].items() + if typ == "AllColumnsSingle": + # Single value + null_values.append(nulls) + else: + # List of values + null_values.extend(nulls) + if parse_options["comment_prefix"] is not None: + comment = chr(parse_options["comment_prefix"]["Single"]) + else: + comment = None + decimal = "," if parse_options["decimal_comma"] else "." + + # polars skips blank lines at the beginning of the file + pieces = [] + for p in self.paths: + skiprows = self.reader_options["skip_rows"] + # TODO: read_csv expands globs which we should not do, + # because polars will already have handled them. 
+ path = Path(p) + with path.open() as f: + while f.readline() == "\n": + skiprows += 1 + pieces.append( + cudf.read_csv( + path, + sep=sep, + quotechar=quote, + lineterminator=eol, + names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtype=dtype_map, + ) ) - ) + df = DataFrame.from_cudf(cudf.concat(pieces)) elif self.typ == "parquet": - opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5a1e682abe7..dec45679c75 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch from typing import Any @@ -12,6 +13,7 @@ import pyarrow as pa from typing_extensions import assert_never +import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir @@ -88,10 +90,16 @@ def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type + if typ == "ndjson": + (reader_options,) = map(json.loads, options) + cloud_options = None + else: + reader_options, cloud_options = map(json.loads, options) return ir.Scan( schema, typ, - tuple(options), + reader_options, + cloud_options, node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -402,7 +410,7 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): - return expr.LiteralColumn(dtype, node.value) + return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index f129cc7ca32..c41a94da14b 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -22,22 +22,22 @@ def row_index(request): @pytest.fixture( params=[ - (None, 0), + None, pytest.param( - (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 2, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), pytest.param( - (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 3, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), ], ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], ) -def n_rows_skip_rows(request): +def n_rows(request): return request.param @pytest.fixture(params=["csv", "parquet"]) -def df(request, tmp_path, row_index, n_rows_skip_rows): +def df(request, tmp_path, row_index, n_rows): df = pl.DataFrame( { "a": [1, 2, 3, None], @@ -46,14 +46,12 @@ def df(request, tmp_path, row_index, n_rows_skip_rows): } ) name, offset = row_index - n_rows, skip_rows = n_rows_skip_rows if request.param == "csv": df.write_csv(tmp_path / "file.csv") return pl.scan_csv( tmp_path / "file.csv", row_index_name=name, row_index_offset=offset, - skip_rows_after_header=skip_rows, n_rows=n_rows, ) else: @@ 
-97,3 +95,98 @@ def test_scan_unsupported_raises(tmp_path): df.write_ndjson(tmp_path / "df.json") q = pl.scan_ndjson(tmp_path / "df.json") assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_row_index_projected_out(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.pq") + + q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) + + assert_gpu_result_equal(q) + + +def test_scan_csv_column_renames_projection_schema(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + + q = pl.scan_csv( + tmp_path / "test.csv", + with_column_names=lambda names: [f"{n}_suffix" for n in names], + schema_overrides={ + "foo_suffix": pl.String(), + "bar_suffix": pl.Int8(), + "baz_suffix": pl.UInt16(), + }, + ) + + assert_gpu_result_equal(q) + + +def test_scan_csv_skip_after_header_not_implemented(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") + + q = pl.scan_csv(tmp_path / "test.csv", skip_rows_after_header=1) + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_csv_null_values_per_column_not_implemented(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") + + q = pl.scan_csv(tmp_path / "test.csv", null_values={"foo": "1", "baz": "5"}) + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_csv_comment_str_not_implemented(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n// 1,2,3\n3,4,5""") + + q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="// ") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_csv_comment_char(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n# 1,2,3\n3,4,5""") + + q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="#") + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("nulls", [None, "3", ["3", "5"]]) +def test_scan_csv_null_values(tmp_path, nulls): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2,3\n3,4,5\n5,,2""") + + q = pl.scan_csv(tmp_path / "test.csv", null_values=nulls) + + assert_gpu_result_equal(q) + + +def test_scan_csv_decimal_comma(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo|bar|baz\n1,23|2,34|3,56\n1""") + + q = pl.scan_csv(tmp_path / "test.csv", separator="|", decimal_comma=True) + + assert_gpu_result_equal(q) + + +def test_scan_csv_skip_initial_empty_rows(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""\n\n\n\nfoo|bar|baz\n1|2|3\n1""") + + q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1, has_header=False) + + assert_ir_translation_raises(q, NotImplementedError) + + q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1) + + assert_gpu_result_equal(q) From 75966deef548754a5a7f5fb49f1cf5b1be991363 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 9 Jul 2024 06:59:56 -0700 Subject: [PATCH 029/101] Publish cudf-polars nightlies (#16213) Publish nightlies for cudf-polars. 
Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16213
---
 .github/workflows/build.yaml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index c5679cc5141..2e5959338b0 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -108,6 +108,28 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: dask_cudf
+  wheel-build-cudf-polars:
+    needs: wheel-publish-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cudf_polars.sh
+  wheel-publish-cudf-polars:
+    needs: wheel-build-cudf-polars
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cudf_polars
   trigger-pandas-tests:
     if: inputs.build_type == 'nightly'
     needs: wheel-build-cudf

From 433e959deab26ccf1eb9b75b8ea3e21659da4f0a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 9 Jul 2024 10:45:05 -0400
Subject: [PATCH 030/101] Free temp memory no longer needed in multibyte_split processing (#16091)

Updates the `multibyte_split` logic to free temporary memory once the chars
and offsets have been resolved. This makes room for the remaining processing
steps if they require more temporary memory.
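The core of the change is scoping: the chunk-processing loop now runs inside an immediately invoked lambda that returns only the gathered offsets and chars, so the tile states, the data-chunk reader, and the `output_builder`s are destroyed as soon as their results are resolved. A minimal sketch of that idiom follows; the names are purely illustrative and host-side vectors stand in for device allocations:

```cpp
#include <utility>
#include <vector>

std::pair<std::vector<int>, std::vector<char>> gather_results()
{
  auto [offsets, chars] = [&] {
    // Temporaries needed only to produce the results live inside the lambda.
    std::vector<int> scratch(1 << 20);
    // ... use scratch to compute the offsets and chars ...
    std::vector<int> out_offsets{0, 4};
    std::vector<char> out_chars{'a', 'b', 'c', 'd'};
    return std::pair{std::move(out_offsets), std::move(out_chars)};
  }();  // scratch is destroyed here, before any later processing begins

  // Remaining steps can now allocate without competing with scratch.
  return {std::move(offsets), std::move(chars)};
}
```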
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/16091 --- cpp/src/io/text/multibyte_split.cu | 324 ++++++++++++++--------------- 1 file changed, 162 insertions(+), 162 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 51dc0ca90af..be2e2b9a79c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -55,6 +55,8 @@ #include #include +namespace cudf::io::text { +namespace detail { namespace { using cudf::io::text::detail::multistate; @@ -299,11 +301,6 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( } // namespace -namespace cudf { -namespace io { -namespace text { -namespace detail { - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, @@ -336,173 +333,181 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value, "delimiter contains too many total tokens to produce a deterministic result."); - auto const concurrency = 2; - - // must be at least 32 when using warp-reduce on partials - // must be at least 1 more than max possible concurrent tiles - // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_multistates = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - auto tile_offsets = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - - multibyte_split_init_kernel<<>>( // - -TILES_PER_CHUNK, - TILES_PER_CHUNK, - tile_multistates, - tile_offsets, - cudf::io::text::detail::scan_tile_status::oob); - - auto multistate_seed = multistate(); - multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - - // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as - // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow separate logic. - cudf::detail::device_single_thread( - [tm = scan_tile_state_view(tile_multistates), - to = scan_tile_state_view(tile_offsets), - multistate_seed] __device__() mutable { - tm.set_inclusive_prefix(-1, multistate_seed); - to.set_inclusive_prefix(-1, 0); - }, - stream); - - auto reader = source.create_reader(); - auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); - auto const byte_range_end = byte_range.offset() + byte_range.size(); - reader->skip_bytes(chunk_offset); - // amortize output chunk allocations over 8 worst-case outputs. 
This limits the overallocation - constexpr auto max_growth = 8; - output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); - output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); - - auto streams = cudf::detail::fork_streams(stream, concurrency); - - cudaEvent_t last_launch_event; - CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); - - auto& read_stream = streams[0]; - auto& scan_stream = streams[1]; - auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - int64_t base_tile_idx = 0; + auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); std::optional first_row_offset; - std::optional last_row_offset; - bool found_last_offset = false; if (byte_range.offset() == 0) { first_row_offset = 0; } - std::swap(read_stream, scan_stream); - - while (chunk->size() > 0) { - // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort - if (last_row_offset.has_value() or - (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { - break; - } - - auto tiles_in_launch = - cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); - - auto row_offsets = row_offset_storage.next_output(scan_stream); + std::optional last_row_offset; - // reset the next chunk of tile state - multibyte_split_init_kernel<<(num_tile_states, stream, rmm::mr::get_current_device_resource()); + auto tile_offsets = scan_tile_state( + num_tile_states, stream, rmm::mr::get_current_device_resource()); + + multibyte_split_init_kernel<<>>( // - base_tile_idx, - tiles_in_launch, + stream.value()>>>( // + -TILES_PER_CHUNK, + TILES_PER_CHUNK, tile_multistates, - tile_offsets); + tile_offsets, + cudf::io::text::detail::scan_tile_status::oob); - CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + auto multistate_seed = multistate(); + multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - if (delimiter.size() == 1) { - // the single-byte case allows for a much more efficient kernel, so we special-case it - byte_split_kernel<<>>( // - base_tile_idx, - chunk_offset, - row_offset_storage.size(), - tile_offsets, - delimiter[0], - *chunk, - row_offsets); - } else { - multibyte_split_kernel<<>>( // + // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as + // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block + // would have to follow separate logic. + cudf::detail::device_single_thread( + [tm = scan_tile_state_view(tile_multistates), + to = scan_tile_state_view(tile_offsets), + multistate_seed] __device__() mutable { + tm.set_inclusive_prefix(-1, multistate_seed); + to.set_inclusive_prefix(-1, 0); + }, + stream); + + auto reader = source.create_reader(); + auto const byte_range_end = byte_range.offset() + byte_range.size(); + reader->skip_bytes(chunk_offset); + // amortize output chunk allocations over 8 worst-case outputs. 
This limits the overallocation + constexpr auto max_growth = 8; + output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); + output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); + + auto streams = cudf::detail::fork_streams(stream, concurrency); + + cudaEvent_t last_launch_event; + CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); + + auto& read_stream = streams[0]; + auto& scan_stream = streams[1]; + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + int64_t base_tile_idx = 0; + bool found_last_offset = false; + std::swap(read_stream, scan_stream); + + while (chunk->size() > 0) { + // if we found the last delimiter, or didn't find delimiters inside the byte range at all: + // abort + if (last_row_offset.has_value() or + (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { + break; + } + + auto tiles_in_launch = + cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); + + auto row_offsets = row_offset_storage.next_output(scan_stream); + + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // base_tile_idx, - chunk_offset, - row_offset_storage.size(), + tiles_in_launch, tile_multistates, - tile_offsets, - {device_delim.data(), static_cast(device_delim.size())}, - *chunk, - row_offsets); - } + tile_offsets); + + CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + + if (delimiter.size() == 1) { + // the single-byte case allows for a much more efficient kernel, so we special-case it + byte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_offsets, + delimiter[0], + *chunk, + row_offsets); + } else { + multibyte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_multistates, + tile_offsets, + {device_delim.data(), static_cast(device_delim.size())}, + *chunk, + row_offsets); + } - // load the next chunk - auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - // while that is running, determine how many offsets we output (synchronizes) - auto const new_offsets = [&] { - auto const new_offsets_unclamped = - tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - - static_cast(row_offset_storage.size()); - // if we are not in the last chunk, we can use all offsets - if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { - return new_offsets_unclamped; + // load the next chunk + auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + // while that is running, determine how many offsets we output (synchronizes) + auto const new_offsets = [&] { + auto const new_offsets_unclamped = + tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - + static_cast(row_offset_storage.size()); + // if we are not in the last chunk, we can use all offsets + if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { + return new_offsets_unclamped; + } + // if we are in the last chunk, we need to find the first out-of-bounds offset + auto const it = thrust::make_counting_iterator(output_offset{}); + auto const end_loc = + *thrust::find_if(rmm::exec_policy_nosync(scan_stream), + it, + it + new_offsets_unclamped, + [row_offsets, byte_range_end] __device__(output_offset i) { + return row_offsets[i] >= byte_range_end; + }); + // if we had no out-of-bounds offset, we copy all offsets + if (end_loc == new_offsets_unclamped) { return end_loc; } + // otherwise we copy only up to (including) the first 
out-of-bounds delimiter + found_last_offset = true; + return end_loc + 1; + }(); + row_offset_storage.advance_output(new_offsets, scan_stream); + // determine if we found the first or last field offset for the byte range + if (new_offsets > 0 and not first_row_offset) { + first_row_offset = row_offset_storage.front_element(scan_stream); + } + if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } + // copy over the characters we need, if we already encountered the first field delimiter + if (first_row_offset.has_value()) { + auto const begin = + chunk->data() + std::max(0, *first_row_offset - chunk_offset); + auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); + auto const end = + chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); + auto const output_size = end - begin; + auto char_output = char_storage.next_output(scan_stream); + thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); + char_storage.advance_output(output_size, scan_stream); } - // if we are in the last chunk, we need to find the first out-of-bounds offset - auto const it = thrust::make_counting_iterator(output_offset{}); - auto const end_loc = - *thrust::find_if(rmm::exec_policy_nosync(scan_stream), - it, - it + new_offsets_unclamped, - [row_offsets, byte_range_end] __device__(output_offset i) { - return row_offsets[i] >= byte_range_end; - }); - // if we had no out-of-bounds offset, we copy all offsets - if (end_loc == new_offsets_unclamped) { return end_loc; } - // otherwise we copy only up to (including) the first out-of-bounds delimiter - found_last_offset = true; - return end_loc + 1; - }(); - row_offset_storage.advance_output(new_offsets, scan_stream); - // determine if we found the first or last field offset for the byte range - if (new_offsets > 0 and not first_row_offset) { - first_row_offset = row_offset_storage.front_element(scan_stream); - } - if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } - // copy over the characters we need, if we already encountered the first field delimiter - if (first_row_offset.has_value()) { - auto const begin = chunk->data() + std::max(0, *first_row_offset - chunk_offset); - auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); - auto const end = - chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); - auto const output_size = end - begin; - auto char_output = char_storage.next_output(scan_stream); - thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); - char_storage.advance_output(output_size, scan_stream); - } - CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); + CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); - std::swap(read_stream, scan_stream); - base_tile_idx += tiles_in_launch; - chunk_offset += chunk->size(); - chunk = std::move(next_chunk); - } + std::swap(read_stream, scan_stream); + base_tile_idx += tiles_in_launch; + chunk_offset += chunk->size(); + chunk = std::move(next_chunk); + } + + CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); - CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); + cudf::detail::join_streams(streams, stream); - cudf::detail::join_streams(streams, stream); + auto chars = char_storage.gather(stream, mr); + auto global_offsets = row_offset_storage.gather(stream, mr); + return std::pair{std::move(global_offsets), std::move(chars)}; + }(); // if the input was empty, we didn't find a delimiter 
at all,
  // or the first delimiter was also the last: empty output
@@ -511,9 +516,6 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
     return make_empty_column(type_id::STRING);
   }
 
-  auto chars          = char_storage.gather(stream, mr);
-  auto global_offsets = row_offset_storage.gather(stream, mr);
-
   // insert an offset at the beginning if we started at the beginning of the input
   bool const insert_begin = first_row_offset.value_or(0) == 0;
   // insert an offset at the end if we have not terminated the last row
@@ -591,6 +593,4 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   return result;
 }
 
-}  // namespace text
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::text

From 341e014ed22e7da1e4b8db66a1d7b6fd5fba98e9 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 9 Jul 2024 15:47:38 -0400
Subject: [PATCH 031/101] Support `pd.read_pickle` and `pd.to_pickle` in `cudf.pandas` (#16105)

Closes #15459

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16105
---
 python/cudf/cudf/pandas/_wrappers/pandas.py       | 6 ++++++
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index dd6f6fe76ba..3f94fc18980 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -919,6 +919,12 @@ def Index__new__(cls, *args, **kwargs):
 
 _eval_func = _FunctionProxy(_Unusable(), pd.eval)
 
+register_proxy_func(pd.read_pickle)(
+    _FunctionProxy(_Unusable(), pd.read_pickle)
+)
+
+register_proxy_func(pd.to_pickle)(_FunctionProxy(_Unusable(), pd.to_pickle))
+
 
 def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None):
     frame = sys._getframe(level + 3)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index b0aeaba3916..bc864a48e9d 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1080,6 +1080,13 @@ def test_pickle(obj):
 
     tm.assert_equal(obj, copy)
 
+    with tempfile.TemporaryFile() as f:
+        xpd.to_pickle(obj, f)
+        f.seek(0)
+        copy = xpd.read_pickle(f)
+
+    tm.assert_equal(obj, copy)
+
 
 def test_dataframe_query():
     cudf_pandas_df = xpd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})

From 7cc01befa61d7957093bf32b99b4cac1364761f7 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:05:11 -0700
Subject: [PATCH 032/101] Parallelize `gpuInitStringDescriptors` for fixed length byte array data (#16109)

Closes #14113

This PR parallelizes the `gpuInitStringDescriptors` function for fixed-length
byte array (FLBA) data at either warp or thread block level via cooperative
groups. The function continues to execute serially (thread rank 0 in the
group) for variable-length arrays.
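For intuition, here is a rough, self-contained sketch of the dispatch pattern this PR adopts; it is not the actual kernel, and the names and the toy doubling workload are illustrative. A warp-sized tile built with cooperative groups fans out over fixed-stride positions, while only rank 0 of the group walks variable-length input serially:

```cpp
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Toy stand-in for gpuInitStringDescriptors' control flow: group-parallel
// for fixed-length data, serial on thread rank 0 for variable-length data.
template <typename ThreadGroup>
__device__ void process(ThreadGroup const& g, int const* in, int n, int* out, bool fixed_len)
{
  if (fixed_len) {
    // Every thread in the group handles a strided subset of positions.
    for (int i = g.thread_rank(); i < n; i += g.size()) { out[i] = 2 * in[i]; }
  } else if (g.thread_rank() == 0) {
    // Variable-length entries chain off one another, so stay serial.
    for (int i = 0; i < n; ++i) { out[i] = 2 * in[i]; }
  }
  g.sync();
}

__global__ void kernel(int const* in, int n, int* out, bool fixed_len)
{
  // Mirrors the tiling call used in the patch to build a warp-sized group.
  auto const tile = cg::tiled_partition<32>(cg::this_thread_block());
  process(tile, in, n, out, fixed_len);
}
```

Because the function is templated on the group type, call sites can pass either a warp tile (as in `page_data.cu`) or the whole thread block (as in `decode_preprocess.cu`) without duplicating the logic.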
CC: @etseidl Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16109 --- cpp/src/io/parquet/decode_preprocess.cu | 5 +- cpp/src/io/parquet/page_data.cu | 7 ++- cpp/src/io/parquet/page_decode.cuh | 69 +++++++++++++++--------- cpp/src/io/parquet/page_string_decode.cu | 10 +++- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index e49801e6172..62f1ee88036 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -26,6 +26,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { // # of threads we're decoding with @@ -163,7 +165,8 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) // For V1, the choice is an overestimate (s->dict_size), or an exact number that's // expensive to compute. For now we're going with the latter. else { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + str_len = gpuInitStringDescriptors( + s, nullptr, target_pos, cg::this_thread_block()); } break; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 7207173b82f..e0d50d7ccf9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,6 +23,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int decode_block_size = 128; @@ -277,6 +279,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 3 modifying src_pos before all threads have read it __syncthreads(); + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); if (t < 32) { // decode repetition and definition levels. 
// - update validity vectors @@ -298,9 +301,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); } else if (s->col.physical_type == BYTE_ARRAY or s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp); } - if (t == 32) { s->dict_pos = src_target_pos; } + if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.physical_type; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b1f8e6dd5fe..a3f91f6859b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -21,6 +21,7 @@ #include "parquet_gpu.hpp" #include "rle_stream.cuh" +#include #include #include @@ -420,46 +421,62 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int t * @param[in,out] s Page state input/output * @param[out] sb Page state buffer output * @param[in] target_pos Target output position - * @param[in] t Thread ID + * @param[in] g Cooperative group (thread block or tile) * @tparam sizes_only True if only sizes are to be calculated * @tparam state_buf Typename of the `state_buf` (usually inferred) + * @tparam thread_group Typename of the cooperative group (inferred) * * @return Total length of strings processed */ -template -__device__ size_type -gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t) +template +__device__ size_type gpuInitStringDescriptors(page_state_s* s, + [[maybe_unused]] state_buf* sb, + int target_pos, + thread_group const& g) { - int pos = s->dict_pos; - int total_len = 0; + int const t = g.thread_rank(); + int const dict_size = s->dict_size; + int k = s->dict_val; + int pos = s->dict_pos; + int total_len = 0; + + // All group threads can participate for fixed len byte arrays. + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { + int const dtype_len_in = s->dtype_len_in; + total_len = min((target_pos - pos) * dtype_len_in, dict_size - s->dict_val); + if constexpr (!sizes_only) { + for (pos += t, k += t * dtype_len_in; pos < target_pos; pos += g.size()) { + sb->str_len[rolling_index(pos)] = + (k < dict_size) ? dtype_len_in : 0; + // dict_idx is upperbounded by dict_size. + sb->dict_idx[rolling_index(pos)] = k; + // Increment k if needed. 
+ if (k < dict_size) { k = min(k + (g.size() * dtype_len_in), dict_size); } + } + } + // Only thread_rank = 0 updates the s->dict_val + if (!t) { s->dict_val += total_len; } + } + // This step is purely serial for byte arrays + else { + if (!t) { + uint8_t const* cur = s->data_start; - // This step is purely serial - if (!t) { - uint8_t const* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len = 0; - if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - if (k < dict_size) { len = s->dtype_len_in; } - } else { + for (int len = 0; pos < target_pos; pos++, len = 0) { if (k + 4 <= dict_size) { len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); k += 4; if (k + len > dict_size) { len = 0; } } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; + } + k += len; + total_len += len; } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; + s->dict_val = k; } - s->dict_val = k; - __threadfence_block(); } return total_len; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 58e8a09d5b6..ca74a1c2ba0 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -31,6 +31,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int preprocess_block_size = 512; @@ -1006,6 +1008,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 1/2 modifying src_pos before all threads have read it __syncthreads(); + + // Create a warp sized thread block tile + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); + if (t < 32) { // decode repetition and definition levels. 
// - update validity vectors
@@ -1020,9 +1026,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
       if (s->dict_base) {
         src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first;
       } else {
-        gpuInitStringDescriptors(s, sb, src_target_pos, lane_id);
+        gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp);
       }
-      if (t == 32) { s->dict_pos = src_target_pos; }
+      if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; }
     } else {
       int const me = t - out_thread0;

From 248b2de61e6b9df5fec5a15019d6db4dc52cbc01 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 9 Jul 2024 18:50:19 -0400
Subject: [PATCH 033/101] Migrate pylibcudf lists gathering (#16170)

Part of #15162.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16170
---
 .../_lib/pylibcudf/libcudf/lists/gather.pxd   |  4 +--
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 32 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 14 ++++++++
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
index 17b4c1877a6..ab7ed141365 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
@@ -10,6 +10,6 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
 
 cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] segmented_gather(
-        const lists_column_view source_column,
-        const lists_column_view gather_map_list
+        const lists_column_view& source_column,
+        const lists_column_view& gather_map_list
     ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index c9d0a84e8ac..c9c43751a43 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -25,3 +25,5 @@ cpdef Column contains_nulls(Column)
 cpdef Column index_of(Column, ColumnOrScalar, bool)
 
 cpdef Column reverse(Column)
+
+cpdef Column segmented_gather(Column, Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 651f1346f88..9c56f1139c6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    gather as cpp_gather,
     reverse as cpp_reverse,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
@@ -232,3 +233,34 @@ cpdef Column reverse(Column input):
         list_view.view(),
     ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column segmented_gather(Column input, Column gather_map_list):
+    """Create a column with elements gathered based on the indices in gather_map_list.
+
+    For details, see :cpp:func:`segmented_gather`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    gather_map_list : Column
+        The indices of the lists column to gather.
+
+    Returns
+    -------
+    Column
+        A new Column whose list rows contain the elements gathered
+        according to gather_map_list.
+    """
+
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view1 = input.list_view()
+    cdef ListColumnView list_view2 = gather_map_list.list_view()
+
+    with nogil:
+        c_result = move(cpp_gather.segmented_gather(
+            list_view1.view(),
+            list_view2.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 58a1dcf8d56..0d95579acb3 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -146,3 +146,17 @@ def test_reverse(test_data):
     expect = pa.array([lst[::-1] for lst in list_column])
 
     assert_column_eq(expect, res)
+
+
+def test_segmented_gather(test_data):
+    list_column1 = test_data[0][0]
+    list_column2 = test_data[0][1]
+
+    plc_column1 = plc.interop.from_arrow(pa.array(list_column1))
+    plc_column2 = plc.interop.from_arrow(pa.array(list_column2))
+
+    res = plc.lists.segmented_gather(plc_column2, plc_column1)
+
+    expect = pa.array([[8, 9], [14], [0], [0, 0]])
+
+    assert_column_eq(expect, res)

From 67bd3669947da33fc56eb2b397ebbdb66223119e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 9 Jul 2024 16:29:08 -0700
Subject: [PATCH 034/101] Support `arrow:schema` in Parquet writer to faithfully roundtrip `duration` types with Arrow (#15875)

Closes #15847

This PR adds support for constructing and writing a base64-encoded, serialized
`arrow:schema`-type IPC message to the parquet file footer, allowing faithful
roundtripping of `duration` types with Arrow via Parquet.

### Answered

- [x] Only construct and write `arrow:schema` if asked by the user via the `store_schema` argument (cudf) or `write_arrow_schema` (libcudf); i.e., both default to `false`.
- [x] The internal/libcudf variable name for `store_schema` stays `write_arrow_schema`, to disambiguate which schema (arrow or parquet) we are talking about.
- [x] Separate PR: `int96_timestamps` cannot be deprecated/removed in cuDF as Spark is actively using it. #15901
- [x] The cuDF Parquet writer supports the `decimal32` and `decimal64` [fixed types](https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/io/parquet/writer_impl.cu#L561). These are not directly supported by Arrow, so we [convert](https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155) `decimal32`/`decimal64` columns to `decimal128`.
- [x] The `is_col_nullable()` function moved to `writer_impl_helpers.cpp`, along with some other helper functions.
- [x] A common `convert_data_to_decimal128` can be separated out and used in `writer_impl.cu` and `to_arrow.cu`. Tracking in a separate issue. #16194

CC @vuule @etseidl @nvdbaranec @GregoryKimball @galipremsagar for vis.
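A minimal sketch of opting in from the libcudf side, using the builder method added in this PR; the output path, function name, and table contents are placeholders, and the Python-side equivalent is the new `store_schema` argument:

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

// Write `tbl` (assumed to contain a duration column) with the serialized
// ARROW:schema IPC message embedded in the footer's key-value metadata.
void write_with_arrow_schema(cudf::table_view const& tbl)
{
  auto opts = cudf::io::parquet_writer_options::builder(
                cudf::io::sink_info{"out.parquet"}, tbl)
                .write_arrow_schema(true)  // option added by this PR
                .build();
  cudf::io::write_parquet(opts);
}
```

Note that, per the validation added in `functions.cpp` below, enabling this together with `int96_timestamps` throws, since INT96 timestamps are deprecated in Arrow.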
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Thomas Li (https://github.com/lithomas1) - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15875 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/io/parquet.hpp | 25 ++ cpp/src/io/functions.cpp | 18 + cpp/src/io/parquet/arrow_schema_writer.cpp | 388 ++++++++++++++++++ cpp/src/io/parquet/arrow_schema_writer.hpp | 53 +++ cpp/src/io/parquet/parquet_common.hpp | 10 + cpp/src/io/parquet/reader_impl_helpers.cpp | 15 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 5 +- cpp/src/io/parquet/writer_impl.cu | 337 +++++++++------ cpp/src/io/parquet/writer_impl.hpp | 1 + cpp/src/io/parquet/writer_impl_helpers.cpp | 131 ++++++ cpp/src/io/parquet/writer_impl_helpers.hpp | 97 +++++ cpp/tests/io/parquet_writer_test.cpp | 89 +++- python/cudf/cudf/_lib/parquet.pyx | 11 +- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 5 + python/cudf/cudf/io/parquet.py | 11 + python/cudf/cudf/tests/test_parquet.py | 379 ++++++++++++++--- python/cudf/cudf/utils/ioutils.py | 6 + 18 files changed, 1386 insertions(+), 197 deletions(-) create mode 100644 cpp/src/io/parquet/arrow_schema_writer.cpp create mode 100644 cpp/src/io/parquet/arrow_schema_writer.hpp create mode 100644 cpp/src/io/parquet/writer_impl_helpers.cpp create mode 100644 cpp/src/io/parquet/writer_impl_helpers.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7999ada9282..903cff27be4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -409,6 +409,7 @@ add_library( src/io/orc/stripe_init.cu src/datetime/timezone.cpp src/io/orc/writer_impl.cu + src/io/parquet/arrow_schema_writer.cpp src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu @@ -425,6 +426,7 @@ add_library( src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu + src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 431f14af522..4d98cae73a7 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -597,6 +597,8 @@ class parquet_writer_options_base { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = false; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -689,6 +691,13 @@ class parquet_writer_options_base { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns maximum row group size, in bytes. * @@ -824,6 +833,13 @@ class parquet_writer_options_base { */ void enable_utc_timestamps(bool val); + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. 
+ * + * @param val Boolean value to enable/disable writing of arrow schema. + */ + void enable_write_arrow_schema(bool val); + /** * @brief Sets the maximum row group size, in bytes. * @@ -1084,6 +1100,15 @@ class parquet_writer_options_builder_base { * @return this for chaining */ BuilderT& utc_timestamps(bool enabled); + + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + BuilderT& write_arrow_schema(bool enabled); + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 5daa55d4552..b4ece9cec66 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -762,6 +762,9 @@ void parquet_writer_options_base::set_compression(compression_type compression) void parquet_writer_options_base::enable_int96_timestamps(bool req) { + CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(), + "INT96 timestamps and arrow schema cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); _write_timestamps_as_int96 = req; } @@ -770,6 +773,14 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val) _write_timestamps_as_UTC = val; } +void parquet_writer_options_base::enable_write_arrow_schema(bool val) +{ + CUDF_EXPECTS(not val or not is_enabled_int96_timestamps(), + "arrow schema and INT96 timestamps cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); + _write_arrow_schema = val; +} + void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { CUDF_EXPECTS( @@ -974,6 +985,13 @@ BuilderT& parquet_writer_options_builder_base::utc_timestamp return static_cast(*this); } +template +BuilderT& parquet_writer_options_builder_base::write_arrow_schema(bool enabled) +{ + _options.enable_write_arrow_schema(enabled); + return static_cast(*this); +} + template BuilderT& parquet_writer_options_builder_base::write_v2_headers(bool enabled) { diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp new file mode 100644 index 00000000000..ddf65e9020f --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file arrow_schema_writer.cpp + * @brief Arrow IPC schema writer implementation + */ + +#include "arrow_schema_writer.hpp" + +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" +#include "writer_impl_helpers.hpp" + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +namespace { + +// Copied over from arrow source for better code readability +namespace flatbuf = cudf::io::parquet::flatbuf; +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + +/** + * @brief Recursively construct the arrow schema (fields) tree + * + * @param fbb The root flatbuffer builder object instance + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return Flatbuffer offset to the constructed field + */ +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps); + +/** + * @brief Functor to convert cudf column metadata to arrow schema field metadata + */ +struct dispatch_to_flatbuf { + FlatBufferBuilder& fbb; + cudf::detail::LinkedColPtr const& col; + column_in_metadata const& col_meta; + single_write_mode const write_mode; + bool const utc_timestamps; + Offset& field_offset; + flatbuf::Type& field_type_id; + std::vector& children; + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + 
field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Date; + // Date type (Set unit type to DAY for arrows's Date32) + field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. + // This also allows for easy and faithful roundtripping with cudf. + field_type_id = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + } + + template + std::enable_if_t(), void> operator()() + { + field_type_id = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, + (col_meta.is_decimal_precision_set()) + ? col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, + col->type().scale(), + 128) + .Union(); + } + + template + std::enable_if_t(), void> operator()() + { + // Lists are represented differently in arrow and cuDF. 
+ // cuDF representation: List: "col_name" : { "list", "element:int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + // Hence, we only need to process the second child of the list. + if constexpr (std::is_same_v) { + children.emplace_back(make_arrow_schema_fields( + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + field_type_id = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); + } + + // Traverse the struct in DFS manner and process children fields. + else if constexpr (std::is_same_v) { + std::transform(thrust::make_counting_iterator(0UL), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + }); + field_type_id = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); + } + } + + template + std::enable_if_t(), void> operator()() + { + // `dictionary32` columns are not written to parquet by cudf. + CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Variables to be set by the dispatch_to_flatbuf functor + Offset field_offset = 0; + flatbuf::Type field_type_id = flatbuf::Type_NONE; + std::vector children; + + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + field_offset, + field_type_id, + children}); + + // push to field offsets vector + return flatbuf::CreateField( + fbb, + fbb.CreateString(column_metadata.get_name()), // name + is_output_column_nullable(column, column_metadata, write_mode), // nullable + field_type_id, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector +} + +} // namespace + +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + // Instantiate a flatbuffer builder + FlatBufferBuilder fbb; + + // Create an empty field offset vector and reserve space for linked columns + std::vector field_offsets; + field_offsets.reserve(linked_columns.size()); + + // populate field offsets (aka schema fields) + std::transform(thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), + std::back_inserter(field_offsets), + [&](auto const& elem) { + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + }); + + // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to + // create an ipc message flatbuffer + fbb.Finish(flatbuf::CreateMessage( + fbb, + flatbuf::MetadataVersion_V5, // Metadata version V5 (latest) + 
+    flatbuf::MessageHeader_Schema,  // Schema type message header
+    flatbuf::CreateSchema(fbb,
+                          flatbuf::Endianness::Endianness_Little,
+                          fbb.CreateVector(field_offsets))
+      .Union(),                                // arrow:schema built from the field vector
+    SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH  // Body length is zero for schema type ipc message
+    ));
+
+  // Construct the final string and store it here to use its view in base64_encode
+  std::string const ipc_message =
+    convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) +
+    // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the
+    // ipc message's metadata length
+    convert_int32_to_byte_string(fbb.GetSize()) +
+    std::string(reinterpret_cast<char const*>(fbb.GetBufferPointer()), fbb.GetSize());
+
+  // Encode the final ipc message string to base64 and return
+  return cudf::io::detail::base64_encode(ipc_message);
+}
+
+} // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp
new file mode 100644
index 00000000000..9bc435bf6c8
--- /dev/null
+++ b/cpp/src/io/parquet/arrow_schema_writer.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file arrow_schema_writer.hpp
+ * @brief Arrow IPC schema writer implementation
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace cudf::io::parquet::detail {
+
+/**
+ * @brief Construct and return arrow schema from input parquet schema
+ *
+ * Recursively traverses through parquet schema to construct the arrow schema tree.
+ * Serializes the arrow schema tree and stores it as the header (or metadata) of
+ * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended
+ * with the IPC continuation token and the header size (padded for 16 byte alignment).
+ * The final string is base64 encoded and returned.
+ * + * @param linked_columns Vector of table column views + * @param metadata Metadata of the columns of the table + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return The constructed arrow ipc message string + */ +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..e42c259b1bf 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cudf::io::parquet::detail { @@ -26,6 +27,15 @@ auto constexpr MAX_DECIMAL32_PRECISION = 9; auto constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) +// Constants copied from arrow source and renamed to match the case +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +int32_t constexpr IPC_CONTINUATION_TOKEN = -1; +std::string const ARROW_SCHEMA_KEY = "ARROW:schema"; + +// Schema type ipc message has zero length body +int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index ebd4affd099..d1e9a823d3b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -564,14 +564,14 @@ aggregate_reader_metadata::aggregate_reader_metadata( // Collect and apply arrow:schema from Parquet's key value metadata section if (use_arrow_schema) { apply_arrow_schema(); } - // Erase "ARROW:schema" from the output pfm if exists + // Erase ARROW_SCHEMA_KEY from the output pfm if exists std::for_each( - keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase("ARROW:schema"); }); + keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase(ARROW_SCHEMA_KEY); }); } arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const { - // Check the key_value metadata for ARROW:schema, decode and walk it + // Check the key_value metadata for arrow schema, decode and walk it // Function to convert from flatbuf::duration type to cudf::type_id auto const duration_from_flatbuffer = [](flatbuf::Duration const* duration) { // TODO: we only need this for arrow::DurationType for now. Else, we can take in a @@ -645,9 +645,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const return true; }; - // TODO: Should we check if any file has the "ARROW:schema" key - // Or if all files have the same "ARROW:schema"? 
- auto const it = keyval_maps[0].find("ARROW:schema"); + auto const it = keyval_maps[0].find(ARROW_SCHEMA_KEY); if (it == keyval_maps[0].end()) { return {}; } // Decode the base64 encoded ipc message string @@ -788,11 +786,6 @@ void aggregate_reader_metadata::apply_arrow_schema() std::optional aggregate_reader_metadata::decode_ipc_message( std::string_view const serialized_message) const { - // Constants copied from arrow source and renamed to match the case - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); - constexpr int32_t IPC_CONTINUATION_TOKEN = -1; - // message buffer auto message_buf = serialized_message.data(); // current message (buffer) size diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9aeb19a7723..6bfa8519c76 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -117,6 +117,9 @@ struct metadata : public FileMetaData { void sanitize_schema(); }; +/** + * @brief Class to extract data types from arrow schema tree + */ struct arrow_schema_data_types { std::vector children; data_type type{type_id::EMPTY}; @@ -142,7 +145,7 @@ class aggregate_reader_metadata { const; /** - * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message + * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer */ [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index bed4dbc5a66..66b4fce16fe 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" @@ -30,6 +31,7 @@ #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" +#include "writer_impl_helpers.hpp" #include #include @@ -39,9 +41,6 @@ #include #include #include -#include -#include -#include #include #include @@ -70,7 +69,8 @@ struct aggregate_writer_metadata { host_span const> kv_md, host_span tbl_schema, size_type num_columns, - statistics_freq stats_granularity) + statistics_freq stats_granularity, + std::string const arrow_schema_ipc_message) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -92,6 +92,13 @@ struct aggregate_writer_metadata { return KeyValue{kv.first, kv.second}; }); } + + // Append arrow schema to the key-value metadata + if (not arrow_schema_ipc_message.empty()) { + std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { + file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message}); + }); + } } aggregate_writer_metadata(aggregate_writer_metadata const&) = default; @@ -182,26 +189,6 @@ struct aggregate_writer_metadata { namespace { -/** - * @brief Function that translates GDF compression to parquet compression. 
- * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - /** * @brief Convert a mask of encodings to a vector. * @@ -326,6 +313,7 @@ struct leaf_schema_fn { column_in_metadata const& col_meta; bool timestamp_is_int96; bool timestamp_is_utc; + bool write_arrow_schema; template std::enable_if_t, void> operator()() @@ -493,10 +481,11 @@ struct leaf_schema_fn { } } - // unsupported outside cudf for parquet 1.0. template std::enable_if_t, void> operator()() { + // duration_D is based on int32_t and not a valid arrow duration type so simply convert to + // time32(ms). col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; @@ -507,62 +496,86 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. 
Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + } } - // unsupported outside cudf for parquet 1.0. template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template std::enable_if_t(), void> operator()() { - if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; - } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; - col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; - } else if (std::is_same_v) { + // If writing arrow schema, then convert d32 and d64 to d128 + if (write_arrow_schema or std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { - CUDF_FAIL("Unsupported fixed point type for parquet writer"); + if (std::is_same_v) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; + } else if (std::is_same_v) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; + } else { + CUDF_FAIL("Unsupported fixed point type for parquet writer"); + } } + + // Write logical and converted types, decimal scale and precision col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs col_schema.logical_type->decimal_type->scale = -col->type().scale(); @@ -590,33 +603,19 @@ struct leaf_schema_fn { } }; -inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - 
column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Construct schema from input columns and per-column input options * * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. */ -std::vector construct_schema_tree( +std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, bool int96_timestamps, - bool utc_timestamps) + bool utc_timestamps, + bool write_arrow_schema) { std::vector schema; schema_tree_node root{}; @@ -629,7 +628,7 @@ std::vector construct_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool const col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode); auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -854,7 +853,7 @@ std::vector construct_schema_tree( right_child_meta.set_name("value"); // check the repetition type of key is required i.e. the col should be non-nullable auto key_col = col->children[lists_column_view::child_column_index]->children[0]; - CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode), + CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode), "key column cannot be nullable. For chunked writing, explicitly set the " "nullability to false in metadata"); // process key @@ -886,7 +885,8 @@ std::vector construct_schema_tree( cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); + leaf_schema_fn{ + col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -1148,7 +1148,6 @@ void calculate_page_fragments(device_span frag, * * @param frag_stats output statistics * @param frags Input page fragments - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, @@ -1164,32 +1163,6 @@ void gather_fragment_statistics(device_span frag_stats, stream.synchronize(); } -auto to_nvcomp_compression_type(Compression codec) -{ - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -auto page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, @@ -1629,23 +1602,127 @@ size_t column_index_buffer_size(EncColumnChunk* ck, } /** - * @brief Fill the table metadata with default column names. + * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector * - * @param table_meta The table metadata to fill + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data */ -void fill_table_meta(std::unique_ptr const& table_meta) +template +rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, + rmm::cuda_stream_view stream) { - // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); - for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); - } - }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); - } + size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + + rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); + + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(d128_buffer.data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0;
+                     }
+                   });
+
+  return d128_buffer;
+}
+
+/**
+ * @brief Function to convert decimal32 and decimal64 columns to decimal128 data,
+ * update the input table metadata, and return a new vector of column views.
+ *
+ * @param[in,out] table_meta The table metadata
+ * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers
+ * @param table The input table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return A vector of column views with decimal32 and decimal64 columns converted to decimal128
+ */
+std::vector<column_view> convert_decimal_columns_and_metadata(
+  table_input_metadata& table_meta,
+  std::vector<rmm::device_uvector<__int128_t>>& d128_vectors,
+  table_view const& table,
+  rmm::cuda_stream_view stream)
+{
+  // Lambda function to convert each decimal32/decimal64 column to decimal128.
+  std::function<column_view(column_view, column_in_metadata&)> convert_column =
+    [&](column_view column, column_in_metadata& metadata) -> column_view {
+    // Vector of passable-by-reference children column views
+    std::vector<column_view> converted_children;
+
+    // Process children column views first
+    std::transform(
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(column.num_children()),
+      std::back_inserter(converted_children),
+      [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); });
+
+    // Process this column view. Only convert decimal32 and decimal64 columns.
+    switch (column.type().id()) {
+      case type_id::DECIMAL32:
+        // Convert data to decimal128 type
+        d128_vectors.emplace_back(convert_data_to_decimal128<int32_t>(column, stream));
+        // Update metadata
+        metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION);
+        metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
+        // Create a new column view from the d128 data vector
+        return {data_type{type_id::DECIMAL128, column.type().scale()},
+                column.size(),
+                d128_vectors.back().data(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+      case type_id::DECIMAL64:
+        // Convert data to decimal128 type
+        d128_vectors.emplace_back(convert_data_to_decimal128<int64_t>(column, stream));
+        // Update metadata
+        metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION);
+        metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
+        // Create a new column view from the d128 data vector
+        return {data_type{type_id::DECIMAL128, column.type().scale()},
+                column.size(),
+                d128_vectors.back().data(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+      default:
+        // Update the children vector keeping everything else the same
+        return {column.type(),
+                column.size(),
+                column.head(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+    }
+  };
+
+  // Vector of converted column views
+  std::vector<column_view> converted_column_views;
+
+  // Convert each column view
+  std::transform(
+    thrust::make_zip_iterator(
+      thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())),
+    std::back_inserter(converted_column_views),
+    [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); });
+
+  return converted_column_views;
+}
+
 /**
@@ -1698,12 +1775,22 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                                    bool int96_timestamps,
                                    bool utc_timestamps,
                                    bool write_v2_headers,
+                                   bool write_arrow_schema,
                                    host_span<std::unique_ptr<data_sink> const> out_sink,
                                    rmm::cuda_stream_view stream)
 {
-  auto vec =
table_to_linked_columns(input); - auto schema_tree = - construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); + // Container to store decimal128 converted data if needed + std::vector> d128_vectors; + + // Convert decimal32/decimal64 data to decimal128 if writing arrow schema + // and initialize LinkedColVector + auto vec = table_to_linked_columns( + (write_arrow_schema) + ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + : input); + + auto schema_tree = construct_parquet_schema_tree( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. std::vector parquet_columns; @@ -1826,7 +1913,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity); + partitions, + kv_meta, + this_table_schema, + num_columns, + stats_granularity, + (write_arrow_schema) + ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2307,6 +2401,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2337,6 +2432,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2378,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector co CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = std::make_unique(input); } - fill_table_meta(_table_meta); + fill_table_meta(*_table_meta); // All kinds of memory allocation and data compressions/encoding are performed here. // If any error occurs, such as out-of-memory exception, the internal state of the current @@ -2415,6 +2511,7 @@ void writer::impl::write(table_view const& input, std::vector co _int96_timestamps, _utc_timestamps, _write_v2_headers, + _write_arrow_schema, _out_sink, _stream); } catch (...) { // catch any exception type diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 784f78f06d5..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -156,6 +156,7 @@ class writer::impl { bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; + bool const _write_arrow_schema; std::optional> _sorting_columns; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. 
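
The hunks above thread the new `write_arrow_schema` option from the writer options through `convert_table_to_parquet_data` and into the footer's key-value metadata. A minimal usage sketch of the public API this enables follows; the builder calls mirror the C++ tests later in this patch, while the helper name and file path are illustrative only:

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

// Illustrative helper: write `table` with the serialized arrow schema stored
// under the "ARROW:schema" key in the footer's key-value metadata.
void write_with_arrow_schema(cudf::table_view const& table, std::string const& path)
{
  auto const options =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, table)
      .write_arrow_schema(true)  // embed the arrow:schema ipc message
      .utc_timestamps(true)      // annotate timestamps as UTC in both schemas
      .build();
  // Note: enabling int96_timestamps together with the arrow schema is
  // rejected by the writer (see the Python test later in this patch).
  cudf::io::write_parquet(options);
}
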
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp new file mode 100644 index 00000000000..e2f09f872d3 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file writer_impl_helpers.cpp + * @brief Helper function implementation for Parquet writer + */ + +#include "writer_impl_helpers.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec) +{ + switch (codec) { + case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; + case Compression::ZSTD: return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +uint32_t page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + +void fill_table_meta(table_input_metadata& table_meta) +{ + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) { + add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i)); + } +} + +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +{ + if (column.is_empty()) { return 0; } + + if (is_fixed_width(column.type())) { + return size_of(column.type()) * column.size(); + } else if (column.type().id() == type_id::STRING) { + auto const scol = strings_column_view(column); + return cudf::strings::detail::get_offset_value( + 
scol.offsets(), column.size() + column.offset(), stream) -
+           cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream);
+  } else if (column.type().id() == type_id::STRUCT) {
+    auto const scol = structs_column_view(column);
+    size_t ret      = 0;
+    for (int i = 0; i < scol.num_children(); i++) {
+      ret += column_size(scol.get_sliced_child(i, stream), stream);
+    }
+    return ret;
+  } else if (column.type().id() == type_id::LIST) {
+    auto const lcol = lists_column_view(column);
+    return column_size(lcol.get_sliced_child(stream), stream);
+  }
+
+  CUDF_FAIL("Unexpected compound type");
+}
+
+[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column,
+                                             column_in_metadata const& column_metadata,
+                                             single_write_mode write_mode)
+{
+  if (column_metadata.is_nullability_defined()) {
+    CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0,
+                 "Mismatch in metadata prescribed nullability and input column. "
+                 "Metadata for input column with nulls cannot prescribe nullability = false");
+    return column_metadata.nullable();
+  }
+  // For chunked write, when not provided nullability, we assume the worst case scenario
+  // that all columns are nullable.
+  return write_mode == single_write_mode::NO or column->nullable();
+}
+
+} // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp
new file mode 100644
index 00000000000..a85411594e9
--- /dev/null
+++ b/cpp/src/io/parquet/writer_impl_helpers.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file writer_impl_helpers.hpp
+ * @brief Helper function declarations for Parquet writer
+ */
+
+#pragma once
+#include "io/comp/nvcomp_adapter.hpp"
+#include "parquet_common.hpp"
+
+#include
+#include
+
+namespace cudf::io::parquet::detail {
+
+/**
+ * @brief Function that translates GDF compression to parquet compression.
+ *
+ * @param compression The compression type
+ * @return The supported Parquet compression
+ */
+Compression to_parquet_compression(compression_type compression);
+
+/**
+ * @brief Function that translates the given compression codec to nvcomp compression type.
+ *
+ * @param codec Compression codec
+ * @return Translated nvcomp compression type
+ */
+nvcomp::compression_type to_nvcomp_compression_type(Compression codec);
+
+/**
+ * @brief Function that computes input alignment requirements for the given compression type.
+ *
+ * @param codec Compression codec
+ * @return Required alignment
+ */
+uint32_t page_alignment(Compression codec);
+
+/**
+ * @brief Gets the maximum compressed chunk size for the largest uncompressed chunk in the
+ * batch.
+ * + * @param codec Compression codec + * @param compression_blocksize Size of the largest uncompressed chunk in the batch + * @return Maximum compressed chunk size + */ +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); + +/** + * @brief Fill the table metadata with default column names. + * + * @param table_meta The table metadata to fill + */ +void fill_table_meta(table_input_metadata& table_meta); + +/** + * @brief Compute size (in bytes) of the data stored in the given column. + * + * @param column The input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The data size of the input + */ +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); + +/** + * @brief Indicates if the column should be marked as nullable in the output schema + * + * Returns `true` if the input column is nullable or if the write mode is not set to + * write the table all at once instead of chunked. + * + * @param column A view of the (linked) column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + ::cudf::io::detail::single_write_mode write_mode); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index a1f4c7b81d8..e07ebe25322 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -35,7 +35,7 @@ using cudf::test::iterators::no_nulls; template -void test_durations(mask_op_t mask_op, bool use_byte_stream_split) +void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); @@ -76,20 +76,27 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) auto filepath = temp_env->get_temp_filepath("Durations.parquet"); cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_arrow_schema(arrow_schema); + cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); auto durations_d_got = cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + if (arrow_schema) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, 
result.tbl->view().column(3)); @@ -98,10 +105,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) TEST_F(ParquetWriterTest, Durations) { - test_durations([](auto i) { return true; }, false); - test_durations([](auto i) { return (i % 2) != 0; }, false); - test_durations([](auto i) { return (i % 3) != 0; }, false); - test_durations([](auto i) { return false; }, false); + test_durations([](auto i) { return true; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); + + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -493,6 +505,50 @@ TEST_F(ParquetWriterTest, DecimalWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); } +TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .write_arrow_schema(true); + + cudf::io::table_input_metadata expected_metadata(table); + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION); + expected_metadata.column_metadata[1].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto expected_col1 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto expected_table = table_view({expected_col0, expected_col1}); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table); +} + TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { auto const unused_table = std::make_unique
(); @@ -1935,10 +1991,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit) TEST_F(ParquetWriterTest, DurationByteStreamSplit) { - test_durations([](auto i) { return true; }, true); - test_durations([](auto i) { return (i % 2) != 0; }, true); - test_durations([](auto i) { return (i % 3) != 0; }, true); - test_durations([](auto i) { return false; }, true); + test_durations([](auto i) { return true; }, true, false); + test_durations([](auto i) { return (i % 2) != 0; }, true, false); + test_durations([](auto i) { return (i % 3) != 0; }, true, false); + test_durations([](auto i) { return false; }, true, false); + + test_durations([](auto i) { return true; }, true, true); + test_durations([](auto i) { return (i % 2) != 0; }, true, true); + test_durations([](auto i) { return (i % 3) != 0; }, true, true); + test_durations([](auto i) { return false; }, true, true); } TEST_F(ParquetWriterTest, WriteFixedLenByteArray) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d1ec5be9e62..158fb6051c3 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -440,6 +440,7 @@ def write_parquet( object column_encoding=None, object column_type_length=None, object output_as_binary=None, + write_arrow_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -544,6 +545,7 @@ def write_parquet( .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) .build() ) if partitions_info is not None: @@ -623,6 +625,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. 
See Also -------- cudf.io.parquet.write_parquet @@ -641,6 +646,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows cdef size_t max_dictionary_size cdef cudf_io_types.dictionary_policy dict_policy + cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", @@ -649,7 +655,8 @@ cdef class ParquetWriter: int max_page_size_bytes=524288, int max_page_size_rows=20000, int max_dictionary_size=1048576, - bool use_dictionary=True): + bool use_dictionary=True, + bool store_schema=False): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -670,6 +677,7 @@ cdef class ParquetWriter: if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) + self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -788,6 +796,7 @@ cdef class ParquetWriter: .max_page_size_bytes(self.max_page_size_bytes) .max_page_size_rows(self.max_page_size_rows) .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) .build() ) args.set_dictionary_policy(self.dict_policy) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 0ef6553db56..c38f39f7749 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -78,6 +78,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_metadata( cudf_io_types.table_input_metadata m @@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_page_size_rows(size_type val) except + void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + cdef cppclass parquet_writer_options(parquet_writer_options_base): @@ -143,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: BuilderT& utc_timestamps( bool enabled ) except + + BuilderT& write_arrow_schema( + bool enabled + ) except + BuilderT& row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7733e770d99..fd0792b5edb 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -73,6 +73,7 @@ def _write_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + write_arrow_schema=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -110,6 +111,7 @@ def _write_parquet( "column_encoding": column_encoding, "column_type_length": column_type_length, "output_as_binary": output_as_binary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -154,6 +156,7 @@ def write_to_dataset( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
For each combination of partition group and value, @@ -242,6 +245,9 @@ def write_to_dataset( output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. """ fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -285,6 +291,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) else: @@ -312,6 +319,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) return metadata @@ -968,6 +976,7 @@ def to_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, *args, **kwargs, ): @@ -1023,6 +1032,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) partition_info = ( @@ -1055,6 +1065,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + write_arrow_schema=store_schema, ) else: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 588bc87d268..ff0c9040737 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1617,7 +1617,11 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True) + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + ) assert os.path.exists(gdf_fname) @@ -1789,10 +1793,11 @@ def test_parquet_write_bytes_io(simple_gdf): assert_eq(cudf.read_parquet(output), simple_gdf) -def test_parquet_writer_bytes_io(simple_gdf): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_bytes_io(simple_gdf, store_schema): output = BytesIO() - writer = ParquetWriter(output) + writer = ParquetWriter(output, store_schema=store_schema) writer.write_table(simple_gdf) writer.write_table(simple_gdf) writer.close() @@ -2124,7 +2129,8 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) -def test_parquet_write_to_dataset(tmpdir_factory, cols): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2140,7 +2146,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): "b": np.random.choice(np.arange(4), size=size), } ) - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) # Read back with cudf @@ -2156,7 +2162,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): } ) with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @pytest.mark.parametrize( @@ -2386,7 +2392,8 @@ def test_parquet_writer_list_large_mixed(tmpdir): 
assert_eq(expect, got) -def test_parquet_writer_list_chunked(tmpdir): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmpdir, store_schema): table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2407,7 +2414,7 @@ def test_parquet_writer_list_chunked(tmpdir): expect = cudf.concat([table1, table2]) expect = expect.reset_index(drop=True) - writer = ParquetWriter(fname) + writer = ParquetWriter(fname, store_schema=store_schema) writer.write_table(table1) writer.write_table(table2) writer.close() @@ -2542,6 +2549,10 @@ def normalized_equals(value1, value2): value1 = None if value2 is pd.NA or value2 is pd.NaT: value2 = None + if isinstance(value1, np.datetime64): + value1 = pd.Timestamp(value1).to_pydatetime() + if isinstance(value2, np.datetime64): + value2 = pd.Timestamp(value2).to_pydatetime() if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -2550,6 +2561,9 @@ def normalized_equals(value1, value2): value1 = value1.replace(tzinfo=None) if isinstance(value2, datetime.datetime): value2 = value2.replace(tzinfo=None) + if isinstance(value1, pd.Timedelta): + unit = "ms" if value1.unit == "s" else value1.unit + value2 = pd.Timedelta(value2, unit=unit) # if one is datetime then both values are datetimes now if isinstance(value1, datetime.datetime): @@ -2563,7 +2577,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2580,7 +2595,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): if add_nulls: for col in gdf: set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False) + gdf.to_parquet(file_path, index=False, store_schema=store_schema) # Read back from pyarrow pq_file = pq.ParquetFile(file_path) @@ -3205,7 +3220,8 @@ def test_parquet_writer_zstd(): assert_eq(expected, got) -def test_parquet_writer_time_delta_physical_type(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( { "s": cudf.Series([1], dtype="timedelta64[s]"), @@ -3217,22 +3233,35 @@ def test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=store_schema) got = pd.read_parquet(buffer) - expected = pd.DataFrame( - { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], - }, - dtype="str", - ) + + if store_schema: + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s": ["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) assert_eq(got.astype("str"), expected) -def test_parquet_roundtrip_time_delta(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): num_rows = 12345 df = cudf.DataFrame( { @@ -3255,10 +3284,11 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - 
df.to_parquet(buffer) - # TODO: Remove `check_dtype` once following issue is fixed in arrow: - # https://github.com/apache/arrow/issues/33321 + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) def test_parquet_reader_malformed_file(datadir): @@ -3420,35 +3450,87 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): # Check results for reader with schema assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() -def test_parquet_reader_roundtrip_structs_with_arrow_schema(): - # Ensure that the structs with duration types are faithfully being - # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame( - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } } } - } - ) + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): + # Ensure that the structs with duration types are faithfully being + # roundtripped across Parquet with arrow schema + pdf = pd.DataFrame({"struct": pd.Series(data)}) - # Reset the buffer and write parquet with arrow buffer = BytesIO() pdf.to_parquet(buffer, engine="pyarrow") @@ -3460,6 +3542,203 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + 
# Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results + assert_eq(expected, got) + + +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_with_arrow_schema(index): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + # Write to Parquet with arrow schema for faithful roundtrip + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Convert decimal types to d128 + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) + + # Read parquet with pyarrow, pandas and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + assert_eq(expected, got3) + + +def test_parquet_writer_int96_timestamps_and_arrow_schema(): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps and arrow_schema + # enabled should throw an exception. 
+ with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_structs_with_arrow_schema( + tmpdir, data, index +): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # Write expected data frame to Parquet with arrow schema + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0209c692935..76c7f2bfdb8 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -322,6 +322,12 @@ output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. +store_schema : bool, default False + If ``True``, writes arrow schema to Parquet file footer's key-value + metadata section to faithfully round-trip ``duration`` types with arrow. + This cannot be used with ``int96_timestamps`` enabled as int96 timestamps + are deprecated in arrow. Also, all decimal32 and decimal64 columns will be + converted to decimal128 as arrow only supports decimal128 and decimal256 types. **kwargs Additional parameters will be passed to execution engines other than ``cudf``. 
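As a quick, hedged illustration of the `store_schema` option documented in the `ioutils.py` hunk above (the column contents here are hypothetical, adapted from the tests in this patch):

```python
# Minimal sketch of the new `store_schema` flag; assumes a cudf build that
# contains this patch.
from io import BytesIO

import cudf
import pandas as pd

df = cudf.DataFrame(
    {"us": cudf.Series([None, 3456, None], dtype="timedelta64[us]")}
)

buffer = BytesIO()
# Embed the arrow schema in the Parquet footer's key-value metadata so the
# duration type round-trips faithfully. Per the docs above, this cannot be
# combined with int96_timestamps, and decimal32/decimal64 columns are
# written as decimal128.
df.to_parquet(buffer, store_schema=True)

got = cudf.read_parquet(buffer)  # duration dtype is preserved
pdf = pd.read_parquet(buffer)    # pandas/pyarrow recover it as well
```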
From f592e9c4bfcc2d8e887ad5f96e5167ee0ee2c73a Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Tue, 9 Jul 2024 21:00:57 -0700 Subject: [PATCH 035/101] Add groupby_max multi-threaded benchmark (#16154) This PR adds **groupby_max** multi-threaded benchmark. The benchmark runs multiple **max groupby aggregations** concurrently using one CUDA stream per host thread. Closes #16134 Authors: - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16154 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/groupby/group_max.cpp | 16 ++- .../groupby/group_max_multithreaded.cpp | 102 ++++++++++++++++++ 3 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_max_multithreaded.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a5b248135c1..ff431c7f260 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -231,8 +231,8 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp - groupby/group_struct_keys.cpp + GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 01ca23ebbf8..f41285008c4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state, cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); + auto const num_aggregations = state.get_int64("num_aggregations"); + auto keys_view = keys->view(); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - requests[0].values = vals->view(); - requests[0].aggregations.push_back(cudf::make_max_aggregation()); + for (int64_t i = 0; i < num_aggregations; i++) { + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[i].values = vals->view(); + requests[i].aggregations.push_back(cudf::make_max_aggregation()); + } auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_element_count( + static_cast(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } @@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, .set_name("groupby_max") .add_int64_axis("cardinality", {0}) .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) - .add_float64_axis("null_probability", {0, 0.1, 0.9}); + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32}); 
NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp new file mode 100644 index 00000000000..3b8faba618f --- /dev/null +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_aggregations = state.get_int64("num_aggregations"); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const vals = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + auto keys_view = keys->view(); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + + std::vector> requests(num_threads); + for (auto& thread_requests : requests) { + for (int64_t j = 0; j < num_aggregations; j++) { + thread_requests.emplace_back(); + thread_requests.back().values = vals->view(); + thread_requests.back().aggregations.push_back( + cudf::make_max_aggregation()); + } + } + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); }; + timer.start(); + for (int64_t i = 0; i < num_threads; ++i) { + threads.submit(perform_agg, i); + } + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + timer.stop(); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count( + static_cast(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000., + "Mrows/s"); + state.add_buffer_size( + 
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_max_multithreaded") + .add_int64_axis("cardinality", {0}) + .add_int64_power_of_two_axis("num_rows", {12, 18}) + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1}) + .add_int64_axis("num_threads", {1, 2, 4, 8}); From 7b8169ab1b042b790622c0178bc41e3045a99305 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:57:41 -0500 Subject: [PATCH 036/101] Disable dict support for split-page kernel in the parquet reader. (#16128) Dictionary support for this particular flavor of kernel was being compiled in. Harmless, but caused an unneeded increase in shared memory usage. This PR disables it. Authors: - https://github.com/nvdbaranec Approvers: - Ed Seidl (https://github.com/etseidl) - Paul Mattione (https://github.com/pmattione-nvidia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16128 --- cpp/src/io/parquet/decode_fixed.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ea80ae73c2f..8a866141c4b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -792,7 +792,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -801,7 +801,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -812,7 +812,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -821,7 +821,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( From 11a5174a4639f980e9502b1eccc39ee3b4587d11 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 10 Jul 2024 14:10:24 -0400 Subject: [PATCH 037/101] Promote IO support queries to cudf API (#16125) Promote the ability to query the status of cufile and nvcomp support to the public API. It seems like these kind of questions would want to be asked by external users. 
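As a hedged sketch of what this promotion enables (not part of the patch; the header paths follow the new files added under cpp/include/cudf/io/ below, and the printout is illustrative):

```cpp
// Querying IO feature support through the now-public API; assumes a libcudf
// build that contains this patch.
#include <cudf/io/config_utils.hpp>
#include <cudf/io/nvcomp_adapter.hpp>

#include <iostream>

int main()
{
  // cuFile/kvikIO status queries, previously only reachable via cudf::io::detail
  std::cout << "kvikIO enabled: " << cudf::io::cufile_integration::is_kvikio_enabled() << '\n';
  std::cout << "GDS enabled:    " << cudf::io::cufile_integration::is_gds_enabled() << '\n';

  // nvCOMP checks report the reason a codec is disabled, or std::nullopt
  auto const reason =
    cudf::io::nvcomp::is_compression_disabled(cudf::io::nvcomp::compression_type::ZSTD);
  if (reason) { std::cout << "ZSTD compression disabled: " << *reason << '\n'; }
  return 0;
}
```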
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16125 --- cpp/include/cudf/io/config_utils.hpp | 53 +++++++++ cpp/include/cudf/io/nvcomp_adapter.hpp | 106 ++++++++++++++++++ cpp/include/cudf/utilities/logger.hpp | 8 +- cpp/src/io/comp/nvcomp_adapter.cpp | 8 +- cpp/src/io/comp/nvcomp_adapter.hpp | 67 +---------- cpp/src/io/orc/reader_impl_decode.cu | 2 +- cpp/src/io/orc/stripe_enc.cu | 2 +- cpp/src/io/orc/writer_impl.cu | 1 + cpp/src/io/parquet/reader_impl_chunking.cu | 6 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 2 +- cpp/src/io/utilities/config_utils.cpp | 8 +- cpp/src/io/utilities/data_sink.cpp | 5 +- cpp/src/io/utilities/datasource.cpp | 7 +- cpp/src/io/utilities/file_io_utilities.cpp | 4 +- .../{config_utils.hpp => getenv_or.hpp} | 42 +------ 16 files changed, 199 insertions(+), 124 deletions(-) create mode 100644 cpp/include/cudf/io/config_utils.hpp create mode 100644 cpp/include/cudf/io/nvcomp_adapter.hpp rename cpp/src/io/utilities/{config_utils.hpp => getenv_or.hpp} (63%) diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp new file mode 100644 index 00000000000..1827ba0e3e6 --- /dev/null +++ b/cpp/include/cudf/io/config_utils.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace CUDF_EXPORT cudf { +namespace io::cufile_integration { + +/** + * @brief Returns true if cuFile and its compatibility mode are enabled. + */ +bool is_always_enabled(); + +/** + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). + */ +bool is_gds_enabled(); + +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + +} // namespace io::cufile_integration + +namespace io::nvcomp_integration { + +/** + * @brief Returns true if all nvCOMP uses are enabled. + */ +bool is_all_enabled(); + +/** + * @brief Returns true if stable nvCOMP use is enabled. + */ +bool is_stable_enabled(); + +} // namespace io::nvcomp_integration +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp new file mode 100644 index 00000000000..f3260d0cb53 --- /dev/null +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::nvcomp { + +enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; + +/** + * @brief Set of parameters that impact whether nvCOMP features are enabled. + * + */ +struct feature_status_parameters { + int lib_major_version; ///< major version + int lib_minor_version; ///< minor version + int lib_patch_version; ///< patch version + bool are_all_integrations_enabled; ///< all integrations + bool are_stable_integrations_enabled; ///< stable integrations + int compute_capability_major; ///< cuda compute major version + + /** + * @brief Default Constructor + */ + feature_status_parameters(); + + /** + * @brief feature_status_parameters Constructor + * + * @param major positive integer representing major value of nvcomp + * @param minor positive integer representing minor value of nvcomp + * @param patch positive integer representing patch value of nvcomp + * @param all_enabled if all integrations are enabled + * @param stable_enabled if stable integrations are enabled + * @param cc_major CUDA compute capability + */ + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. + */ +inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) +{ + return lhs.lib_major_version == rhs.lib_major_version and + lhs.lib_minor_version == rhs.lib_minor_version and + lhs.lib_patch_version == rhs.lib_patch_version and + lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and + lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and + lhs.compute_capability_major == rhs.compute_capability_major; +} + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. + * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. 
+ * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +} // namespace io::nvcomp +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index a39df064f44..45d5d1b12e1 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Returns the global logger. @@ -43,4 +45,4 @@ namespace cudf { */ spdlog::logger& logger(); -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index f8920bf82c2..0e34c96debd 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include "nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "nvcomp_adapter.cuh" +#include +#include #include #include @@ -472,8 +474,8 @@ feature_status_parameters::feature_status_parameters() : lib_major_version{NVCOMP_MAJOR_VERSION}, lib_minor_version{NVCOMP_MINOR_VERSION}, lib_patch_version{NVCOMP_PATCH_VERSION}, - are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, - are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} + are_all_integrations_enabled{nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()} { int device; CUDF_CUDA_TRY(cudaGetDevice(&device)); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 1a680a050fd..43c79e32375 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -17,8 +17,9 @@ #pragma once #include "gpuinflate.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -27,70 +28,6 @@ #include namespace cudf::io::nvcomp { - -enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; - -/** - * @brief Set of parameters that impact whether the use nvCOMP features is enabled. - */ -struct feature_status_parameters { - int lib_major_version; - int lib_minor_version; - int lib_patch_version; - bool are_all_integrations_enabled; - bool are_stable_integrations_enabled; - int compute_capability_major; - - feature_status_parameters(); - feature_status_parameters( - int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) - : lib_major_version{major}, - lib_minor_version{minor}, - lib_patch_version{patch}, - are_all_integrations_enabled{all_enabled}, - are_stable_integrations_enabled{stable_enabled}, - compute_capability_major{cc_major} - { - } -}; - -/** - * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. 
- */ -inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) -{ - return lhs.lib_major_version == rhs.lib_major_version and - lhs.lib_minor_version == rhs.lib_minor_version and - lhs.lib_patch_version == rhs.lib_patch_version and - lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and - lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and - lhs.compute_capability_major == rhs.compute_capability_major; -} - -/** - * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. - * - * Result cab depend on nvCOMP version and environment variables. - * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_compression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - -/** - * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. - * - * Result can depend on nvCOMP version and environment variables. - * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_decompression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - /** * @brief Device batch decompression of given type. * diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 72eb41b1360..8e20505d3ff 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -19,13 +19,13 @@ #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/hostdevice_span.hpp" #include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b6fc4e3510f..805959327ac 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e9e031a407a..4cb20bb7518 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index d371ef5de93..3da303e6928 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -16,7 +16,6 @@ #include "compact_protocol_reader.hpp" #include "io/comp/nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" @@ -25,6 +24,7 @@ #include #include #include +#include #include @@ -862,7 +862,7 @@ std::vector compute_page_splits_by_row(device_span #include #include +#include #include #include #include diff --git 
a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 0e3ce779089..badcd3f58f9 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" -#include "io/utilities/config_utils.hpp" #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 20ac89b4d53..a3afbd52896 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -14,14 +14,16 @@ * limitations under the License. */ -#include "config_utils.hpp" +#include "getenv_or.hpp" +#include #include #include +#include #include -namespace cudf::io::detail { +namespace cudf::io { namespace cufile_integration { @@ -80,4 +82,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace cudf::io::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index a6cbbcd84a6..1dbb9369115 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -15,8 +15,9 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -40,7 +41,7 @@ class file_sink : public data_sink { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ca8932322bf..c8a438fc40b 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,9 +15,10 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include #include +#include #include #include #include @@ -44,7 +45,7 @@ class file_source : public datasource { explicit file_source(char const* filepath) : _file(filepath, O_RDONLY) { detail::force_init_cuda_context(); - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); @@ -433,7 +434,7 @@ std::unique_ptr datasource::create(std::string const& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_integration::is_always_enabled()) { + if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index a9d4f19c848..9fe5959436d 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -16,9 +16,11 @@ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include "getenv_or.hpp" #include +#include +#include #include diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/getenv_or.hpp similarity index 63% rename from cpp/src/io/utilities/config_utils.hpp rename to cpp/src/io/utilities/getenv_or.hpp index 74df1375e6f..3fd97a00b61 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,15 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include +#include #include #include -namespace cudf::io::detail { - +namespace { /** * @brief Returns the value of the environment variable, or a default value if the variable is not * present. @@ -45,37 +46,4 @@ T getenv_or(std::string_view env_var_name, T default_val) return converted_val; } -namespace cufile_integration { - -/** - * @brief Returns true if cuFile and its compatibility mode are enabled. - */ -bool is_always_enabled(); - -/** - * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). - */ -bool is_gds_enabled(); - -/** - * @brief Returns true if KvikIO is enabled. - */ -bool is_kvikio_enabled(); - -} // namespace cufile_integration - -namespace nvcomp_integration { - -/** - * @brief Returns true if all nvCOMP uses are enabled. - */ -bool is_all_enabled(); - -/** - * @brief Returns true if stable nvCOMP use is enabled. 
- */ -bool is_stable_enabled(); - -} // namespace nvcomp_integration - -} // namespace cudf::io::detail +} // namespace From 261f911958ee9ad76109953dfc920c08da4c6fe6 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:02:44 -0400 Subject: [PATCH 038/101] Migrate lists/extract to pylibcudf (#16071) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16071 --- python/cudf/cudf/_lib/lists.pyx | 36 ++++++------------- .../_lib/pylibcudf/libcudf/lists/extract.pxd | 6 ++-- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 6 ++++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 31 +++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 21 +++++++++++ 5 files changed, 70 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0ad09dba717..ceae1b148aa 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -8,11 +8,9 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) @@ -116,37 +114,23 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def extract_element_column(Column col, Column index): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index.to_pylibcudf(mode="read"), + ) ) - cdef column_view index_view = index.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index_view)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def contains_scalar(Column col, py_search_key): diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd index caa12f41914..53609ba8830 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd @@ -11,10 +11,10 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] extract_list_element( - const lists_column_view, + const lists_column_view&, size_type ) except + cdef unique_ptr[column] extract_list_element( - const lists_column_view, - column_view + const lists_column_view&, + const 
column_view& ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index c9c43751a43..38a479e4791 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -12,6 +12,10 @@ ctypedef fused ColumnOrScalar: Column Scalar +ctypedef fused ColumnOrSizeType: + Column + size_type + cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) @@ -27,3 +31,5 @@ cpdef Column index_of(Column, ColumnOrScalar, bool) cpdef Column reverse(Column) cpdef Column segmented_gather(Column, Column) + +cpdef Column extract_list_element(Column, ColumnOrSizeType) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 9c56f1139c6..19c961aa014 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -17,9 +17,12 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.extract cimport ( + extract_list_element as cpp_extract_list_element, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.lists cimport ColumnOrScalar +from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView from .scalar cimport Scalar @@ -264,3 +267,29 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): list_view2.view(), )) return Column.from_libcudf(move(c_result)) + + +cpdef Column extract_list_element(Column input, ColumnOrSizeType index): + """Create a column of extracted list elements. + + Parameters + ---------- + input : Column + The input column. + index : Union[Column, size_type] + The selection index or indices. + + Returns + ------- + Column + A new Column with elements extracted. 
+ """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_extract_list_element( + list_view.view(), + index.view() if ColumnOrSizeType is Column else index, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 0d95579acb3..07ecaed5012 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -160,3 +160,24 @@ def test_segmented_gather(test_data): expect = pa.array([[8, 9], [14], [0], [0, 0]]) assert_column_eq(expect, res) + + +def test_extract_list_element_scalar(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.extract_list_element(plc_column, 0) + expect = pa.compute.list_element(test_data[0][0], 0) + + assert_column_eq(expect, res) + + +def test_extract_list_element_column(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1])) + + res = plc.lists.extract_list_element(plc_column, indices) + expect = pa.array([0, None, None, 7]) + + assert_column_eq(expect, res) From 64e3e8d4259eff85a4d2708333b0cfb43a3e79e3 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 10 Jul 2024 14:23:00 -0700 Subject: [PATCH 039/101] Remove `mr` param from `write_csv` and `write_json` (#16231) Fixes #16200 Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16231 --- cpp/include/cudf/io/csv.hpp | 4 +- cpp/include/cudf/io/detail/csv.hpp | 4 +- cpp/include/cudf/io/detail/json.hpp | 4 +- cpp/include/cudf/io/json.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 6 +-- cpp/src/io/functions.cpp | 14 ++----- cpp/src/io/json/write_json.cu | 8 ++-- cpp/tests/io/json_test.cpp | 4 +- cpp/tests/io/json_writer.cpp | 64 +++++++++-------------------- 9 files changed, 34 insertions(+), 78 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 68bb7fba00e..cc361f0918e 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1756,11 +1756,9 @@ class csv_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 50c1a7c163d..2a70fa888f4 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -49,14 +49,12 @@ table_with_metadata read_csv(std::unique_ptr&& source, * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. 
- * @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, host_span column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 540a584908d..6ff1c12831b 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -46,13 +46,11 @@ table_with_metadata read_json(host_span> sources, * @param table The set of columns * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation */ void write_json(data_sink* sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); /** * @brief Normalize single quotes to double quotes using FST diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8de690482f9..7af90766ad0 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1018,11 +1018,9 @@ class json_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 63eb0b03c5f..00a6dcb2286 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -430,13 +430,13 @@ void write_csv(data_sink* out_sink, table_view const& table, host_span user_column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); + write_chunked_begin( + out_sink, table, user_column_names, options, stream, rmm::mr::get_current_device_resource()); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b4ece9cec66..6d2834206d4 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -215,9 +215,7 @@ table_with_metadata read_json(json_reader_options options, return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_json(json_writer_options const& options, rmm::cuda_stream_view stream) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -226,8 +224,7 @@ void write_json(json_writer_options const& options, sinks[0].get(), options.get_table(), options, - stream, - mr); + stream); } table_with_metadata read_csv(csv_reader_options 
options, @@ -252,9 +249,7 @@ table_with_metadata read_csv(csv_reader_options options, } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_csv(csv_writer_options const& options, rmm::cuda_stream_view stream) { using namespace cudf::io::detail; @@ -266,8 +261,7 @@ void write_csv(csv_writer_options const& options, options.get_table(), options.get_names(), options, - stream, - mr); + stream); } raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 997d6fd99f8..c688c809e04 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -805,8 +805,7 @@ void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, int const skip_last_chars, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); @@ -829,8 +828,7 @@ void write_chunked(data_sink* out_sink, void write_json(data_sink* out_sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); std::vector user_column_names = [&]() { @@ -912,7 +910,7 @@ void write_json(data_sink* out_sink, bool const include_line_terminator = (&sub_view != &vector_views.back()) or options.is_enabled_lines(); auto const skip_last_chars = (include_line_terminator ? 0 : line_terminator.size()); - write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream); } } else { if (options.is_enabled_lines()) { diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9c76c344157..993ab82f423 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1400,9 +1400,7 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); cudf::column_view int16_with_mask(repeat_times); cudf::column_view int16( diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 946b939f456..2c4e29a01b9 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -51,16 +51,14 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -68,8 +66,7 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; 
out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -94,22 +91,17 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -131,9 +123,7 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -163,9 +153,7 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -197,9 +185,7 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -232,8 +218,7 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -308,8 +293,7 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), 
out_buffer.size()}) @@ -332,8 +316,7 @@ TEST_F(JsonWriterTest, WriteReadNested) // without column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -371,8 +354,7 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} @@ -405,9 +387,7 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -446,9 +426,7 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -504,9 +482,7 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({} {"e":1} {"d":1} @@ -568,9 +544,7 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} From 3c83ce451446dfd556bd14ad8537b0189226a0e5 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:58:31 -0600 Subject: [PATCH 040/101] New Decimal <--> Floating conversion (#15905) This PR contains the main algorithm for the new decimal <--> floating conversion code. This algorithm was written to address the precision issues described [here](https://github.com/rapidsai/cudf/issues/14169). ### Summary * The new algorithm is more accurate than the previous code, but it is also far more complex. * It can perform conversions that were not even possible in the old code due to overflow (decimal32/64/128 conversions only worked for scale factors up to 10^9/18/38, respectively). Now the entire floating-point range is convertible, including denormals. * This new algorithm is significantly faster in some parts of the conversion phase-space, and in some parts slightly slower. 
### Previous PRs
These contain the supporting parts of this work:
* [Explicit conversion PR](https://github.com/rapidsai/cudf/pull/15438)
* [Benchmarking PR](https://github.com/rapidsai/cudf/pull/15334)
* [Powers-of-10 PR](https://github.com/rapidsai/cudf/pull/15353)
* [Utilities PR](https://github.com/rapidsai/cudf/pull/15359). These utilities are updated here to support denormals.

### Algorithm Outline
We convert floating -> (integer) decimal by:
* Extract the floating-point mantissa (converted to integer) and power-of-2
* For float we use a uint64 to hold our data during the shifting/scaling below; for double, a uint128_t
* In this shifting integer, we alternately apply the extracted powers-of-2 (bit-shifts, until they're all used) and scale-factor powers-of-10 (multiply/divide) as needed to reach the desired scale factor (a rough standalone sketch of this loop appears at the end of this description).

Decimal -> floating is just the reverse operation.

### Supplemental Changes
* Testing: Add decimal128, add precise-conversion tests. Remove kludges due to inaccurate conversions. Add test for zeroes.
* Benchmarking: Enable regions of conversion phase-space for benchmarking that were not possible in the old algorithm.
* Unary: Cleanup by using CUDF_ENABLE_IF. Call new conversion code for base-10 fixed-point.

### Performance for various conversions/input-ranges
* Note: F32/F64 denote float/double

New algorithm is **FASTER** by:
* F64 --> decimal64: 60% for E8 --> E15
* F64 --> decimal128: 13% for E-8 --> E-15
* F64 --> decimal128: 22% for E8 --> E15
* F64 --> decimal128: 27% for E31 --> E38
* decimal32 --> F64: 18% for E-3 --> E4
* decimal64 --> F64: 27% for E-14 --> E-7
* decimal64 --> F64: 17% for E-3 --> E4
* decimal128 --> F64: 21% for E-14 --> E-7
* decimal128 --> F64: 11% for E-3 --> E4
* decimal128 --> F64: 13% for E31 --> E38

New algorithm is **SLOWER** by:
* F32 --> decimal32: 3% for E-3 --> E4
* F32 --> decimal64: 2% for E-14 --> E14
* F64 --> decimal32: 3% for E-3 --> E4
* decimal32 --> F32: 5% for E-3 --> E4
* decimal128 --> F64: 36% for E-37 --> E-30

Other kernels:
* The PYMOD binary-op benchmark is 7% slower.

### Performance discussion
* Many conversions run at identical speed, indicating these algorithms are often fast enough that we are instead bottlenecked on overheads such as getting the input to the GPU in the first place.
* F64 conversions are often much faster than the old algorithm because the new algorithm completely avoids the FP64 pipeline. Other than the cast to double itself, all of the operations are on integers, so threads aren't competing with each other and taking turns for access to the floating-point cores.
* The conversions are slightly slower for floats with powers-of-10 near zero. Presumably this is due to code overhead, e.g. handling a large range of inputs, UB-checks for bit shifts, branches for denormals, etc.
* Conversions are slower for decimal128 with very small exponents, which require several large divisions (128-bit divided by 64-bit).
* The PYMOD kernel is slower due to register pressure from the division routines introduced in the earlier PR. Even though this benchmark does not perform decimal <--> floating conversions, it gets hit because inlined template code in the kernel increases the code/register pressure.
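To make the shifting loop in the outline concrete, here is a rough, host-only sketch of the idea. It is an illustration under simplifying assumptions (positive double input, scale <= 0, result assumed to fit in 128 bits; `floating_to_decimal_sketch` is a hypothetical name), not this PR's code, which also lines the value up at the top of the shifting integer, guards all shifts against UB, and adds half an ulp when the requested scale truncates:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Compute trunc(value * 10^(-scale)) using only integer ops after extracting
// the mantissa, alternating powers-of-2 and powers-of-10 as outlined above
__uint128_t floating_to_decimal_sketch(double value, int scale)
{
  // value = frac * 2^exp2 with frac in [0.5, 1); turn frac into a 53-bit integer
  int exp2 = 0;
  double const frac = std::frexp(value, &exp2);
  __uint128_t rep   = static_cast<uint64_t>(std::ldexp(frac, 53));
  int pow2          = exp2 - 53;  // powers of 2 still owed (negative unless value >= 2^53)
  int pow10_mag     = -scale;     // powers of 10 still owed (multiplies, since scale <= 0)

  // Alternate: multiply by at most 10^9 (~30 bits), then pay down up to 30 bits
  // of any owed right-shift, keeping headroom in the 128-bit shifting integer
  while (pow10_mag > 0) {
    int dig = pow10_mag < 9 ? pow10_mag : 9;
    pow10_mag -= dig;
    while (dig-- > 0) { rep *= 10; }
    int const bits = pow2 < 0 ? (-pow2 < 30 ? -pow2 : 30) : 0;
    rep >>= bits;
    pow2 += bits;
  }
  // Apply whatever powers of 2 remain
  if (pow2 < 0) { rep >>= -pow2; } else { rep <<= pow2; }
  return rep;
}

int main()
{
  // Exactly representable input: 1.5 * 10^4 -> 15000
  std::printf("%llu\n", (unsigned long long)floating_to_decimal_sketch(1.5, -4));
  // 1.2 is stored as 1.1999999..., so naive truncation at scale -1 gives 11;
  // the half-ulp-on-truncation step in this PR is what repairs this to 12
  std::printf("%llu\n", (unsigned long long)floating_to_decimal_sketch(1.2, -1));
}
```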
Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Jason Lowe (https://github.com/jlowe) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15905 --- cpp/benchmarks/decimal/convert_floating.cpp | 17 - .../cudf/fixed_point/floating_conversion.hpp | 964 +++++++++++++++--- cpp/include/cudf/unary.hpp | 35 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 129 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 6 +- python/cudf/cudf/tests/test_decimal.py | 2 +- 6 files changed, 958 insertions(+), 195 deletions(-) diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp index a367036c494..ac09c3400cb 100644 --- a/cpp/benchmarks/decimal/convert_floating.cpp +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -32,8 +32,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list || std::is_same_v; - static constexpr bool is_32bit = - std::is_same_v || std::is_same_v; static constexpr bool is_128bit = std::is_same_v || std::is_same_v; @@ -69,21 +67,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list decimal conversion algorithm is limited - static constexpr bool is_64bit = !is_32bit && !is_128bit; - if (is_32bit && (exp_mode != 3)) { - state.skip("Decimal32 conversion only works up to scale factors of 10^9."); - return; - } - if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { - state.skip("Decimal64 conversion only works up to scale factors of 10^18."); - return; - } - if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { - state.skip("Decimal128 conversion only works up to scale factors of 10^38."); - return; - } - // Type IDs auto const input_id = cudf::type_to_id(); auto const output_id = cudf::type_to_id(); diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index 2c3a5c5629d..c64ae8877d4 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -18,6 +18,7 @@ #include +#include #include #include @@ -34,6 +35,49 @@ namespace numeric { namespace detail { +/** + * @brief Determine the number of significant bits in an integer + * + * @tparam T Type of input integer value. 
Must be either uint32_t, uint64_t, or __uint128_t + * @param value The integer whose bits are being counted + * @return The number of significant bits: the # of bits - # of leading zeroes + */ +template || std::is_same_v || + std::is_same_v)> +CUDF_HOST_DEVICE inline int count_significant_bits(T value) +{ +#ifdef __CUDA_ARCH__ + if constexpr (std::is_same_v) { + return 64 - __clzll(static_cast(value)); + } else if constexpr (std::is_same_v) { + return 32 - __clz(static_cast(value)); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + auto const low_bits = static_cast(value); + return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); + } +#else + // Undefined behavior to call __builtin_clzll() with zero in gcc and clang + if (value == 0) { return 0; } + + if constexpr (std::is_same_v) { + return 64 - __builtin_clzll(value); + } else if constexpr (std::is_same_v) { + return 32 - __builtin_clz(value); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + if (high_bits == 0) { + return 64 - __builtin_clzll(static_cast(value)); + } else { + return 128 - __builtin_clzll(high_bits); + } + } +#endif +} + /** * @brief Helper struct for getting and setting the components of a floating-point value * @@ -62,27 +106,28 @@ struct floating_converter { // The low 23 / 52 bits (for float / double) are the mantissa. // The mantissa is normalized. There is an understood 1 bit to the left of the binary point. // The value of the mantissa is in the range [1, 2). - /// # mantissa bits (-1 for understood bit) - static constexpr int num_mantissa_bits = cuda::std::numeric_limits::digits - 1; + /// # significand bits (includes understood bit) + static constexpr int num_significand_bits = cuda::std::numeric_limits::digits; + /// # stored mantissa bits (-1 for understood bit) + static constexpr int num_stored_mantissa_bits = num_significand_bits - 1; /// The mask for the understood bit - static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits); + static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits); /// The mask to select the mantissa static constexpr IntegralType mantissa_mask = understood_bit_mask - 1; // And in between are the bits used to store the biased power-of-2 exponent. /// # exponents bits (-1 for sign bit) - static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1; + static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1; /// The mask for the exponents, unshifted static constexpr IntegralType unshifted_exponent_mask = (IntegralType(1) << num_exponent_bits) - 1; /// The mask to select the exponents - static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits; + static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits; // To store positive and negative exponents as unsigned values, the stored value for // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles. 
/// 127 / 1023 for float / double - static constexpr IntegralType exponent_bias = - cuda::std::numeric_limits::max_exponent - 1; + static constexpr int exponent_bias = cuda::std::numeric_limits::max_exponent - 1; /** * @brief Reinterpret the bits of a floating-point value as an integer @@ -113,15 +158,15 @@ struct floating_converter { } /** - * @brief Extracts the integral significand of a bit-casted floating-point number + * @brief Checks whether the bit-casted floating-point value is +/-0 * - * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The integral significand, bit-shifted to a (large) whole number + * @param integer_rep The bit-casted floating value to check if is +/-0 + * @return True if is a zero, else false */ - CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep) { - // Extract the significand, setting the high bit for the understood 1/2 - return (integer_rep & mantissa_mask) | understood_bit_mask; + // It's a zero if every non-sign bit is zero + return ((integer_rep & ~sign_mask) == 0); } /** @@ -137,40 +182,59 @@ struct floating_converter { } /** - * @brief Extracts the exponent of a bit-casted floating-point number + * @brief Extracts the significand and exponent of a bit-casted floating-point number, + * shifted for denormals. * - * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals - * For all of these cases, the decimal fixed_point number should be set to zero + * @note Zeros/inf/NaN not handled. * * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The stored base-2 exponent, or INT_MIN for special values + * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + IntegralType integer_rep) { - // First extract the exponent bits and handle its special values. - // To minimize branching, all of these special cases will return INT_MIN. - // For all of these cases, the decimal fixed_point number should be set to zero. + // Extract the significand + auto significand = (integer_rep & mantissa_mask); + + // Extract the exponent bits. auto const exponent_bits = integer_rep & exponent_mask; + + // Notes on special values of exponent_bits: + // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input. + // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller). + int floating_pow2; if (exponent_bits == 0) { - // Because of the understood set-bit not stored in the mantissa, it is not possible - // to store the value zero directly. Instead both +/-0 and denormals are represented with - // the exponent bits set to zero. - // Thus it's fastest to just floor (generally unwanted) denormals to zero. - return INT_MIN; - } else if (exponent_bits == exponent_mask) { - //+/-inf and NaN values are stored with all of the exponent bits set. - // As none of these are representable by integers, we'll return the same value for all cases. - return INT_MIN; + // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i) + // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1), + // and B_i is the value of that bit (0 or 1) + // So e.g. 
for the minimum denormal, only the lowest bit is set: + // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149 + // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074 + floating_pow2 = 1 - exponent_bias; + + // Line-up denormal to same (understood) bit as normal numbers + // This is so bit-shifting starts at the same bit index + auto const lineup_shift = num_significand_bits - count_significant_bits(significand); + significand <<= lineup_shift; + floating_pow2 -= lineup_shift; + } else { + // Extract the exponent value: shift the bits down and subtract the bias. + auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits; + floating_pow2 = static_cast(shifted_exponent_bits) - exponent_bias; + + // Set the high bit for the understood 1/2 + significand |= understood_bit_mask; } - // Extract the exponent value: shift the bits down and subtract the bias. - using SignedIntegralType = cuda::std::make_signed_t; - SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits; - return shifted_exponent_bits - static_cast(exponent_bias); + // To convert the mantissa to an integer, we effectively applied #-mantissa-bits + // powers of 2 to convert the fractional value to an integer, so subtract them off here + int const pow2 = floating_pow2 - num_stored_mantissa_bits; + + return {significand, pow2}; } /** - * @brief Sets the sign bit of a positive floating-point number + * @brief Sets the sign bit of a floating-point number * * @param floating The floating-point value to set the sign of. Must be positive. * @param is_negative The sign bit to set for the floating-point number @@ -192,83 +256,60 @@ struct floating_converter { /** * @brief Adds to the base-2 exponent of a floating-point number * + * @note The caller must guarantee that the input is a positive (> 0) whole number. + * * @param floating The floating value to add to the exponent of. Must be positive. - * @param exp2 The power-of-2 to add to the floating-point number - * @return The input floating-point value * 2^exp2 + * @param pow2 The power-of-2 to add to the floating-point number + * @return The input floating-point value * 2^pow2 */ - CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2) + CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2) { + // Note that the input floating-point number is positive (& whole), so we don't have to + // worry about the sign here; the sign will be set later in set_is_negative() + // Convert floating to integer auto integer_rep = bit_cast_to_integer(floating); // Extract the currently stored (biased) exponent + using SignedType = std::make_signed_t; auto exponent_bits = integer_rep & exponent_mask; - auto stored_exp2 = exponent_bits >> num_mantissa_bits; + auto stored_pow2 = static_cast(exponent_bits >> num_stored_mantissa_bits); // Add the additional power-of-2 - stored_exp2 += exp2; + stored_pow2 += pow2; // Check for exponent over/under-flow. - // Note that the input floating-point number is always positive, so we don't have to - // worry about the sign here; the sign will be set later in set_is_negative() - if (stored_exp2 <= 0) { - return 0.0; - } else if (stored_exp2 >= unshifted_exponent_mask) { + if (stored_pow2 <= 0) { + // Denormal (zero handled prior to input) + + // Early out if bit shift will zero it anyway. 
+ // Note: We must handle this explicitly, as too-large a bit-shift is UB + auto const bit_shift = -stored_pow2 + 1; //+1 due to understood bit set below + if (bit_shift > num_stored_mantissa_bits) { return 0.0; } + + // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit) + integer_rep &= (~exponent_mask); + + // The input floating-point number has an "understood" bit that we need to set + // prior to bit-shifting. Set the understood bit. + integer_rep |= understood_bit_mask; + + // Convert to denormal: bit shift off the low bits + integer_rep >>= bit_shift; + } else if (stored_pow2 >= static_cast(unshifted_exponent_mask)) { + // Overflow: Set infinity return cuda::std::numeric_limits::infinity(); } else { - // Clear existing exponent bits and set new ones - exponent_bits = stored_exp2 << num_mantissa_bits; + // Normal number: Clear existing exponent bits and set new ones + exponent_bits = static_cast(stored_pow2) << num_stored_mantissa_bits; integer_rep &= (~exponent_mask); integer_rep |= exponent_bits; - - // Convert back to float - return bit_cast_to_floating(integer_rep); } - } -}; -/** - * @brief Determine the number of significant bits in an integer - * - * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t - * @param value The integer whose bits are being counted - * @return The number of significant bits: the # of bits - # of leading zeroes - */ -template || std::is_same_v || - std::is_same_v)> -CUDF_HOST_DEVICE inline int count_significant_bits(T value) -{ -#ifdef __CUDA_ARCH__ - if constexpr (std::is_same_v) { - return 64 - __clzll(static_cast(value)); - } else if constexpr (std::is_same_v) { - return 32 - __clz(static_cast(value)); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - auto const low_bits = static_cast(value); - return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); - } -#else - // Undefined behavior to call __builtin_clzll() with zero in gcc and clang - if (value == 0) { return 0; } - - if constexpr (std::is_same_v) { - return 64 - __builtin_clzll(value); - } else if constexpr (std::is_same_v) { - return 32 - __builtin_clz(value); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - if (high_bits == 0) { - return 64 - __builtin_clzll(static_cast(value)); - } else { - return 128 - __builtin_clzll(high_bits); - } + // Convert back to float + return bit_cast_to_floating(integer_rep); } -#endif -} +}; /** * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an @@ -276,18 +317,18 @@ CUDF_HOST_DEVICE inline int count_significant_bits(T value) * * @note Intended to be run at compile time. * - * @tparam Exp10 The power of 10 to calculate - * @return Returns 10^Exp10 + * @tparam Pow10 The power of 10 to calculate + * @return Returns 10^Pow10 */ -template +template constexpr __uint128_t large_power_of_10() { // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10. 
- static_assert(Exp10 >= 19); - if constexpr (Exp10 == 19) + static_assert(Pow10 >= 19); + if constexpr (Pow10 == 19) return __uint128_t(10000000000000000000ULL); else - return large_power_of_10() * __uint128_t(10); + return large_power_of_10() * __uint128_t(10); } /** @@ -295,11 +336,11 @@ constexpr __uint128_t large_power_of_10() * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 9 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10) { // Computing division this way is much faster than the alternatives. // Division is not implemented in GPU hardware, and the compiler will often implement it as a @@ -309,7 +350,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // Instead, if the compiler can see exactly what number it is dividing by, it can // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc. - // For the compiler to see the value though, array lookup (with exp10 as the index) + // For the compiler to see the value though, array lookup (with pow10 as the index) // is not sufficient: We have to use a switch statement. Although this introduces a branch, // it is still much faster than doing the divide any other way. // Perhaps an array can be used in C++23 with the assume attribute? @@ -325,7 +366,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // introduces too much pressure on the kernels that use this code, slowing down their benchmarks. // It also dramatically slows down the compile time. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value / 10U; case 2: return value / 100U; @@ -345,14 +386,14 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 19 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10) { // See comments in divide_power10_32bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value / 10U; case 2: return value / 100U; @@ -382,14 +423,14 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive. - * @return Returns value / 10^exp10. + * @param pow10 The power-of-10 of the denominator, from 0 to 38 inclusive. + * @return Returns value / 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10) { // See comments in divide_power10_32bit() for an introduction. 
- switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value / 10U; case 2: return value / 100U; @@ -438,14 +479,14 @@ CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 9 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10) { // See comments in divide_power10_32bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value * 10U; case 2: return value * 100U; @@ -465,14 +506,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 19 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10) { // See comments in divide_power10_32bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value * 10U; case 2: return value * 100U; @@ -502,14 +543,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive. - * @return Returns value * 10^exp10. + * @param pow10 The power-of-10 of the multiplier, from 0 to 38 inclusive. + * @return Returns value * 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10) { // See comments in divide_power10_128bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value * 10U; case 2: return value * 100U; @@ -556,59 +597,678 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10) /** * @brief Multiply an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. * If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier. 
+ * @return Returns value * 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return multiply_power10_32bit(value, exp10); + return multiply_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return multiply_power10_64bit(value, exp10); + return multiply_power10_64bit(value, pow10); } else { - return multiply_power10_128bit(value, exp10); + return multiply_power10_128bit(value, pow10); } } /** * @brief Divide an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. * If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator. + * @return Returns value / 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return divide_power10_32bit(value, exp10); + return divide_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return divide_power10_64bit(value, exp10); + return divide_power10_64bit(value, pow10); } else { - return divide_power10_128bit(value, exp10); + return divide_power10_128bit(value, pow10); } } +/** + * @brief Perform a bit-shift left, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift left + * @return The bit-shifted integer, except max value if UB would occur + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? value << bit_shift + : cuda::std::numeric_limits::max(); +} + +/** + * @brief Perform a bit-shift right, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift right + * @return The bit-shifted integer, which is zero on underflow + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? 
value >> bit_shift : 0; +} + +/** + * @brief Helper struct with common constants needed by the floating <--> decimal conversions + */ +template +struct shifting_constants { + /// Whether the type is double + static constexpr bool is_double = cuda::std::is_same_v; + + /// Integer type that can hold the value of the significand + using IntegerRep = std::conditional_t; + + /// Num bits needed to hold the significand + static constexpr auto num_significand_bits = cuda::std::numeric_limits::digits; + + /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room + using ShiftingRep = std::conditional_t; + + // The significand of a float / double is 24 / 53 bits + // However, to uniquely represent each double / float as different #'s in decimal + // you need 17 / 9 digits (from std::numeric_limits::max_digits10) + // To represent 10^17 / 10^9, you need 57 / 30 bits + // So we need to keep track of at least this # of bits during shifting to ensure no info is lost + + // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert + // between floating and decimal (see shifting functions for details). + + // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must + // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer + + // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio + // The difference between 2^10 and 10^3 is 1024/1000: 2.4% + // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4% + + // This 2.4% error compounds each time we do an iteration. + // The min (normal) float is 2^-126. + // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45 + // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts) + // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45 + + // Similarly, the min (normal) double is 2^-1022. + // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324 + // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts) + // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324 + + // To account for this compounding error, we can either complicate our loop code (slow), + // or use extra bits (in the direction we're shifting the 2s!) to compensate: + // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error) + /// # buffer bits to account for shifting error + static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1; + + // How much room do we have for shifting? + // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9 + // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20 + // Thus for double / float we can shift up to 20 / 9 decimal places at once + + // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow. + // To simplify our loop code, we'll keep to this ratio by instead shifting a max of + // 18 / 9 decimal places, for double / float (60 / 30 bits) + /// Max at-once decimal place shift + static constexpr int max_digits_shift = is_double ? 18 : 9; + /// Max at-once bit shift + static constexpr int max_bits_shift = max_digits_shift * 10 / 3; + + // Pre-calculate 10^max_digits_shift. 
Note that 10^18 / 10^9 fits within IntegerRep + /// 10^max_digits_shift + static constexpr auto max_digits_shift_pow = + multiply_power10(IntegerRep(1), max_digits_shift); +}; + +/** + * @brief Add half a bit to integer rep of floating point if conversion causes truncation + * + * @note This fixes problems like 1.2 (value = 1.1999...) at scale -1 -> 11 + * + * @tparam FloatingType Type of integer holding the floating-point significand + * @param floating The floating-point number to convert + * @param integer_rep The integer representation of the floating-point significand + * @param pow2 The power of 2 that needs to be applied to the significand + * @param pow10 The power of 10 that needs to be applied to the significand + * @return integer_rep, shifted 1 and ++'d if the conversion to decimal causes truncation + */ +template )> +CUDF_HOST_DEVICE cuda::std::pair::IntegralType, int> +add_half_if_truncates(FloatingType floating, + typename floating_converter::IntegralType integer_rep, + int pow2, + int pow10) +{ + // The user-supplied scale may truncate information, so we need to talk about rounding. + // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345 + + // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12! + // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1 + // While correct (given our choice to truncate), this is surprising and undesirable. + // This problem happens because 1.2 is not perfectly representable in floating point, + // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...) + + // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how + // do we make sure we store 1.2? We'll add half an ulp! (unit in the last place) + // Then 1.1999... becomes 1.2000...1... which truncates to 1.2. + // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2 + + // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the + // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number. + // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way. + + // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1. + // We'll always shift up so every input to the conversion algorithm is aligned the same way. + + // If we add a full ulp we run into issues where we add too much and get the wrong result. + // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision. + // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill. + + // So when does the user-supplied scale truncate info? + // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift. + // For powers < 0: When the 10s shift is less than the corresponding bit-shift. 
+ + // Corresponding bit-shift: + // 2^10 is approximately 10^3, but this is off by 1.024% + // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10 + // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90) + // Do comparison without dividing, which loses information: + // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3) + int const pow2_term = 90 * pow2; + int const pow10_term = 299 * pow10; + bool const conversion_truncates = + (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0)); + + // However, don't add a half-bit if the input is a whole number! + // This is only for errors introduced by rounding decimal fractions! + bool const is_whole_number = (cuda::std::floor(floating) == floating); + bool const add_half_bit = conversion_truncates && !is_whole_number; + + // Add half a bit on truncation (shift to make room and update pow2) + integer_rep <<= 1; + --pow2; + integer_rep += static_cast(add_half_bit); + + return {integer_rep, pow2}; +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 > 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_pospow( + typename shifting_constants::IntegerRep const base2_value, int pow2, int pow10) +{ + // To convert to decimal, we need to apply the input powers of 2 and 10 + // The result will be (integer) base2_value * (2^pow2) / (10^pow10) + // Output type is ShiftingRep + + // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s. + // We'll iterate back and forth between them, shifting up by 2s + // and down by 10s until all of the powers have been applied. + + // However the input base2_value type has virtually no spare room to shift our data + // without over- or under-flowing and losing precision. + // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + auto shifting_rep = static_cast(base2_value); + + // We want to start with our significand bits at the top of the shifting range, + // so that we don't lose information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side, + // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit. + // Note that shift_from is +1 due to shift in add_half_if_truncates() + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + static constexpr int shift_from = Constants::num_significand_bits + 1; + static constexpr int max_init_shift = shift_up_to - shift_from; + + // If our total bit shift is less than this, we don't need to iterate + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow2 <= max_init_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + // NOTE: Cast can overflow! 
+ return static_cast(shifting_rep); + } + + // We need to iterate. Do the combined initial shift + shifting_rep <<= max_init_shift; + pow2 -= max_init_shift; + + // Iterate, dividing by 10s and shifting up by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Divide the max number of 10s + shifting_rep /= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // If our remaining bit shift is less than the max, we're finished iterating + if (pow2 <= Constants::max_bits_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + + // NOTE: Cast can overflow! + return static_cast(shifting_rep); + } + + // Shift the max number of bits left again + shifting_rep <<= Constants::max_bits_shift; + pow2 -= Constants::max_bits_shift; + } + + // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail + // Note: This divide result may not fit in the low half of the bit range + // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = divide_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = divide_power10_32bit(shifting_rep, pow10); + } + + // Final bit shift: Shift may be large, guard against UB + // NOTE: This can overflow (both cast and shift)! + return guarded_left_shift(static_cast(shifting_rep), pow2); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 < 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_negpow( + typename shifting_constants::IntegerRep base2_value, int pow2, int pow10) +{ + // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0 + // See comments in that function for details. 
+ // Instead here we need to multiply by 10s and shift right by 2s + + // ShiftingRep: uint64 for floats, __uint128_t for doubles + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + auto shifting_rep = static_cast(base2_value); + + // Convert to using positive values so we don't have to keep negating + int pow10_mag = -pow10; + int pow2_mag = -pow2; + + // For performing final 10s-shift + using UnsignedRep = cuda::std::make_unsigned_t; + auto final_shifts_low10s = [&]() { + // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail + // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag); + } else { + shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag); + } + + // Final bit shifting: Shift may be large, guard against UB + return static_cast(guarded_right_shift(shifting_rep, pow2_mag)); + }; + + // If our total decimal shift is less than the max, we don't need to iterate + if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); } + + // We want to start by lining up our bits to the top of the shifting range, + // except our first operation is a multiply, so not quite that far + // We are bit-shifting down, so we need extra bits on the low-side, which this has. + // Note that shift_from is +1 due to shift in add_half_if_truncates() + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift; + static constexpr int shift_from = Constants::num_significand_bits + 1; + static constexpr int num_init_bit_shift = shift_up_to - shift_from; + + // Perform initial shift + shifting_rep <<= num_init_bit_shift; + pow2_mag += num_init_bit_shift; + + // Iterate, multiplying by 10s and shifting down by 2s until we're almost done + do { + // More decimal places to shift than we have room: Multiply the max number of 10s + shifting_rep *= Constants::max_digits_shift_pow; + pow10_mag -= Constants::max_digits_shift; + + // If our remaining bit shift is less than the max, we're finished iterating + if (pow2_mag <= Constants::max_bits_shift) { + // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail + shifting_rep >>= pow2_mag; + + // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.) + // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep. + // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying. + // NOTE: This can overflow!
(Both multiply and cast) + return multiply_power10(static_cast(shifting_rep), pow10_mag); + } + + // More bits to shift than we have room: Shift the max number of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2_mag -= Constants::max_bits_shift; + } while (pow10_mag > Constants::max_digits_shift); + + // Do our final shifts + return final_shifts_low10s(); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t convert_floating_to_integral_shifting( + typename floating_converter::IntegralType base2_value, int pow10, int pow2) +{ + // Apply the powers of 2 and 10 to convert to decimal. + // The result will be base2_value * (2^pow2) / (10^pow10) + + // Note that while this code is branchy, the decimal scale factor is part of the + // column type itself, so every thread will take the same branches on pow10. + // Also data within a column tends to be similar, so they will often take the + // same branches on pow2 as well. + + // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep) + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow10 == 0) { + // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32) + // Bit shifts may be large, guard against UB + if (pow2 >= 0) { + return guarded_left_shift(static_cast(base2_value), pow2); + } else { + return static_cast(guarded_right_shift(base2_value, -pow2)); + } + } else if (pow10 > 0) { + if (pow2 <= 0) { + // Power-2/10 shifts both downward: order doesn't matter, apply and bail. + // Guard against shift being undefined behavior + auto const shifted = guarded_right_shift(base2_value, -pow2); + return static_cast(divide_power10(shifted, pow10)); + } + return shift_to_decimal_pospow(base2_value, pow2, pow10); + } else { // pow10 < 0 + if (pow2 >= 0) { + // Power-2/10 shifts both upward: order doesn't matter, apply and bail. + // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow! 
+ auto const shifted = guarded_left_shift(static_cast(base2_value), pow2); + return multiply_power10(shifted, -pow10); + } + return shift_to_decimal_negpow(base2_value, pow2, pow10); + } +} + +/** + * @brief Perform floating-point -> integer decimal conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param floating The floating point value to convert + * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating, + scale_type const& scale) +{ + // Extract components of the floating point number + using converter = floating_converter; + auto const integer_rep = converter::bit_cast_to_integer(floating); + if (converter::is_zero(integer_rep)) { return 0; } + + // Note that the significand here is an unsigned integer with sizeof(FloatingType) + auto const is_negative = converter::get_is_negative(integer_rep); + auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep); + + // Add half a bit if truncating to yield expected value, see function for discussion. + auto const pow10 = static_cast(scale); + auto const [base2_value, pow2] = + add_half_if_truncates(floating, significand, floating_pow2, pow10); + + // Apply the powers of 2 and 10 to convert to decimal. + auto const magnitude = + convert_floating_to_integral_shifting(base2_value, pow10, pow2); + + // Reapply the sign and return + // NOTE: Cast can overflow! + auto const signed_magnitude = static_cast(magnitude); + return is_negative ? -signed_magnitude : signed_magnitude; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 > 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10) +{ + // This is the reverse of shift_to_decimal_pospow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start by lining up our bits to the top of the shifting range, + // except our first operation is a multiply, so not quite that far + // We are bit-shifting down, so we need extra bits on the low-side, which this has. + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? 
(decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Iterate, multiplying by 10s and shifting down by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Multiply the max number of 10s + shifting_rep *= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // Then make more room by bit shifting down by the max # of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2 += Constants::max_bits_shift; + } + + // Last 10s-shift: multiply all remaining decimal places + // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = multiply_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = multiply_power10_32bit(shifting_rep, pow10); + } + + // Our shifting_rep is now the integer mantissa, return it and the powers of 2 + return std::pair{shifting_rep, pow2}; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 < 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10) +{ + // This is the reverse of shift_to_decimal_negpow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start with our significand bits at the top of the shifting range, + // so that we lose minimal information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Convert to using positive values upfront, simpler than doing later. 
+ int pow10_mag = -pow10; + + // Iterate, dividing by 10s and shifting up by 2s until we're almost done + while (pow10_mag > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Divide the max number of 10s + shifting_rep /= Constants::max_digits_shift_pow; + pow10_mag -= Constants::max_digits_shift; + + // Then make more room by bit shifting up by the max # of 2s + shifting_rep <<= Constants::max_bits_shift; + pow2 -= Constants::max_bits_shift; + } + + // Last 10s-shift: Divide all remaining decimal places. + // This divide result may not fit in the low half of the bit range + // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag); + } else { + shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag); + } + + // Our shifting_rep is now the integer mantissa, return it and the powers of 2 + return std::pair{shifting_rep, pow2}; +} + +/** + * @brief Perform integer decimal -> floating-point conversion + * + * @tparam FloatingType The type of floating-point object we are converting to + * @tparam Rep The decimal integer type we are converting from + * @param value The decimal integer to convert + * @param scale The base-10 scale factor for the input integer + * @return Floating-point representation of the scaled integral value + */ +template )> +CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value, + scale_type const& scale) +{ + // Check the sign of the input + bool const is_negative = (value < 0); + + // Convert to unsigned for bit counting/shifting + using UnsignedType = cuda::std::make_unsigned_t; + auto const unsigned_value = [&]() -> UnsignedType { + // Must guard against minimum value, as we can't just negate it: not representable. + if (value == cuda::std::numeric_limits::min()) { return static_cast(value); } + + // No abs function for 128bit types, so have to do it manually. + if constexpr (cuda::std::is_same_v) { + return static_cast(is_negative ?
-value : value); + } else { + return cuda::std::abs(value); + } + }(); + + // Shift by powers of 2 and 10 to get our integer mantissa + auto const [mantissa, pow2] = [&]() { + auto const pow10 = static_cast(scale); + if (pow10 >= 0) { + return shift_to_binary_pospow(unsigned_value, pow10); + } else { // pow10 < 0 + return shift_to_binary_negpow(unsigned_value, pow10); + } + }(); + + // Zero has special exponent bits, just handle it here + if (mantissa == 0) { return FloatingType(0.0f); } + + // Cast our integer mantissa to floating point + auto const floating = static_cast(mantissa); // IEEE-754 rounds to even + + // Apply the sign and the remaining powers of 2 + using converter = floating_converter; + auto const magnitude = converter::add_pow2(floating, pow2); + return converter::set_is_negative(magnitude, is_negative); +} + } // namespace detail /** @} */ // end of group diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 74c8bc67d3a..8a515335351 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -50,14 +51,19 @@ namespace cudf { */ template () && - cuda::std::is_floating_point_v>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) { - using Rep = typename Fixed::rep; - auto const shifted = numeric::detail::shift(floating, scale); - numeric::scaled_integer scaled{static_cast(shifted), scale}; - return Fixed(scaled); + using Rep = typename Fixed::rep; + auto const value = [&]() { + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_floating_to_integral(floating, scale); + } else { + return static_cast(numeric::detail::shift(floating, scale)); + } + }(); + + return Fixed(numeric::scaled_integer{value, scale}); } /** @@ -75,14 +81,17 @@ CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::sca */ template && - is_fixed_point()>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) { - using Rep = typename Fixed::rep; - auto const casted = static_cast(fixed.value()); - auto const scale = numeric::scale_type{-fixed.scale()}; - return numeric::detail::shift(casted, scale); + using Rep = typename Fixed::rep; + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_integral_to_floating(fixed.value(), fixed.scale()); + } else { + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); + } } /** @@ -95,7 +104,7 @@ CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) */ template >* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v)> CUDF_HOST_DEVICE Floating convert_to_floating(Input input) { if constexpr (is_fixed_point()) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index ab7984d4b03..a222289216d 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -38,7 +38,7 @@ struct FixedPointTest : public cudf::test::BaseFixture {}; template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; -using RepresentationTypes = ::testing::Types; +using RepresentationTypes = ::testing::Types; TYPED_TEST_SUITE(FixedPointTestAllReps, 
RepresentationTypes); @@ -53,6 +53,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(0.0, scale_type(-4)); EXPECT_EQ(1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(1.2, cudf::convert_fixed_to_floating(num1)); @@ -61,6 +62,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) @@ -74,6 +76,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(-0.0, scale_type(-4)); EXPECT_EQ(-1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating(num1)); @@ -82,6 +85,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(-0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) @@ -99,14 +103,10 @@ TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) EXPECT_EQ(1.1, cudf::convert_fixed_to_floating(a)); EXPECT_EQ(1.01, cudf::convert_fixed_to_floating(b)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - c)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.001, cudf::convert_fixed_to_floating(c)); EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating(d)); EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating(e)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - f)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.000001, cudf::convert_fixed_to_floating(f)); EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating(x) < std::numeric_limits::epsilon()); @@ -153,6 +153,119 @@ TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction) EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating(num1)); } +TEST_F(FixedPointTest, PreciseFloatDecimal64Construction) +{ + // Need 9 decimal digits to uniquely represent all floats (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 9 less than the order-of-magnitude. + // But with -9 scale factor decimal32 can overflow: use decimal64 instead. 
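+  // For example, in the first case below 3.141593E7f has an order-of-magnitude of 7,
+  // so the scale is chosen as 7 - 9 = -2 and the stored integer becomes
+  // 3.141593E7 / 1E-2 ~= 3141593000, preserving all 9 significant decimal digits.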
+ + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E7f, scale_type(-2)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E12f, scale_type(3)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E17f, scale_type(8)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E22f, scale_type(13)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E27f, scale_type(18)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E32f, scale_type(23)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E37f, scale_type(28)); + + EXPECT_EQ(3.141593E7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E37f, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-7f, scale_type(-16)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-12f, scale_type(-21)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-17f, scale_type(-26)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-22f, scale_type(-31)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-27f, scale_type(-36)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-32f, scale_type(-41)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-37f, scale_type(-47)); + + EXPECT_EQ(3.141593E-7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-37f, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-39f, scale_type(-48)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-41f, scale_type(-50)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-43f, scale_type(-52)); + auto num10 = cudf::convert_floating_to_fixed(FLT_TRUE_MIN, scale_type(-54)); + + EXPECT_EQ(3.141593E-39f, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-41f, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-43f, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(FLT_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + +TEST_F(FixedPointTest, PreciseDoubleDecimal64Construction) +{ + // Need 17 decimal digits to uniquely represent all doubles (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 17 less than the order-of-magnitude. 
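+  // For example, in the first case below 3.141593E8 has an order-of-magnitude of 8,
+  // so the scale is chosen as 8 - 17 = -9 and the stored integer becomes
+  // 3.141593E8 / 1E-9 ~= 3.141593E17, which still fits in the 64-bit rep while
+  // keeping all 17 significant decimal digits.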
+ + using decimal64 = fixed_point; + + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E8, scale_type(-9)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E58, scale_type(41)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E108, scale_type(91)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E158, scale_type(141)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E208, scale_type(191)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E258, scale_type(241)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E307, scale_type(290)); + + EXPECT_EQ(3.141593E8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E307, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-8, scale_type(-25)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-58, scale_type(-75)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-108, scale_type(-125)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-158, scale_type(-175)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-208, scale_type(-225)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-258, scale_type(-275)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-308, scale_type(-325)); + + EXPECT_EQ(3.141593E-8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-308, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-309, scale_type(-326)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-314, scale_type(-331)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-319, scale_type(-336)); + auto num10 = cudf::convert_floating_to_fixed(DBL_TRUE_MIN, scale_type(-341)); + + EXPECT_EQ(3.141593E-309, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-314, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-319, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(DBL_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) { using decimalXX = fixed_point; @@ -442,8 +555,6 @@ void float_vector_test(ValueType const initial_value, int32_t const scale, Binop binop) { - using decimal32 = fixed_point; - std::vector vec1(size); std::vector vec2(size); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1d6a3b3304a..7136b162c13 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3509,9 +3509,9 @@ void testCastFloatToDecimal() { @Test void testCastDoubleToDecimal() { testCastNumericToDecimalsAndBack(DType.FLOAT64, false, 0, - () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Long.MAX_VALUE), 
- () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Long.MAX_VALUE), - new Long[]{1L, 2L, -3L, null, 2L, Long.MAX_VALUE} + () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Integer.MAX_VALUE), + () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Integer.MAX_VALUE), + new Long[]{1L, 2L, -3L, null, 2L, (long) Integer.MAX_VALUE} ); testCastNumericToDecimalsAndBack(DType.FLOAT64, false, -2, () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, -55.01999), diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index c41a938f6ea..65f739bc74a 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -97,7 +97,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): pytest.mark.xfail( condition=version.parse(pa.__version__) >= version.parse("13.0.0") and from_dtype == np.dtype("float32") - and to_dtype.precision > 7, + and to_dtype.precision > 12, reason="https://github.com/rapidsai/cudf/issues/14169", ) ) From 2b2058de941289ca343cb1d3a3eb143a84998dfd Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 11 Jul 2024 07:19:12 -0400 Subject: [PATCH 041/101] Add custom name setter and getter for proxy objects in `cudf.pandas` (#16234) Closes #14524 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16234 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 50 +++++++++++++++++-- .../cudf_pandas_tests/test_cudf_pandas.py | 40 +++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 3f94fc18980..d3a3488081a 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,6 +260,23 @@ def Index__new__(cls, *args, **kwargs): return self +def name(self): + return self._fsproxy_wrapped._name + + +def Index__setattr__(self, name, value): + if name.startswith("_"): + object.__setattr__(self, name, value) + return + if name == "name": + setattr(self._fsproxy_wrapped, "_name", value) + if name == "names": + setattr(self._fsproxy_wrapped, "_names", value) + return _FastSlowAttribute("__setattr__").__get__(self, type(self))( + name, value + ) + + Index = make_final_proxy_type( "Index", cudf.Index, @@ -277,11 +294,13 @@ def Index__new__(cls, *args, **kwargs): "__iter__": custom_iter, "__init__": _DELETE, "__new__": Index__new__, + "__setattr__": Index__setattr__, "_constructor": _FastSlowAttribute("_constructor"), "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": property(name), }, ) @@ -292,7 +311,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": property(name), + }, ) SparseDtype = make_final_proxy_type( @@ -319,7 +342,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + 
additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": property(name), + }, ) Categorical = make_final_proxy_type( @@ -350,6 +377,8 @@ def Index__new__(cls, *args, **kwargs): "__init__": _DELETE, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "__setattr__": Index__setattr__, + "name": property(name), }, ) @@ -385,6 +414,8 @@ def Index__new__(cls, *args, **kwargs): "__init__": _DELETE, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "__setattr__": Index__setattr__, + "name": property(name), }, ) @@ -441,6 +472,8 @@ def Index__new__(cls, *args, **kwargs): "__init__": _DELETE, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "__setattr__": Index__setattr__, + "name": property(name), }, ) @@ -474,6 +507,11 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) + +def names(self): + return self._fsproxy_wrapped._names + + MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -481,7 +519,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": property(names), + }, ) TimeGrouper = make_intermediate_proxy_type( @@ -669,6 +711,8 @@ def Index__new__(cls, *args, **kwargs): "__init__": _DELETE, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "__setattr__": Index__setattr__, + "name": property(name), }, ) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index bc864a48e9d..6292022d8e4 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1592,3 +1592,43 @@ def test_at_setitem_empty(): df.at[0, "new"] = 2.0 expected = pd.DataFrame({"name": [1.0], "new": [2.0]}) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "index", + [ + xpd.Index([1, 2, 3], name="foo"), + xpd.Index(["a", "b", "c"], name="foo"), + xpd.RangeIndex(start=0, stop=3, step=1, name="foo"), + xpd.CategoricalIndex(["a", "b", "a"], name="foo"), + xpd.DatetimeIndex( + ["2024-04-24", "2025-04-24", "2026-04-24"], name="foo" + ), + xpd.TimedeltaIndex(["1 days", "2 days", "3 days"], name="foo"), + xpd.PeriodIndex( + ["2024-06", "2023-06", "2022-06"], freq="M", name="foo" + ), + xpd.IntervalIndex.from_breaks([0, 1, 2, 3], name="foo"), + xpd.MultiIndex.from_tuples( + [(1, "a"), (2, "b"), (3, "c")], names=["foo1", "bar1"] + ), + ], +) +def test_change_index_name(index): + s = xpd.Series([1, 2, object()], index=index) + df = xpd.DataFrame({"values": [1, 2, object()]}, index=index) + + if isinstance(index, xpd.MultiIndex): + names = ["foo2", "bar2"] + s.index.names = names + df.index.names = names + + assert s.index.names == names + assert df.index.names == names + else: + name = "bar" + s.index.name = name + df.index.name = name + + assert s.index.name == name + assert df.index.name == name From b06d883486e8e1e1afeb9406eebb2d2429de96a1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:41:07 -0400 Subject: [PATCH 042/101] Allow only scale=0 fixed-point values in 
fixed_width_column_wrapper (#16120) The `cudf::test::fixed_width_column_wrapper` supports all fixed-width types, including fixed-point types. However, there is no mechanism to specify the fixed-point scale value, which is common to the entire column and stored in the column's type. This change addresses that by throwing an error if a non-zero scale is specified for the input values in a fixed-point `fixed_width_column_wrapper` instance. Also fixed several tests that incorrectly specified a non-zero scale. Closes #16092 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/16120 --- cpp/include/cudf_test/column_wrapper.hpp | 3 + cpp/tests/io/orc_test.cpp | 55 +++++-------------- cpp/tests/io/parquet_v2_test.cpp | 34 ++++-------- .../reshape/interleave_columns_tests.cpp | 17 +++--- cpp/tests/streams/io/csv_test.cpp | 24 +++----- cpp/tests/streams/io/orc_test.cpp | 20 ++----- cpp/tests/streams/io/parquet_test.cpp | 18 ++---- 7 files changed, 52 insertions(+), 119 deletions(-) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 7363f965af8..2abd6f0abac 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -226,6 +226,9 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) using namespace numeric; using RepType = typename ElementTo::rep; + CUDF_EXPECTS(std::all_of(begin, end, [](ElementFrom v) { return v.scale() == 0; }), + "Only zero-scale fixed-point values are supported"); + auto to_rep = [](ElementTo fp) { return fp.value(); }; auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b5e080f3cc5..39ba62952b4 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -54,9 +54,9 @@ using int32_col = column_wrapper; using int64_col = column_wrapper; using float32_col = column_wrapper; using float64_col = column_wrapper; -using dec32_col = column_wrapper; -using dec64_col = column_wrapper; -using dec128_col = column_wrapper; +using dec32_col = cudf::test::fixed_point_column_wrapper; +using dec64_col = cudf::test::fixed_point_column_wrapper; +using dec128_col = cudf::test::fixed_point_column_wrapper; using struct_col = cudf::test::structs_column_wrapper; template using list_col = cudf::test::lists_column_wrapper; @@ -355,12 +355,6 @@ TEST_F(OrcWriterTest, MultiColumn) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}}; - }); bool_col col0(col0_data.begin(), col0_data.end()); int8_col col1(col1_data.begin(), col1_data.end()); @@ -368,8 +362,8 @@ TEST_F(OrcWriterTest, MultiColumn) int32_col col3(col3_data.begin(), col3_data.end()); float32_col col4(col4_data.begin(), col4_data.end()); float64_col col5(col5_data.begin(), col5_data.end()); - dec128_col col6(col6_data, col6_data + num_rows); - dec128_col col7(col7_data, col7_data + num_rows); + dec128_col col6{col6_vals.begin(),
col6_vals.end(), numeric::scale_type{12}}; + dec128_col col7{col6_vals.begin(), col6_vals.end(), numeric::scale_type{-12}}; list_col col8{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; @@ -416,9 +410,6 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{col6_vals[i], numeric::scale_type{2}}; - }); auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = @@ -438,7 +429,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) int32_col col3{col3_data.begin(), col3_data.end(), col3_mask}; float32_col col4{col4_data.begin(), col4_data.end(), col4_mask}; float64_col col5{col5_data.begin(), col5_data.end(), col5_mask}; - dec64_col col6{col6_data, col6_data + num_rows, col6_mask}; + dec64_col col6{col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{2}}; list_col col7{ {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, col0_mask}; @@ -541,14 +532,11 @@ TEST_F(OrcWriterTest, SlicedTable) auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); auto vals_col3 = random_values(num_rows); - auto seq_col3 = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals_col3[i], numeric::scale_type{2}}; - }); int32_col col0(seq_col0.begin(), seq_col0.end()); str_col col1(strings.begin(), strings.end()); float32_col col2(seq_col2.begin(), seq_col2.end()); - dec64_col col3(seq_col3, seq_col3 + num_rows); + dec64_col col3{vals_col3.begin(), vals_col3.end(), numeric::scale_type{2}}; list_col col4{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; @@ -1213,11 +1201,8 @@ TEST_P(OrcWriterTestDecimal, Decimal64) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{scale}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); - dec64_col col{data, data + num_rows, mask}; + dec64_col col{vals.begin(), vals.end(), mask, numeric::scale_type{scale}}; cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); @@ -1244,11 +1229,8 @@ TEST_F(OrcWriterTest, Decimal32) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal32{vals[i], numeric::scale_type{2}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); - dec32_col col{data, data + num_rows, mask}; + dec32_col col{vals.begin(), vals.end(), mask, numeric::scale_type{2}}; cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); @@ -1527,12 +1509,9 @@ TEST_F(OrcReaderTest, DecimalOptions) { constexpr auto num_rows = 10; auto col_vals = random_values(num_rows); - auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col_vals[i], numeric::scale_type{2}}; - }); auto mask = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; }); - dec128_col col{col_data, col_data + num_rows, mask}; + dec128_col col{col_vals.begin(), col_vals.end(), mask, numeric::scale_type{2}}; table_view expected({col}); cudf::io::table_input_metadata expected_metadata(expected); @@ -1555,15 +1534,9 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) { auto const num_rows = 100; - auto dec_vals = random_values(num_rows); - auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{dec_vals[i], numeric::scale_type{2}}; - }); - auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{dec_vals[i], numeric::scale_type{2}}; - }); - dec64_col dec1_col(dec1_data, dec1_data + num_rows); - dec128_col dec2_col(dec2_data, dec2_data + num_rows); + auto dec_vals = random_values(num_rows); + dec64_col dec1_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; + dec128_col dec2_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col}; auto int_vals = random_values(num_rows); @@ -1974,7 +1947,7 @@ TEST_F(OrcStatisticsTest, Empty) int32_col col0{}; float64_col col1{}; str_col col2{}; - dec64_col col3{}; + dec64_col col3{{}, numeric::scale_type{0}}; column_wrapper col4; bool_col col5{}; table_view expected({col0, col1, col2, col3, col4, col5}); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index f106fd5a487..9e66fc9409f 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -47,15 +47,6 @@ TEST_P(ParquetV2Test, MultiColumn) auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; - }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) { - return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; - }); // column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; @@ -63,9 +54,13 @@ TEST_P(ParquetV2Test, MultiColumn) column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; column_wrapper col5{col5_data.begin(), col5_data.end(), no_nulls()}; - column_wrapper col6{col6_data, col6_data + num_rows, no_nulls()}; - column_wrapper col7{col7_data, col7_data + num_rows, no_nulls()}; - column_wrapper col8{col8_data, col8_data + num_rows, no_nulls()}; + + cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), no_nulls(), numeric::scale_type{5}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), no_nulls(), numeric::scale_type{-5}); + cudf::test::fixed_point_column_wrapper col8( + col8_vals.begin(), col8_vals.end(), no_nulls(), numeric::scale_type{-6}); auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; @@ -109,14 +104,6 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); auto 
col7_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; - }); - // auto col0_mask = cudf::detail::make_counting_transform_iterator( - // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); auto col2_mask = no_nulls(); @@ -138,8 +125,11 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; - column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; + + cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), col7_mask, numeric::scale_type{-8}); auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}}; diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index bc7488bbf9e..de155c35a5e 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -363,19 +363,16 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave) { using namespace numeric; using decimalXX = TypeParam; + using RepType = typename decimalXX::rep; for (int i = 0; i > -4; --i) { - auto const ONE = decimalXX{1, scale_type{i}}; - auto const TWO = decimalXX{2, scale_type{i}}; - auto const FOUR = decimalXX{4, scale_type{i}}; - auto const FIVE = decimalXX{5, scale_type{i}}; + auto const a = cudf::test::fixed_point_column_wrapper({1, 4}, scale_type{i}); + auto const b = cudf::test::fixed_point_column_wrapper({2, 5}, scale_type{i}); - auto const a = cudf::test::fixed_width_column_wrapper({ONE, FOUR}); - auto const b = cudf::test::fixed_width_column_wrapper({TWO, FIVE}); - - auto const input = cudf::table_view{std::vector{a, b}}; - auto const expected = cudf::test::fixed_width_column_wrapper({ONE, TWO, FOUR, FIVE}); - auto const actual = cudf::interleave_columns(input); + auto const input = cudf::table_view{std::vector{a, b}}; + auto const expected = + cudf::test::fixed_point_column_wrapper({1, 2, 4, 5}, scale_type{i}); + auto const actual = cudf::interleave_columns(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, actual->view()); } diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp index 6e27db02d56..42894a0ebcb 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -39,12 +39,6 @@ TEST_F(CSVTest, CSVWriter) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -52,8 +46,10 @@ 
TEST_F(CSVTest, CSVWriter) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); @@ -72,12 +68,6 @@ TEST_F(CSVTest, CSVReader) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -85,8 +75,10 @@ TEST_F(CSVTest, CSVReader) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index 401c7049381..cc43bf15b5d 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -59,22 +59,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col4(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col5(zeros_iterator, zeros_iterator + num_rows); - - cudf::test::fixed_width_column_wrapper col6 = [&ones_iterator] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}}; - }); - return cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - - cudf::test::fixed_width_column_wrapper col7 = [&ones_iterator] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8 = [] { auto col8_mask = diff --git a/cpp/tests/streams/io/parquet_test.cpp 
b/cpp/tests/streams/io/parquet_test.cpp index b277d184e3a..9d2dec2d697 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -55,20 +55,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6 = [&ones] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - return cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - cudf::test::fixed_width_column_wrapper col7 = [&ones] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8{ {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}; From 53de73d3010ce4fa3b27ab53d14d58312d4793dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jul 2024 07:59:39 -1000 Subject: [PATCH 043/101] Add Column.strftime/strptime instead of overloading `as_string/datetime/timedelta_column` (#16243) `Column.as_string/datetime/timedelta_column` had a `format` argument that went unused for columns that were not these types or did not require conversion to these types. This PR introduces `strftime` and `strptime` methods on the column to handle this `format` argument. 
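A minimal sketch of the resulting split at the internal column level (the inputs and call sites below are illustrative, not taken from this PR; internal helpers such as `as_column` are assumed from cuDF's column layer, while `strptime`/`strftime` and the format-free conversions come from this change):

```python
import cudf
from cudf.core.column import as_column

# Parsing strings now goes through Column.strptime, which requires an
# explicit format plus a datetime or timedelta dtype.
str_col = as_column(["2024-01-01", "2024-01-02"])
dt_col = str_col.strptime(cudf.dtype("datetime64[s]"), "%Y-%m-%d")

# Formatting a datetime column back to strings goes through Column.strftime.
round_trip = dt_col.strftime("%Y-%m-%d")

# as_datetime_column()/as_string_column() keep only the format-free paths,
# inferring or choosing a default format internally.
inferred = str_col.as_datetime_column(cudf.dtype("datetime64[s]"))
```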
Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16243 --- python/cudf/cudf/core/column/categorical.py | 24 +--- python/cudf/cudf/core/column/column.py | 14 +-- python/cudf/cudf/core/column/datetime.py | 126 +++++++++----------- python/cudf/cudf/core/column/decimal.py | 4 +- python/cudf/cudf/core/column/lists.py | 8 +- python/cudf/cudf/core/column/numerical.py | 12 +- python/cudf/cudf/core/column/string.py | 91 +++++++------- python/cudf/cudf/core/column/timedelta.py | 30 ++--- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tools/datetimes.py | 38 +++--- 10 files changed, 150 insertions(+), 201 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index cec7d5e6663..f763d3b4b0c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1136,26 +1136,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column( - self, dtype, format: str | None = None - ) -> StringColumn: - return self._get_decategorized_column().as_string_column( - dtype, format=format - ) + def as_string_column(self) -> StringColumn: + return self._get_decategorized_column().as_string_column() - def as_datetime_column( - self, dtype, format: str | None = None - ) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column( - dtype, format - ) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + return self._get_decategorized_column().as_datetime_column(dtype) - def as_timedelta_column( - self, dtype, format: str | None = None - ) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column( - dtype, format - ) + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: + return self._get_decategorized_column().as_timedelta_column(dtype) def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index adc783c20c4..f633d527681 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1003,7 +1003,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: f"Casting to {dtype} is not supported, use " "`.astype('str')` instead." 
) - result = self.as_string_column(dtype) + result = self.as_string_column() else: result = self.as_numerical_column(dtype) @@ -1059,8 +1059,8 @@ def as_numerical_column( raise NotImplementedError def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: raise NotImplementedError def as_interval_column( @@ -1069,13 +1069,11 @@ def as_interval_column( raise NotImplementedError def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: raise NotImplementedError - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: raise NotImplementedError def as_decimal_column( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c10aceba9f4..214e84028d2 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -178,43 +178,6 @@ def _resolve_mixed_dtypes( return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") -def _get_datetime_format(col, dtype, time_unit): - format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S") - if format.endswith("f"): - sub_second_res_len = 3 - else: - sub_second_res_len = 0 - - has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any() - has_micros = ( - time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() - ) - has_millis = ( - time_unit in {"ns", "us", "ms"} - and col.get_dt_field("millisecond").any() - ) - has_seconds = col.get_dt_field("second").any() - has_minutes = col.get_dt_field("minute").any() - has_hours = col.get_dt_field("hour").any() - if sub_second_res_len: - if has_nanos: - # format should be intact and rest of the - # following conditions shouldn't execute. - pass - elif has_micros: - format = format[:-sub_second_res_len] + "%6f" - elif has_millis: - format = format[:-sub_second_res_len] + "%3f" - elif has_seconds or has_minutes or has_hours: - format = format[:-4] - else: - format = format.split(" ")[0] - else: - if not (has_seconds or has_minutes or has_hours): - format = format.split(" ")[0] - return format - - class DatetimeColumn(column.ColumnBase): """ A Column implementation for Date-time types. 
@@ -381,9 +344,7 @@ def round(self, freq: str) -> ColumnBase: def isocalendar(self) -> dict[str, ColumnBase]: return { - field: self.as_string_column("str", format=directive).astype( - "uint32" - ) + field: self.strftime(format=directive).astype("uint32") for field, directive in zip( ["year", "week", "day"], ["%G", "%V", "%u"] ) @@ -445,17 +406,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return NotImplemented - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> DatetimeColumn: - dtype = cudf.dtype(dtype) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a datetimelike from {self.dtype} to {dtype}" ) @@ -472,32 +428,63 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%Y-%m-%d %H:%M:%S" + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: + return cast( + cudf.core.column.StringColumn, + column.column_empty(0, dtype="object", masked=False), ) - if cudf.get_option("mode.pandas_compatible"): - format = _get_datetime_format( - self, dtype=self.dtype, time_unit=self.time_unit - ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: names = cudf.core.column.column_empty( 0, dtype="object", masked=False ) - if len(self) > 0: - return string._datetime_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format, names) - else: - return cast( - "cudf.core.column.StringColumn", - column.column_empty(0, dtype="object", masked=False), + return string._datetime_to_str_typecast_functions[self.dtype]( + self, format, names + ) + + def as_string_column(self) -> cudf.core.column.StringColumn: + format = _dtype_to_format_conversion.get( + self.dtype.name, "%Y-%m-%d %H:%M:%S" + ) + if cudf.get_option("mode.pandas_compatible"): + if format.endswith("f"): + sub_second_res_len = 3 + else: + sub_second_res_len = 0 + + has_nanos = ( + self.time_unit in {"ns"} + and self.get_dt_field("nanosecond").any() ) + has_micros = ( + self.time_unit in {"ns", "us"} + and self.get_dt_field("microsecond").any() + ) + has_millis = ( + self.time_unit in {"ns", "us", "ms"} + and self.get_dt_field("millisecond").any() + ) + has_seconds = self.get_dt_field("second").any() + has_minutes = self.get_dt_field("minute").any() + has_hours = self.get_dt_field("hour").any() + if sub_second_res_len: + if has_nanos: + # format should be intact and rest of the + # following conditions shouldn't execute. 
+ pass + elif has_micros: + format = format[:-sub_second_res_len] + "%6f" + elif has_millis: + format = format[:-sub_second_res_len] + "%3f" + elif has_seconds or has_minutes or has_hours: + format = format[:-4] + else: + format = format.split(" ")[0] + elif not (has_seconds or has_minutes or has_hours): + format = format.split(" ")[0] + return self.strftime(format) def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 @@ -872,10 +859,11 @@ def _local_time(self): offsets_from_utc = offsets.take(indices, nullify=True) return self + offsets_from_utc - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - return self._local_time.as_string_column(dtype, format) + def strftime(self, format: str) -> cudf.core.column.StringColumn: + return self._local_time.strftime(format) + + def as_string_column(self) -> cudf.core.column.StringColumn: + return self._local_time.as_string_column() def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3e238d65cff..a63055ed527 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -62,9 +62,7 @@ def as_decimal_column( return self return libcudf.unary.cast(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return cpp_from_decimal(self) else: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1992d471947..cc15e78314e 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -253,15 +253,11 @@ def from_sequences( ) return res - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: """ Create a strings column from a list column """ - lc = self._transform_leaves( - lambda col, dtype: col.as_string_column(dtype), dtype - ) + lc = self._transform_leaves(lambda col: col.as_string_column()) # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 76c64e1aea0..a0550bff72b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -331,9 +331,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn": return libcudf.string_casting.int2ip(self) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return string._numeric_to_str_typecast_functions[ cudf.dtype(self.dtype) @@ -345,8 +343,8 @@ def as_string_column( ) def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: return cast( "cudf.core.column.DatetimeColumn", build_column( @@ -359,8 +357,8 @@ def as_datetime_column( ) def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: return cast( "cudf.core.column.TimeDeltaColumn", build_column( diff --git 
a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 936cd1eccb0..96f9cdfd655 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5669,16 +5669,25 @@ def as_numerical_column( result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) return result_col - def _as_datetime_or_timedelta_column(self, dtype, format): - if len(self) == 0: - return cudf.core.column.column_empty(0, dtype=dtype) - - # Check for None strings - if (self == "None").any(): - raise ValueError("Could not convert `None` value to datetime") - - is_nat = self == "NaT" - if dtype.kind == "M": + def strptime( + self, dtype: Dtype, format: str + ) -> cudf.core.column.DatetimeColumn | cudf.core.column.TimeDeltaColumn: + if dtype.kind not in "Mm": # type: ignore[union-attr] + raise ValueError( + f"dtype must be datetime or timedelta type, not {dtype}" + ) + elif self.null_count == len(self): + return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + elif (self == "None").any(): + raise ValueError( + "Cannot convert `None` value to datetime or timedelta." + ) + elif dtype.kind == "M": # type: ignore[union-attr] + if format.endswith("%z"): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( libstrings.count_characters(without_nat).distinct_count( @@ -5699,61 +5708,43 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") - casting_func = ( - str_cast.timestamp2int - if dtype.type == np.datetime64 - else str_cast.timedelta2int - ) + casting_func = str_cast.timestamp2int + add_back_nat = is_nat.any() + elif dtype.kind == "m": # type: ignore[union-attr] + casting_func = str_cast.timedelta2int + add_back_nat = False + result_col = casting_func(self, dtype, format) - if is_nat.any(): + if add_back_nat: result_col[is_nat] = None return result_col def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": - out_dtype = cudf.api.types.dtype(dtype) - - # infer on host from the first not na element - # or return all null column if all values - # are null in current column - if format is None: - if self.null_count == len(self): - return cast( - "cudf.core.column.DatetimeColumn", - column.column_empty( - len(self), dtype=out_dtype, masked=True - ), - ) - else: - format = datetime.infer_format( - self.apply_boolean_mask(self.notnull()).element_indexing(0) - ) - - if format.endswith("%z"): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - return self._as_datetime_or_timedelta_column(out_dtype, format) + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: + not_null = self.apply_boolean_mask(self.notnull()) + if len(not_null) == 0: + # We should hit the self.null_count == len(self) condition + # so format doesn't matter + format = "" + else: + # infer on host from the first not na element + format = datetime.infer_format(not_null.element_indexing(0)) + return self.strptime(dtype, format) # type: ignore[return-value] def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = cudf.api.types.dtype(dtype) - if format is None: - format = "%D days %H:%M:%S" - return self._as_datetime_or_timedelta_column(out_dtype, 
format) + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: + return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] def as_decimal_column( self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> StringColumn: + def as_string_column(self) -> StringColumn: return self @property diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 5a0171bbbdc..2cbed9212de 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -263,32 +263,26 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + def as_datetime_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a timedelta from {self.dtype} to {dtype}" ) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = "%D days %H:%M:%S" - if len(self) > 0: - return string._timedelta_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format=format) - else: + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: return cast( - "cudf.core.column.StringColumn", + cudf.core.column.StringColumn, column.column_empty(0, dtype="object", masked=False), ) + else: + return string._timedelta_to_str_typecast_functions[self.dtype]( + self, format=format + ) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> TimeDeltaColumn: - dtype = cudf.dtype(dtype) + def as_string_column(self) -> cudf.core.column.StringColumn: + return self.strftime("%D days %H:%M:%S") + + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4a60470fafa..8c8fa75918c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4731,9 +4731,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: f"for tracking purposes." 
) return self._return_result_like_self( - self.series._column.as_string_column( - dtype="str", format=date_format - ) + self.series._column.strftime(format=date_format) ) @copy_docstring(DatetimeIndex.tz_localize) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 397bfe1d472..064e8fc667d 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -216,25 +216,25 @@ def to_datetime( + arg[unit_rev["day"]].astype("str").str.zfill(2) ) format = "%Y-%m-%d" - col = new_series._column.as_datetime_column( - "datetime64[s]", format=format - ) - for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: arg_col = arg._data[value] - if arg_col.dtype.kind in ("f"): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + if arg_col.dtype.kind == "f": + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break - elif arg_col.dtype.kind in ("O"): + elif arg_col.dtype.kind == "O": if not cpp_is_integer(arg_col).all(): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break + else: + col = new_series._column.strptime( + cudf.dtype("datetime64[s]"), format=format + ) times_column = None for u in ["h", "m", "s", "ms", "us", "ns"]: @@ -334,15 +334,15 @@ def _process_col( col = ( col.astype("int") .astype("str") - .as_datetime_column( - dtype="datetime64[us]" + .strptime( + dtype=cudf.dtype("datetime64[us]") if "%f" in format - else "datetime64[s]", + else cudf.dtype("datetime64[s]"), format=format, ) ) else: - col = col.as_datetime_column(dtype="datetime64[ns]") + col = col.astype(dtype="datetime64[ns]") elif col.dtype.kind in "iu": if unit in ("D", "h", "m"): @@ -353,11 +353,11 @@ def _process_col( col = col * factor if format is not None: - col = col.astype("str").as_datetime_column( - dtype=_unit_dtype_map[unit], format=format + col = col.astype("str").strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format ) else: - col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) + col = col.astype(dtype=cudf.dtype(_unit_dtype_map[unit])) elif col.dtype.kind == "O": if unit not in (None, "ns") or col.null_count == len(col): @@ -384,8 +384,8 @@ def _process_col( element=col.element_indexing(0), dayfirst=dayfirst, ) - col = col.as_datetime_column( - dtype=_unit_dtype_map[unit], + col = col.strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format, ) elif col.dtype.kind != "M": From adee00aca95a749ecdf86975ae6d5b7fa1c01733 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:46:08 -0700 Subject: [PATCH 044/101] remove `cuco_noexcept.diff` (#16254) This PR removes the cuDF `cuco_noexcept.diff` patch since it no longer applies after https://github.com/rapidsai/rapids-cmake/pull/628. 
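For reference, the deleted hook applied the local patch through rapids-cmake's package-override mechanism, roughly as follows (a sketch assembled from the removed lines in the diff below):

```cmake
# Pattern being removed: register a JSON override so that the cuco checkout
# has local patches applied before rapids_cpm_cuco() fetches and configures it.
include(${rapids-cmake-dir}/cpm/package_override.cmake)
set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json")
```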
Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16254 --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 - .../thirdparty/patches/cuco_noexcept.diff | 227 ------------------ .../thirdparty/patches/cuco_override.json | 14 -- 3 files changed, 245 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/cuco_noexcept.diff delete mode 100644 cpp/cmake/thirdparty/patches/cuco_override.json diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 6ec35ddcaf1..fb82b0f5ff3 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -15,10 +15,6 @@ # This function finds cuCollections and performs any additional configuration. function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff deleted file mode 100644 index 0f334c0e81f..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff +++ /dev/null @@ -1,227 +0,0 @@ -diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh -index 7f9de01..5228193 100644 ---- a/include/cuco/aow_storage.cuh -+++ b/include/cuco/aow_storage.cuh -@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { - * @param size Number of windows to (de)allocate - * @param allocator Allocator used for (de)allocating device storage - */ -- explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; -+ explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}); - - aow_storage(aow_storage&&) = default; ///< Move constructor - /** -@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { - * @param key Key to which all keys in `slots` are initialized - * @param stream Stream used for executing the kernel - */ -- void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; -+ void initialize(value_type key, cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. 
-diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -index c2c9c14..8ac4236 100644 ---- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh -+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -@@ -125,7 +125,7 @@ class open_addressing_impl { - KeyEqual const& pred, - ProbingScheme const& probing_scheme, - Allocator const& alloc, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - : empty_slot_sentinel_{empty_slot_sentinel}, - erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, - predicate_{pred}, -@@ -233,7 +233,7 @@ class open_addressing_impl { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } -+ void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -599,7 +599,7 @@ class open_addressing_impl { - * - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept -+ [[nodiscard]] size_type size(cuda_stream_ref stream) const - { - auto counter = - detail::counter_storage{this->allocator()}; -diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl -index e17a145..3fa1d02 100644 ---- a/include/cuco/detail/static_map/static_map.inl -+++ b/include/cuco/detail/static_map/static_map.inl -@@ -123,7 +123,7 @@ template - void static_map::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -215,7 +215,7 @@ template - template - void static_map:: -- insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept -+ insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) - { - return this->insert_or_assign_async(first, last, stream); - stream.synchronize(); -@@ -465,7 +465,7 @@ template - static_map::size_type - static_map::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl -index 174f9bc..582926b 100644 ---- a/include/cuco/detail/static_multiset/static_multiset.inl -+++ b/include/cuco/detail/static_multiset/static_multiset.inl -@@ -97,7 +97,7 @@ template - void static_multiset::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -183,7 +183,7 @@ template - static_multiset::size_type - static_multiset::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl -index 645013f..d3cece0 100644 ---- a/include/cuco/detail/static_set/static_set.inl -+++ b/include/cuco/detail/static_set/static_set.inl -@@ -98,7 +98,7 @@ template - void static_set::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -429,7 +429,7 @@ template - static_set::size_type - static_set::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/storage/aow_storage.inl 
b/include/cuco/detail/storage/aow_storage.inl -index 3547f4c..94b7f98 100644 ---- a/include/cuco/detail/storage/aow_storage.inl -+++ b/include/cuco/detail/storage/aow_storage.inl -@@ -32,8 +32,8 @@ - namespace cuco { - - template --constexpr aow_storage::aow_storage( -- Extent size, Allocator const& allocator) noexcept -+constexpr aow_storage::aow_storage(Extent size, -+ Allocator const& allocator) - : detail::aow_storage_base{size}, - allocator_{allocator}, - window_deleter_{capacity(), allocator_}, -@@ -64,7 +64,7 @@ aow_storage::ref() const noexcept - - template - void aow_storage::initialize(value_type key, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - this->initialize_async(key, stream); - stream.synchronize(); -diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh -index c86e90c..95da423 100644 ---- a/include/cuco/static_map.cuh -+++ b/include/cuco/static_map.cuh -@@ -269,7 +269,7 @@ class static_map { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -387,7 +387,7 @@ class static_map { - * @param stream CUDA stream used for insert - */ - template -- void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; -+ void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); - - /** - * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` -@@ -690,7 +690,7 @@ class static_map { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash map can hold. -diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh -index 0daf103..fbcbc9c 100644 ---- a/include/cuco/static_multiset.cuh -+++ b/include/cuco/static_multiset.cuh -@@ -235,7 +235,7 @@ class static_multiset { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -339,7 +339,7 @@ class static_multiset { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the multiset can hold. -diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh -index a069939..3517f84 100644 ---- a/include/cuco/static_set.cuh -+++ b/include/cuco/static_set.cuh -@@ -240,7 +240,7 @@ class static_set { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. 
After this call, `size()` returns -@@ -687,7 +687,7 @@ class static_set { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash set can hold. diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json deleted file mode 100644 index ae0a9a4b4f0..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_override.json +++ /dev/null @@ -1,14 +0,0 @@ - -{ - "packages" : { - "cuco" : { - "patches" : [ - { - "file" : "${current_json_dir}/cuco_noexcept.diff", - "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]", - "fixed_in" : "" - } - ] - } - } -} From cd2d53b23fb37b8f68fe59571454f7b95ff98e2f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Jul 2024 21:25:35 +0100 Subject: [PATCH 045/101] Expose reflection to check if casting between two types is supported (#16239) In cudf-polars we need to check if a cast between two datatypes is supported (and fallback, or generate different code if not). Let's ask libcudf to be the source of truth for when a cast is supported. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Matthew Roeschke (https://github.com/mroeschke) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16239 --- cpp/include/cudf/unary.hpp | 10 +++++++++ cpp/src/unary/cast_ops.cu | 16 ++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/unary.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/unary.pxd | 4 ++++ python/cudf/cudf/_lib/pylibcudf/unary.pyx | 21 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_unary.py | 19 +++++++++++++++++ 6 files changed, 72 insertions(+) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_unary.py diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 8a515335351..1609c72f175 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -211,6 +211,16 @@ std::unique_ptr cast( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Check if a cast between two datatypes is supported. + * + * @param from source type + * @param to target type + * + * @returns true if the cast is supported. + */ +bool is_supported_cast(data_type from, data_type to) noexcept; + /** * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values * in a column of floating point values. 
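As a quick illustration of the caller-side usage this patch enables — a minimal sketch that mirrors the `test_unary.py` test added at the end of this patch, so the `plc.unary.is_supported_cast` binding and the `TypeId` pairs below are taken from the patch itself rather than assumed:

```python
# Sketch: query cast support up front instead of attempting the cast and
# failing at runtime. Binding and type ids as added/used in this patch.
from cudf._lib import pylibcudf as plc

# Numeric-to-numeric casts are supported ...
assert plc.unary.is_supported_cast(
    plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT64)
)
# ... but INT32 -> STRING is not, so a caller such as cudf-polars can
# fall back or generate different code for that case.
assert not plc.unary.is_supported_cast(
    plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.STRING)
)
```

The C++ entry point declared above is the single source of truth; the Python binding added later in this patch is a thin wrapper over it.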
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 64427326d87..ec21813705a 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,14 @@ std::unique_ptr cast(column_view const& input, return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr); } +struct is_supported_cast_impl { + template + bool operator()() const + { + return is_supported_cast(); + } +}; + } // namespace detail std::unique_ptr cast(column_view const& input, @@ -470,4 +479,11 @@ std::unique_ptr cast(column_view const& input, return detail::cast(input, type, stream, mr); } +bool is_supported_cast(data_type from, data_type to) noexcept +{ + // No matching detail API call/nvtx annotation, since this doesn't + // launch a kernel. + return double_type_dispatcher(from, to, detail::is_supported_cast_impl{}); +} + } // namespace cudf diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd index 7f8ae2b7617..2a1b189af51 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column @@ -43,5 +44,6 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: cdef extern unique_ptr[column] cast( column_view input, data_type out_type) except + + cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept cdef extern unique_ptr[column] is_nan(column_view input) except + cdef extern unique_ptr[column] is_not_nan(column_view input) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/unary.pxd index 4aa4543bb80..d07df838172 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator from .column cimport Column @@ -17,3 +19,5 @@ cpdef Column cast(Column input, DataType data_type) cpdef Column is_nan(Column input) cpdef Column is_not_nan(Column input) + +cpdef bool is_supported_cast(DataType from_, DataType to) diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/unary.pyx index 0879b501a49..8da46f0a832 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -154,3 +155,23 @@ cpdef Column is_not_nan(Column input): result = move(cpp_unary.is_not_nan(input.view())) return Column.from_libcudf(move(result)) + +cpdef bool is_supported_cast(DataType from_, DataType to): + """Check if a cast between datatypes is supported. + + For details, see :cpp:func:`is_supported_cast`. + + Parameters + ---------- + from_ + The source datatype + to + The target datatype + + Returns + ------- + bool + True if the cast is supported. 
+ """ + with nogil: + return cpp_unary.is_supported_cast(from_.c_obj, to.c_obj) diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/cudf/cudf/pylibcudf_tests/test_unary.py new file mode 100644 index 00000000000..b5e4f0cb0e8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_unary.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib import pylibcudf as plc + + +def test_is_supported_cast(): + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT64) + ) + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.DURATION_MILLISECONDS), + plc.DataType(plc.TypeId.UINT64), + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.TIMESTAMP_DAYS) + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.STRING) + ) From dddeb120d0cf8fc33f7f1a07149221fdb2a29e7a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 11 Jul 2024 16:25:12 -0700 Subject: [PATCH 046/101] Fix ArrowDeviceArray interface to pass address of event (#16058) the `sync_event` member of `ArrowDeviceArray` needs to be a pointer to a `cudaEvent_t`, currently we're returning the `cudaEvent_t` directly. We need to be passing the address of the event. Thankfully this is a single line change, plus adding a test to confirm. Authors: - Matt Topol (https://github.com/zeroshade) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16058 --- cpp/src/interop/to_arrow_device.cu | 2 +- cpp/tests/interop/to_arrow_device_test.cpp | 26 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index ebfd6605977..b9d3a59e647 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -603,7 +603,7 @@ unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out, }); result->device_id = rmm::get_current_cuda_device().value(); result->device_type = ARROW_DEVICE_CUDA; - result->sync_event = private_data->sync_event; + result->sync_event = &private_data->sync_event; result->array = private_data->parent; // makes a shallow copy result->array.private_data = private_data.release(); result->array.release = &detail::ArrowDeviceArrayRelease; diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 860544b8606..8903f09b82b 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -352,11 +352,15 @@ TEST_F(ToArrowDeviceTest, EmptyTable) auto got_arrow_device = cudf::to_arrow_device(table->view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); got_arrow_device = cudf::to_arrow_device(std::move(*table)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); } @@ -386,6 +390,8 @@ TEST_F(ToArrowDeviceTest, 
DateTimeTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -402,6 +408,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -456,6 +464,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -472,6 +482,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -538,6 +550,8 @@ TEST_F(ToArrowDeviceTest, NestedList) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); @@ -682,11 +696,15 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } @@ -755,11 +773,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); 
compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } @@ -802,11 +824,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } From 30e3209894d78fe7d5927cde62b6c5975257958a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jul 2024 14:59:27 +0100 Subject: [PATCH 047/101] Assert valid metadata is passed in to_arrow for list_view (#16198) When converting a list column to arrow with metadata, one must provide metadata information for both the offset and value columns, or none at all. This is not completely obvious (perhaps we only need the metadata for the inner value column), so explicitly assert this case. - Closes #16069 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16198 --- cpp/src/interop/to_arrow.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 62b85891adb..8c4be1b50a5 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -365,6 +365,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(metadata.children_meta.empty() || + metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children do not match\n"); std::unique_ptr tmp_column = nullptr; if ((input.offset() != 0) or ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) { From 1ff74612a17336131ef2d1b00f83be177e1af128 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 12 Jul 2024 08:12:18 -0700 Subject: [PATCH 048/101] Improve the test data for pylibcudf I/O tests (#16247) Don't just use random integers for every data type. Decided not to use hypothesis since I don't think there's a good way to re-use the table across calls (and I would like to keep the runtime of pylibcudf tests down). 
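In outline, the new approach generates values tailored to each dtype instead of casting the same random integers everywhere; the following is a condensed, hedged sketch of the `_get_vals_of_type` helper added in the conftest.py diff below (the column names and the string length here are illustrative):

```python
# Sketch of dtype-aware test data generation, seeded so the session-scoped
# table is reproducible without hypothesis-style regeneration.
import numpy as np
import pyarrow as pa

rng = np.random.default_rng(seed=42)
length = 100
half = length // 2

# Signed ints: mix negative and non-negative values.
int_vals = np.concatenate(
    [
        rng.integers(-length, 0, half, dtype=np.int64),
        rng.integers(0, length, length - half, dtype=np.int64),
    ]
)
# Floats: round to 6 decimal places so comparisons against pandas don't
# trip over floating point/rounding differences in text formats.
float_vals = rng.uniform(-length, length, length).round(6)
# Strings: random printable ASCII (8 chars each here, for brevity).
str_vals = [
    "".join(chr(c) for c in rng.integers(33, 128, 8)) for _ in range(length)
]

table = pa.table(
    {"col_int64": int_vals, "col_float64": float_vals, "col_string": str_vals}
)
```

Because the seed is fixed, the same table can be reused across the test session, which keeps the runtime down.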
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16247 --- .../cudf/cudf/pylibcudf_tests/common/utils.py | 40 +++++++++ python/cudf/cudf/pylibcudf_tests/conftest.py | 70 ++++++++++++++- .../cudf/cudf/pylibcudf_tests/io/test_json.py | 85 ++++--------------- 3 files changed, 124 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 46603ff32b8..efb192b3251 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -174,6 +174,21 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 +def write_source_str(source, input_str): + """ + Write a string to the source + (useful for testing CSV/JSON I/O) + """ + if not isinstance(source, io.IOBase): + with open(source, "w") as source_f: + source_f.write(input_str) + else: + if isinstance(source, io.BytesIO): + input_str = input_str.encode("utf-8") + source.write(input_str) + source.seek(0) + + def sink_to_str(sink): """ Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) @@ -192,6 +207,31 @@ def sink_to_str(sink): return str_result +def make_source(path_or_buf, pa_table, format, **kwargs): + """ + Write a pyarrow Table to a specific format using pandas + by dispatching to the appropriate to_* call. + The caller is responsible for making sure that no arguments + unsupported by pandas are passed in. + """ + df = pa_table.to_pandas() + mode = "w" + if "compression" in kwargs: + kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ + kwargs["compression"] + ] + if kwargs["compression"] is not None and format != "json": + # pandas json method only supports mode="w"/"a" + mode = "wb" + if format == "json": + df.to_json(path_or_buf, mode=mode, **kwargs) + elif format == "csv": + df.to_csv(path_or_buf, mode=mode, **kwargs) + if isinstance(path_or_buf, io.IOBase): + path_or_buf.seek(0) + return path_or_buf + + NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] STRING_PA_TYPES = [pa.string()] BOOL_PA_TYPES = [pa.bool_()] diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 39832eb4bba..3ef1e40b630 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -11,6 +11,7 @@ import pytest import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) @@ -37,6 +38,37 @@ def numeric_pa_type(request): return request.param +def _get_vals_of_type(pa_type, length, seed): + """ + Returns an list-like of random values of that type + """ + rng = np.random.default_rng(seed=seed) + if pa_type == pa.int64(): + half = length // 2 + negs = rng.integers(-length, 0, half, dtype=np.int64) + pos = rng.integers(0, length, length - half, dtype=np.int64) + return np.concatenate([negs, pos]) + elif pa_type == pa.uint64(): + return rng.integers(0, length, length, dtype=np.uint64) + elif pa_type == pa.float64(): + # Round to 6 decimal places or else we have problems comparing our + # output to pandas due to floating point/rounding differences + return rng.uniform(-length, length, length).round(6) + elif pa_type == pa.bool_(): + return rng.integers(0, 2, length, dtype=bool) + elif pa_type == pa.string(): + # Generate random ASCII strings + strs = [] + for _ in range(length): + chrs = rng.integers(33, 
128, length) + strs.append("".join(chr(x) for x in chrs)) + return strs + else: + raise NotImplementedError( + f"random data generation not implemented for {pa_type}" + ) + + # TODO: Consider adding another fixture/adapting this # fixture to consider nullability @pytest.fixture(scope="session", params=[0, 100]) @@ -57,10 +89,9 @@ def table_data(request): # plc.io.TableWithMetadata colnames = [] - np.random.seed(42) + seed = 42 for typ in ALL_PA_TYPES: - rand_vals = np.random.randint(0, nrows, nrows) child_colnames = [] def _generate_nested_data(typ): @@ -88,13 +119,17 @@ def _generate_nested_data(typ): child_colnames.append(("", grandchild_colnames)) else: # typ is scalar type - pa_array = pa.array(rand_vals).cast(typ) + pa_array = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) return pa_array, child_colnames if isinstance(typ, (pa.ListType, pa.StructType)): rand_arr, child_colnames = _generate_nested_data(typ) else: - rand_arr = pa.array(rand_vals).cast(typ) + rand_arr = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) table_dict[f"col_{typ}"] = rand_arr colnames.append((f"col_{typ}", child_colnames)) @@ -121,6 +156,33 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +unsupported_types = { + # Not supported by pandas + # TODO: find a way to test these + CompressionType.SNAPPY, + CompressionType.BROTLI, + CompressionType.LZ4, + CompressionType.LZO, + CompressionType.ZLIB, +} + +unsupported_text_compression_types = unsupported_types.union( + { + # compressions not supported by libcudf + # for csv/json + CompressionType.XZ, + CompressionType.ZSTD, + } +) + + +@pytest.fixture( + params=set(CompressionType).difference(unsupported_text_compression_types) +) +def text_compression_type(request): + return request.param + + @pytest.fixture(params=[opt for opt in plc.io.types.CompressionType]) def compression_type(request): return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index c13eaf40625..4239f2438bb 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -5,45 +5,17 @@ import pyarrow as pa import pytest from utils import ( - COMPRESSION_TYPE_TO_PANDAS, assert_table_and_meta_eq, + make_source, sink_to_str, + write_source_str, ) import cudf._lib.pylibcudf as plc from cudf._lib.pylibcudf.io.types import CompressionType - -def make_json_source(path_or_buf, pa_table, **kwargs): - """ - Uses pandas to write a pyarrow Table to a JSON file. - - The caller is responsible for making sure that no arguments - unsupported by pandas are passed in. 
- """ - df = pa_table.to_pandas() - if "compression" in kwargs: - kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ - kwargs["compression"] - ] - df.to_json(path_or_buf, orient="records", **kwargs) - if isinstance(path_or_buf, io.IOBase): - path_or_buf.seek(0) - return path_or_buf - - -def write_json_bytes(source, json_str): - """ - Write a JSON string to the source - """ - if not isinstance(source, io.IOBase): - with open(source, "w") as source_f: - source_f.write(json_str) - else: - if isinstance(source, io.BytesIO): - json_str = json_str.encode("utf-8") - source.write(json_str) - source.seek(0) +# Shared kwargs to pass to make_source +_COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} @pytest.mark.parametrize("rows_per_chunk", [8, 100]) @@ -156,21 +128,9 @@ def test_write_json_bool_opts(true_value, false_value): @pytest.mark.parametrize("lines", [True, False]) def test_read_json_basic( - table_data, source_or_sink, lines, compression_type, request + table_data, source_or_sink, lines, text_compression_type ): - if compression_type in { - # Not supported by libcudf - CompressionType.SNAPPY, - CompressionType.XZ, - CompressionType.ZSTD, - # Not supported by pandas - # TODO: find a way to test these - CompressionType.BROTLI, - CompressionType.LZ4, - CompressionType.LZO, - CompressionType.ZLIB, - }: - pytest.skip("unsupported compression type by pandas/libcudf") + compression_type = text_compression_type # can't compress non-binary data with pandas if isinstance(source_or_sink, io.StringIO): @@ -178,22 +138,12 @@ def test_read_json_basic( _, pa_table = table_data - source = make_json_source( - source_or_sink, pa_table, lines=lines, compression=compression_type - ) - - request.applymarker( - pytest.mark.xfail( - condition=( - len(pa_table) > 0 - and compression_type - not in {CompressionType.NONE, CompressionType.AUTO} - ), - # note: wasn't able to narrow down the specific types that were failing - # seems to be a little non-deterministic, but always fails with - # cudaErrorInvalidValue invalid argument - reason="libcudf json reader crashes on compressed non empty table_data", - ) + source = make_source( + source_or_sink, + pa_table, + lines=lines, + compression=compression_type, + **_COMMON_JSON_SOURCE_KWARGS, ) if isinstance(source, io.IOBase): @@ -237,10 +187,11 @@ def test_read_json_dtypes(table_data, source_or_sink): # Simple test for dtypes where we read in # all numeric data as floats _, pa_table = table_data - source = make_json_source( + source = make_source( source_or_sink, pa_table, lines=True, + **_COMMON_JSON_SOURCE_KWARGS, ) dtypes = [] @@ -295,7 +246,7 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size): pytest.skip("byte_range doesn't work on StringIO") json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" - write_json_bytes(source, json_str) + write_source_str(source, json_str) tbls_w_meta = [] for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): @@ -331,7 +282,7 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): source = source_or_sink json_bytes = '["a", "b", "c"]\n' - write_json_bytes(source, json_bytes) + write_source_str(source, json_bytes) tbl_w_meta = plc.io.json.read_json( plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes @@ -359,8 +310,8 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): source = source_or_sink - json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' - 
write_json_bytes(source, json_bytes) + json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + write_source_str(source, json_str) if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: with pytest.raises(RuntimeError): From 4fc8e790bf0671bba85a94e29deb4f3bc511a416 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jul 2024 17:08:55 +0100 Subject: [PATCH 049/101] Handle nans in groupby-aggregations in polars executor (#16233) Polars `min` and `max` by default ignore nans (treating them as nulls), to mimic this behaviour we must mask out nans before performing a min/max aggregation. Do this by exposing `nans_to_nulls` in pylibcudf and implementing a `with_mask` method on pylibcudf Columns. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16233 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/transform.rst | 6 +++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/column.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 32 +++++++++++-- .../cudf/_lib/pylibcudf/gpumemoryview.pyx | 1 + python/cudf/cudf/_lib/pylibcudf/transform.pxd | 7 +++ python/cudf/cudf/_lib/pylibcudf/transform.pyx | 35 +++++++++++++++ python/cudf/cudf/_lib/transform.pyx | 17 +++---- python/cudf/cudf/pylibcudf_tests/conftest.py | 11 ++++- .../cudf/pylibcudf_tests/test_transform.py | 32 +++++++++++++ .../cudf_polars/containers/column.py | 45 +++++++++++++------ python/cudf_polars/cudf_polars/dsl/expr.py | 12 ++++- .../tests/containers/test_column.py | 20 ++++++--- .../cudf_polars/tests/expressions/test_agg.py | 25 ++++++++--- python/cudf_polars/tests/test_groupby.py | 24 ++++++++++ 18 files changed, 230 insertions(+), 44 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_transform.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index bd6f0f77357..5899d272160 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -34,6 +34,7 @@ This page provides API documentation for pylibcudf. stream_compaction table traits + transform types unary diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst new file mode 100644 index 00000000000..ef04bbad7e6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst @@ -0,0 +1,6 @@ +========= +transform +========= + +.. 
automodule:: cudf._lib.pylibcudf.transform + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index d22096081af..a2d11bbea6e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources sorting.pyx table.pyx traits.pyx + transform.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index d4d615cde34..da2b7806203 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -24,6 +24,7 @@ from . cimport ( stream_compaction, strings, traits, + transform, types, unary, ) @@ -63,6 +64,7 @@ __all__ = [ "strings", "sorting", "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 91f8acaf682..acbc84d7177 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -24,6 +24,7 @@ stream_compaction, strings, traits, + transform, types, unary, ) @@ -64,6 +65,7 @@ "strings", "sorting", "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index d13791d95cf..13ee0a70681 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -50,6 +50,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) + cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e0cf8b7ee32..cb96c1d9fce 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -175,6 +175,32 @@ cdef class Column: children, ) + cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): + """Augment this column with a new null mask. + + Parameters + ---------- + mask : gpumemoryview + New mask (or None to unset the mask) + null_count : int + New null count. If this is incorrect, bad things happen. + + Returns + ------- + New Column object sharing data with self (except for the mask which is new). + """ + if mask is None and null_count > 0: + raise ValueError("Empty mask must have null count of zero") + return Column( + self._data_type, + self._size, + self._data, + mask, + null_count, + self._offset, + self._children, + ) + @staticmethod cdef Column from_column_view(const column_view& cv, Column owner): """Create a Column from a libcudf column_view. @@ -250,7 +276,7 @@ cdef class Column: column is in use. """ data = gpumemoryview(obj) - iface = data.__cuda_array_interface__() + iface = data.__cuda_array_interface__ if iface.get('mask') is not None: raise ValueError("mask not yet supported.") @@ -400,8 +426,8 @@ def is_c_contiguous( itemsize : int Size of an element in bytes. - Return - ------ + Returns + ------- bool The boolean answer. 
""" diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index a2f5b2ac387..0904022a944 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -22,5 +22,6 @@ cdef class gpumemoryview: # TODO: Need to respect readonly self.ptr = cai["data"][0] + @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd new file mode 100644 index 00000000000..4b21feffe25 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx new file mode 100644 index 00000000000..a734e71b820 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move, pair + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): + """Create a null mask preserving existing nulls and converting nans to null. + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + Two-tuple of a gpumemoryview wrapping the null mask and the new null count. 
+ """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.nans_to_nulls(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index b325173f20d..86a4a60eef1 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -20,6 +20,7 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column from cudf._lib.expressions cimport Expression +from cudf._lib.pylibcudf cimport transform as plc_transform from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression @@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): @acquire_spill_lock() def nans_to_nulls(Column input): - cdef column_view c_input = input.view() - cdef pair[unique_ptr[device_buffer], size_type] c_output - cdef unique_ptr[device_buffer] c_buffer - - with nogil: - c_output = move(libcudf_transform.nans_to_nulls(c_input)) - c_buffer = move(c_output.first) - - if c_output.second == 0: - return None - - return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer))) + (mask, _) = plc_transform.nans_to_nulls( + input.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @acquire_spill_lock() diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 3ef1e40b630..53e207f29cb 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -203,6 +203,15 @@ def sorted_opt(request): return request.param -@pytest.fixture(scope="session", params=[False, True]) +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nulls", "with_nulls"] +) def has_nulls(request): return request.param + + +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nans", "with_nans"] +) +def has_nans(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py new file mode 100644 index 00000000000..312939888dd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_transform.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import math + +import pyarrow as pa +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_nans_to_nulls(has_nans): + if has_nans: + values = [1, float("nan"), float("nan"), None, 3, None] + else: + values = [1, 4, 5, None, 3, None] + + replaced = [ + None if (v is None or (v is not None and math.isnan(v))) else v + for v in values + ] + + h_input = pa.array(values, type=pa.float32()) + input = plc.interop.from_arrow(h_input) + assert input.null_count() == h_input.null_count + expect = pa.array(replaced, type=pa.float32()) + + mask, null_count = plc.transform.nans_to_nulls(input) + + assert null_count == expect.null_count + got = input.with_mask(mask, null_count) + + assert_column_eq(expect, got) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 28685f0c4ed..af67059844e 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -128,24 +128,29 @@ def copy(self) -> Self: ) def mask_nans(self) -> Self: - """Return a copy of self with nans masked out.""" - if self.nan_count > 0: - raise NotImplementedError("Need to port transform.hpp to pylibcudf") + """Return a shallow copy of self with nans masked out.""" + if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count)) + if old_count == new_count: + return result.sorted_like(self) + return result return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" - if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - return 0 - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + if plc.traits.is_floating_point(self.obj.type()): + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() + return 0 class NamedColumn(Column): @@ -187,3 +192,17 @@ def copy(self, *, new_name: str | None = None) -> Self: order=self.order, null_order=self.null_order, ) + + def mask_nans(self) -> Self: + """Return a shallow copy of self with nans masked out.""" + # Annoying, the inheritance is not right (can't call the + # super-type mask_nans), but will sort that by refactoring + # later. 
+ if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count), self.name) + if old_count == new_count: + return result.sorted_like(self) + return result + return self.copy() diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f83d9e82d30..adf266bab81 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -867,7 +867,7 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("round", "unique"): + if self.name not in ("round", "unique", "mask_nans"): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -878,6 +878,9 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() if self.name == "round": (decimal_places,) = self.options (values,) = ( @@ -1215,12 +1218,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError( "Nested aggregations in groupby" ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests if self.request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) return AggInfo([(expr, self.request, self)]) def _reduce( diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 3291d8db161..4f3c0de5975 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,12 +3,14 @@ from __future__ import annotations +from functools import partial + import pyarrow import pytest import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column +from cudf_polars.containers import Column, NamedColumn def test_non_scalar_access_raises(): @@ -54,17 +56,21 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -def test_mask_nans(typeid): +@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) +def test_mask_nans(typeid, constructor): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = Column(plc.interop.from_arrow(values)) + column = constructor(plc.interop.from_arrow(values)) masked = column.mask_nans() - assert column.obj is masked.obj + assert column.obj.null_count() == masked.obj.null_count() -def test_mask_nans_float_with_nan_notimplemented(): +def test_mask_nans_float(): dtype = plc.DataType(plc.TypeId.FLOAT32) values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype)) column = Column(plc.interop.from_arrow(values)) - with pytest.raises(NotImplementedError): - _ = column.mask_nans() + masked = column.mask_nans() + expect = pyarrow.array([0, 0, None], 
type=plc.interop.to_arrow(dtype)) + got = pyarrow.array(plc.interop.to_arrow(masked.obj)) + + assert expect == got diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 267d0a99692..e53fd7f8615 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -59,14 +59,25 @@ def test_agg(df, agg): @pytest.mark.parametrize( - "propagate_nans", - [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True], - ids=["mask_nans", "propagate_nans"], + "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) -@pytest.mark.parametrize("op", ["min", "max"]) -def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) - op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) +def test_agg_float_with_nans(op): + df = pl.LazyFrame( + { + "a": pl.Series([1, 2, float("nan")], dtype=pl.Float64()), + "b": pl.Series([1, 2, None], dtype=pl.Int8()), + } + ) + q = df.select(op(pl.col("a")), op(pl.col("b"))) + + assert_gpu_result_equal(q) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +@pytest.mark.parametrize("op", [pl.Expr.max, pl.Expr.min]) +def test_agg_singleton(op): + df = pl.LazyFrame({"a": pl.Series([float("nan")])}) + q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b84e2c16b43..81306397b9f 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -99,3 +99,27 @@ def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +def test_groupby_minmax_with_nan(): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg( + pl.col("value").max().alias("max"), pl.col("value").min().alias("min") + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("op", [pl.Expr.nan_max, pl.Expr.nan_min]) +def test_groupby_nan_minmax_raises(op): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg(op(pl.col("value"))) + + assert_ir_translation_raises(q, NotImplementedError) From f79ca04fe792107a69b5ccf18f41d65c44957dbd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jul 2024 07:11:19 -1000 Subject: [PATCH 050/101] Add docstring for from_dataframe (#16260) xref https://github.com/rapidsai/cudf/issues/16238 Mainly direct users to use `from_pandas` instead of `from_dataframe` if the user has a `pandas.DataFrame` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16260 --- python/cudf/cudf/core/dataframe.py | 21 ++++++++++++++++++++- python/cudf/cudf/core/df_protocol.py | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3e5ff9c18b5..2be59f87483 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7849,7 +7849,26 @@ def value_counts( return result -def from_dataframe(df, 
allow_copy=False): +def from_dataframe(df, allow_copy: bool = False) -> DataFrame: + """ + Build a :class:`DataFrame` from an object supporting the dataframe interchange protocol. + + .. note:: + + If you have a ``pandas.DataFrame``, use :func:`from_pandas` instead. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. ``__dataframe__`` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + :class:`DataFrame` + """ return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9cd573aceb9..a70a42c04af 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -648,7 +648,7 @@ def __dataframe__( def from_dataframe( df: DataFrameObject, allow_copy: bool = False -) -> _CuDFDataFrame: +) -> cudf.DataFrame: """ Construct a ``DataFrame`` from ``df`` if it supports the dataframe interchange protocol (``__dataframe__``). From 1737e70a006740b157624599b86929c01940fa3c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jul 2024 19:33:00 +0100 Subject: [PATCH 051/101] Expose sorted groupby parameters to pylibcudf (#16240) And plumb through to cudf-polars. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16240 --- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 4 ++ python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 38 +++++++++++--- .../cudf_polars/containers/column.py | 41 ++++++++++++++- .../cudf_polars/containers/dataframe.py | 45 ++++++++++++++-- python/cudf_polars/cudf_polars/dsl/expr.py | 29 ++++++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 29 ++++------- .../tests/containers/test_dataframe.py | 51 +++++++++++++++++++ .../cudf_polars/tests/expressions/test_agg.py | 8 +-- .../tests/expressions/test_sort.py | 31 +++++++++++ python/cudf_polars/tests/test_groupby.py | 39 ++++++++++++-- 10 files changed, 274 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index c6c146b0445..eaa05c26986 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -16,6 +16,7 @@ from cudf._lib.pylibcudf.libcudf.groupby cimport ( scan_request, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table @@ -38,6 +39,9 @@ cdef class GroupByRequest: cdef class GroupBy: cdef unique_ptr[groupby] c_obj cdef Table _keys + cdef unique_ptr[vector[order]] _column_order + cdef unique_ptr[vector[null_order]] _null_precedence + cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 46fe61025ce..f5bb46ca6a2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference from libcpp.functional cimport reference_wrapper -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility 
cimport move from libcpp.vector cimport vector @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column from .table cimport Table -from .types cimport null_policy, sorted +from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector @@ -87,17 +87,43 @@ cdef class GroupBy: keys : Table The columns to group by. null_handling : null_policy, optional - Whether or not to include null rows in ``keys``. Default is null_policy.EXCLUDE. + Whether or not to include null rows in `keys`. + Default is ``null_policy.EXCLUDE``. keys_are_sorted : sorted, optional - Whether the keys are already sorted. Default is sorted.NO. + Whether the keys are already sorted. Default is ``sorted.NO``. + column_order : list[order] + Indicates the order of each column. Default is ``order.ASCENDING``. + Ignored if `keys_are_sorted` is ``sorted.NO``. + null_precedence : list[null_order] + Indicates the ordering of null values in each column. + Default is ``null_order.AFTER``. Ignored if `keys_are_sorted` is ``sorted.NO``. """ def __init__( self, Table keys, null_policy null_handling=null_policy.EXCLUDE, - sorted keys_are_sorted=sorted.NO + sorted keys_are_sorted=sorted.NO, + list column_order=None, + list null_precedence=None, ): - self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted)) + self._column_order = make_unique[vector[order]]() + self._null_precedence = make_unique[vector[null_order]]() + if column_order is not None: + for o in column_order: + dereference(self._column_order).push_back(o) + if null_precedence is not None: + for o in null_precedence: + dereference(self._null_precedence).push_back(o) + + self.c_obj.reset( + new groupby( + keys.view(), + null_handling, + keys_are_sorted, + dereference(self._column_order.get()), + dereference(self._null_precedence.get()), + ) + ) # keep a reference to the keys table so it doesn't get # deallocated from under us: self._keys = keys diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index af67059844e..42aba0fcdc0 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from typing_extensions import Self + import polars as pl + __all__: list[str] = ["Column", "NamedColumn"] @@ -76,12 +78,49 @@ def sorted_like(self, like: Column, /) -> Self: See Also -------- - set_sorted + set_sorted, copy_metadata """ return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + def copy_metadata(self, from_: pl.Series, /) -> Self: + """ + Copy metadata from a host series onto self. + + Parameters + ---------- + from_ + Polars series to copy metadata from + + Returns + ------- + Self with metadata set. 
+ + See Also + -------- + set_sorted, sorted_like + """ + if len(from_) <= 1: + return self + ascending = from_.flags["SORTED_ASC"] + descending = from_.flags["SORTED_DESC"] + if ascending or descending: + has_null_first = from_.item(0) is None + has_null_last = from_.item(-1) is None + order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if (descending and has_null_first) or (ascending and has_null_last): + null_order = plc.types.NullOrder.AFTER + return self.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + return self + def set_sorted( self, *, diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index d86656578d7..cbeadf1426a 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -9,16 +9,18 @@ from functools import cached_property from typing import TYPE_CHECKING, cast +import pyarrow as pa + import polars as pl import cudf._lib.pylibcudf as plc from cudf_polars.containers.column import NamedColumn +from cudf_polars.utils import dtypes if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set - import pyarrow as pa from typing_extensions import Self import cudf @@ -50,8 +52,16 @@ def to_polars(self) -> pl.DataFrame: self.table, [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) - - return cast(pl.DataFrame, pl.from_arrow(table)) + return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + *( + pl.col(c.name).set_sorted( + descending=c.order == plc.types.Order.DESCENDING + ) + if c.is_sorted + else pl.col(c.name) + for c in self.columns + ) + ) @cached_property def column_names_set(self) -> frozenset[str]: @@ -83,6 +93,35 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: ] ) + @classmethod + def from_polars(cls, df: pl.DataFrame) -> Self: + """ + Create from a polars dataframe. + + Parameters + ---------- + df + Polars dataframe to convert + + Returns + ------- + New dataframe representing the input. + """ + table = df.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + schema = schema.set( + i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) + ) + # No-op if the schema is unchanged. 
+ d_table = plc.interop.from_arrow(table.cast(schema)) + return cls( + [ + NamedColumn(column, h_col.name).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns()) + ] + ) + @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """ diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index adf266bab81..f37cb3f475c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -867,7 +867,7 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("round", "unique", "mask_nans"): + if self.name not in ("mask_nans", "round", "setsorted", "unique"): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -926,6 +926,33 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) + elif self.name == "setsorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! + has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b32fa9c273e..5e6544ef77c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -30,7 +30,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import dtypes, sorting +from cudf_polars.utils import sorting if TYPE_CHECKING: from collections.abc import MutableMapping @@ -385,17 +385,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - table = pdf.to_arrow() - schema = table.schema - for i, field in enumerate(schema): - schema = schema.set( - i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) - ) - # No-op if the schema is unchanged. 
- table = table.cast(schema) - df = DataFrame.from_table( - plc.interop.from_arrow(table), list(self.schema.keys()) - ) + df = DataFrame.from_polars(pdf) assert all( c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) ) @@ -542,16 +532,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) - # TODO: use sorted information, need to expose column_order - # and null_precedence in pylibcudf groupby constructor - # sorted = ( - # plc.types.Sorted.YES - # if all(k.is_sorted for k in keys) - # else plc.types.Sorted.NO - # ) + sorted = ( + plc.types.Sorted.YES + if all(k.is_sorted for k in keys) + else plc.types.Sorted.NO + ) grouper = plc.groupby.GroupBy( plc.Table([k.obj for k in keys]), null_handling=plc.types.NullPolicy.INCLUDE, + keys_are_sorted=sorted, + column_order=[k.order for k in keys], + null_precedence=[k.null_order for k in keys], ) # TODO: uniquify requests = [] diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 2e385e39eef..87508e17407 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -5,6 +5,8 @@ import pytest +import polars as pl + import cudf._lib.pylibcudf as plc from cudf_polars.containers import DataFrame, NamedColumn @@ -90,3 +92,52 @@ def test_shallow_copy(): ) assert df.columns[0].is_sorted == plc.types.Sorted.YES assert copy.columns[0].is_sorted == plc.types.Sorted.NO + + +def test_sorted_flags_preserved_empty(): + df = pl.DataFrame({"a": pl.Series([], dtype=pl.Int8())}) + df.select(pl.col("a").sort()) + + gf = DataFrame.from_polars(df) + + (a,) = gf.columns + + assert a.is_sorted == plc.types.Sorted.YES + + assert df.flags == gf.to_polars().flags + + +@pytest.mark.parametrize("nulls_last", [True, False]) +def test_sorted_flags_preserved(with_nulls, nulls_last): + values = [1, 2, -1, 2, 4, 5] + if with_nulls: + values[4] = None + df = pl.DataFrame({"a": values, "b": values, "c": values}) + + df = df.select( + pl.col("a").sort(descending=False, nulls_last=nulls_last), + pl.col("b").sort(descending=True, nulls_last=nulls_last), + pl.col("c"), + ) + + gf = DataFrame.from_polars(df) + + a_null_order = ( + plc.types.NullOrder.AFTER + if nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + b_null_order = ( + plc.types.NullOrder.AFTER + if not nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + a, b, c = gf.columns + assert a.is_sorted == plc.types.Sorted.YES + assert a.order == plc.types.Order.ASCENDING + assert a.null_order == a_null_order + assert b.is_sorted == plc.types.Sorted.YES + assert b.order == plc.types.Order.DESCENDING + assert b.null_order == b_null_order + assert c.is_sorted == plc.types.Sorted.NO + assert df.flags == gf.to_polars().flags diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index e53fd7f8615..245bde3acab 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,13 +20,7 @@ def dtype(request): return request.param -@pytest.fixture( - params=[ - False, - pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), - ], - ids=["unsorted", "sorted"], -) +@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) def is_sorted(request): return request.param diff --git 
a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 0195266f5c6..d46df92db94 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -8,6 +8,9 @@ import polars as pl +import cudf._lib.pylibcudf as plc + +from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -51,3 +54,31 @@ def test_sort_by_expression(descending, nulls_last, maintain_order): ) ) assert_gpu_result_equal(query, check_row_order=maintain_order) + + +@pytest.mark.parametrize("descending", [False, True]) +@pytest.mark.parametrize("nulls_last", [False, True]) +def test_setsorted(descending, nulls_last, with_nulls): + values = sorted([1, 2, 3, 4, 5, 6, -2], reverse=descending) + if with_nulls: + values[-1 if nulls_last else 0] = None + df = pl.LazyFrame({"a": values}) + + q = df.set_sorted("a", descending=descending) + + assert_gpu_result_equal(q) + + df = translate_ir(q._ldf.visit()).evaluate(cache={}) + + (a,) = df.columns + + assert a.is_sorted == plc.types.Sorted.YES + null_order = ( + plc.types.NullOrder.AFTER + if (descending ^ nulls_last) and with_nulls + else plc.types.NullOrder.BEFORE + ) + assert a.null_order == null_order + assert a.order == ( + plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING + ) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 81306397b9f..50adca01950 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import itertools + import pytest import polars as pl @@ -26,12 +28,12 @@ def df(): @pytest.fixture( params=[ - ["key1"], - ["key2"], + [pl.col("key1")], + [pl.col("key2")], [pl.col("key1") * pl.col("key2")], - ["key1", "key2"], + [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], - ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)], + [pl.col("key2"), pl.col("key1") == pl.lit(1, dtype=pl.Int64)], ], ids=lambda keys: "-".join(map(str, keys)), ) @@ -82,6 +84,35 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): assert_gpu_result_equal(q, check_exact=False) +def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): + sorted_keys = [ + key.sort(descending=descending) + for key, descending in zip(keys, itertools.cycle([False, True])) + ] + + q = df.group_by(*sorted_keys).agg(*exprs) + + schema = q.collect_schema() + sort_keys = list(schema.keys())[: len(keys)] + # Multiple keys don't do sorting + qsorted = q.sort(*sort_keys) + if len(keys) > 1: + with pytest.raises(AssertionError): + # https://github.com/pola-rs/polars/issues/17556 + assert_gpu_result_equal(q, check_exact=False) + if schema[sort_keys[1]] == pl.Boolean(): + # https://github.com/pola-rs/polars/issues/17557 + with pytest.raises(AssertionError): + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(qsorted, check_exact=False) + elif schema[sort_keys[0]] == pl.Boolean(): + # Boolean keys don't do sorting, so we get random order + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(q, check_exact=False) + + def test_groupby_len(df, keys): q = df.group_by(*keys).agg(pl.len()) From 1cbd9eb327f4290ef402234f0cb65b93df01ba0a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:01:24 
-0400
Subject: [PATCH 052/101] Update contains_tests.cpp to use public cudf::slice (#16253)

Changes `cpp/tests/lists/contains_tests.cpp` to use the public
`cudf::slice()` API instead of the internal `cudf::detail::slice()`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16253
---
 cpp/tests/lists/contains_tests.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 718ee83cf09..8fb2b403051 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -224,9 +224,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists)
   {
     // First Slice.
-    auto sliced_column_1 =
-      cudf::detail::slice(search_space, {1, 8}, cudf::get_default_stream()).front();
-    auto search_key_one = create_scalar_search_key<TypeParam>(1);
+    auto sliced_column_1 = cudf::slice(search_space, {1, 8}, cudf::get_default_stream()).front();
+    auto search_key_one  = create_scalar_search_key<TypeParam>(1);
     {
       // CONTAINS
       auto result = cudf::lists::contains(sliced_column_1, *search_key_one);
@@ -257,9 +256,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists)
   {
     // Second Slice.
-    auto sliced_column_2 =
-      cudf::detail::slice(search_space, {3, 10}, cudf::get_default_stream()).front();
-    auto search_key_one = create_scalar_search_key<TypeParam>(1);
+    auto sliced_column_2 = cudf::slice(search_space, {3, 10}, cudf::get_default_stream()).front();
+    auto search_key_one  = create_scalar_search_key<TypeParam>(1);
     {
       // CONTAINS
       auto result = cudf::lists::contains(sliced_column_2, *search_key_one);

From 99ad73d5c1b95374255a7abb20911a26eb292fa5 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Fri, 12 Jul 2024 12:38:37 -0700
Subject: [PATCH 053/101] Remove temporary functor overloads required by cuco
 version bump (#16242)

This is a follow-up of #15938. It removes the temporary functor overloads
that are no longer needed after the cuco version bump.
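For context, the shape that remains after this cleanup can be sketched
host-side as below (plain C++, not the actual CUDA device code; `row_equal`
is a hypothetical stand-in for libcudf's row comparators). Only one operand
order per mixed-type comparison is kept; the mirrored overloads deleted by
this patch existed solely to satisfy the older cuco interface and were never
invoked, as the removed "declared but never referenced" warning suppressions
in the diff indicate:

    #include <cstdint>
    #include <iostream>

    // Hypothetical strong index types mirroring lhs_index_type/rhs_index_type.
    enum class lhs_index_type : std::int32_t {};
    enum class rhs_index_type : std::int32_t {};

    // Stand-in for the device-side row comparator.
    struct row_equal {
      bool operator()(std::int32_t lhs, std::int32_t rhs) const { return lhs == rhs; }
    };

    template <typename Equal>
    struct comparator_adapter {
      explicit comparator_adapter(Equal const& equal) : _equal{equal} {}

      // Same-side comparison: kept.
      bool operator()(rhs_index_type lhs, rhs_index_type rhs) const
      {
        return _equal(static_cast<std::int32_t>(lhs), static_cast<std::int32_t>(rhs));
      }

      // Mixed comparison: a single operand order now suffices.
      bool operator()(lhs_index_type lhs, rhs_index_type rhs) const
      {
        return _equal(static_cast<std::int32_t>(lhs), static_cast<std::int32_t>(rhs));
      }

     private:
      Equal _equal;
    };

    int main()
    {
      comparator_adapter<row_equal> eq{row_equal{}};
      std::cout << std::boolalpha << eq(lhs_index_type{2}, rhs_index_type{2}) << '\n';  // true
    }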
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16242 --- .../cudf/detail/distinct_hash_join.cuh | 20 ------------------- cpp/src/search/contains_table.cu | 19 ------------------ cpp/src/text/bpe/byte_pair_encoding.cuh | 13 ------------ cpp/src/text/vocabulary_tokenize.cu | 8 -------- 4 files changed, 60 deletions(-) diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 1ef8b3b120a..c3bc3ad89fa 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -42,17 +42,6 @@ template struct comparator_adapter { comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()( - cuco::pair const&, - cuco::pair const&) const noexcept - { - // All build table keys are distinct thus `false` no matter what - return false; - } - __device__ constexpr auto operator()( cuco::pair const&, cuco::pair const&) const noexcept @@ -69,15 +58,6 @@ struct comparator_adapter { return _d_equal(lhs.second, rhs.second); } - __device__ constexpr auto operator()( - cuco::pair const& lhs, - cuco::pair const& rhs) const noexcept - { - if (lhs.first != rhs.first) { return false; } - return _d_equal(lhs.second, rhs.second); - } -#pragma nv_diagnostic pop - private: Equal _d_equal; }; diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index fbb0f6cb0f5..4fb983dc5a6 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -76,18 +76,6 @@ struct comparator_adapter { { } - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()(lhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - auto const lhs = static_cast(lhs_index); - auto const rhs = static_cast(rhs_index); - - return _self_equal(lhs, rhs); - } - __device__ constexpr auto operator()(rhs_index_type lhs_index, rhs_index_type rhs_index) const noexcept { @@ -103,13 +91,6 @@ struct comparator_adapter { return _two_table_equal(lhs_index, rhs_index); } - __device__ constexpr auto operator()(rhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - return _two_table_equal(lhs_index, rhs_index); - } -#pragma nv_diagnostic pop - private: SelfEqual const _self_equal; TwoTableEqual const _two_table_equal; diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh index 3bb574748b6..a2e441c3284 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -89,14 +89,6 @@ struct bpe_equal { return lhs == rhs; // all rows are unique } // used by find - __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept - { - lhs *= 2; - auto const left = d_strings.element(lhs); - auto const right = d_strings.element(lhs + 1); - return (left == rhs.first) && (right == rhs.second); - } - // used by find __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept { rhs *= 2; @@ -157,11 +149,6 @@ struct mp_equal { return left == right; } // used by find - __device__ bool operator()(cudf::size_type lhs, 
cudf::string_view const& rhs) const noexcept
-  {
-    auto const left = d_strings.element<cudf::string_view>(lhs);
-    return left == rhs;
-  }
   __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
   {
     auto const right = d_strings.element<cudf::string_view>(rhs);
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index ea09f5d17af..97abb1487d8 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -86,18 +86,10 @@ struct vocab_equal {
     return lhs == rhs;  // all rows are expected to be unique
   }
   // used by find
-  // suppress "function was declared but never referenced warning"
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 177
-  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
-  {
-    return d_strings.element<cudf::string_view>(lhs) == rhs;
-  }
   __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
   {
     return d_strings.element<cudf::string_view>(rhs) == lhs;
   }
-#pragma nv_diagnostic pop
 };
 
 using probe_scheme = cuco::linear_probing<1, vocab_hasher>;

From 390e6fec3c6b3c257c93d38c3a999f2e4c9706e1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 12 Jul 2024 12:10:11 -1000
Subject: [PATCH 054/101] Clean up state variables in MultiIndex (#16203)

MultiIndex sets its own state variables outside of `__init__` and allows
some private variables to remain uninitialized even though they may be
accessed in other methods. This PR ensures these state variables are
always initialized in `__init__`, `_from_data` and `_simple_new`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16203
---
 python/cudf/cudf/core/dataframe.py        |  22 +-
 python/cudf/cudf/core/multiindex.py       | 275 +++++++++++-----------
 python/cudf/cudf/tests/test_multiindex.py |  16 +-
 python/cudf/cudf/tests/test_repr.py       |  12 +-
 4 files changed, 152 insertions(+), 173 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2be59f87483..f110b788789 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3593,15 +3593,15 @@ def rename(
             if level is not None and isinstance(self.index, MultiIndex):
                 level = self.index._get_level_label(level)
-                out_index = self.index.copy(deep=copy)
-                level_values = out_index.get_level_values(level)
-                level_values.to_frame().replace(
+                level_values = self.index.get_level_values(level)
+                ca = self.index._data.copy(deep=copy)
+                ca[level] = level_values._column.find_and_replace(
                     to_replace=list(index.keys()),
-                    value=list(index.values()),
-                    inplace=True,
+                    replacement=list(index.values()),
+                )
+                out_index = type(self.index)._from_data(
+                    ca, name=self.index.name
                 )
-                out_index._data[level] = column.as_column(level_values)
-                out_index._compute_levels_and_codes()
             else:
                 to_replace = list(index.keys())
                 vals = list(index.values())
@@ -7058,12 +7058,8 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
 
         # Assemble the final index
         new_index_columns = [*repeated_index._columns, *tiled_index]
         index_names = [*self.index.names, *unique_named_levels.names]
-        new_index = MultiIndex.from_frame(
-            DataFrame._from_data(
-                dict(zip(range(0, len(new_index_columns)), new_index_columns))
-            ),
-            names=index_names,
-        )
+        new_index = MultiIndex._from_data(dict(enumerate(new_index_columns)))
+        
new_index.names = index_names # Compute the column indices that serves as the input for # `interleave_columns` diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dbbd1eab6c8..6503dae6ff5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,9 +7,7 @@ import operator import pickle import warnings -from collections import abc from functools import cached_property -from numbers import Integral from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp @@ -20,7 +18,7 @@ import cudf._lib as libcudf from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype +from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core.algorithms import factorize @@ -64,6 +62,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: return indices +def _compute_levels_and_codes( + data: MutableMapping, +) -> tuple[list[cudf.Index], list[column.ColumnBase]]: + """Return MultiIndex level and codes from a ColumnAccessor-like mapping.""" + levels = [] + codes = [] + for col in data.values(): + code, cats = factorize(col) + codes.append(column.as_column(code.astype(np.int64))) + levels.append(cats) + + return levels, codes + + class MultiIndex(Frame, BaseIndex, NotIterable): """A multi-level or hierarchical index. @@ -146,50 +158,36 @@ def __init__( raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (abc.Sequence, np.ndarray, cp.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") - - if copy: - if isinstance(codes, cudf.DataFrame): - codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance( - levels[0], (cudf.Index, cudf.Series) - ): - levels = [level.copy(deep=True) for level in levels] - - if not isinstance(codes, cudf.DataFrame): - if len(levels) == len(codes): - codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - - levels = [ensure_index(level) for level in levels] - - if len(levels) != len(codes._data): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - if len({c.size for c in codes._data.columns}) != 1: + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + elif not (is_list_like(levels) and len(levels) > 0): + raise ValueError("Must pass non-zero length sequence of levels") + elif not (is_list_like(codes) and len(codes) > 0): + raise ValueError("Must pass non-zero length sequence of codes") + elif len(codes) != len(levels): raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" + f"levels must have the same length ({len(levels)}) " + f"as codes ({len(codes)})." 
) + new_levels = [] + for level in levels: + new_level = ensure_index(level) + if copy and new_level is level: + new_level = new_level.copy(deep=True) + new_levels.append(new_level) + + new_codes = [] + for code in codes: + if not (is_list_like(code) or is_column_like(code)): + raise TypeError("Each code must be list-like") + new_code = column.as_column(code).astype("int64") + if copy and new_code is code: + new_code = new_code.copy(deep=True) + new_codes.append(new_code) + source_data = {} - for (column_name, code), level in zip(codes._data.items(), levels): + for i, (code, level) in enumerate(zip(new_codes, new_levels)): if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: @@ -202,13 +200,11 @@ def __init__( result_col = libcudf.copying.gather( [level._column], code, nullify=True ) - source_data[column_name] = result_col[0]._with_type_metadata( - level.dtype - ) + source_data[i] = result_col[0]._with_type_metadata(level.dtype) - super().__init__(source_data) - self._levels = levels - self._codes = codes + super().__init__(ColumnAccessor(source_data)) + self._levels = new_levels + self._codes = new_codes self._name = None self.names = names @@ -350,10 +346,37 @@ def _from_data( data: MutableMapping, name: Any = None, ) -> MultiIndex: - obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) - if name is not None: - obj.name = name - return obj + """ + Use when you have a ColumnAccessor-like mapping but no codes and levels. + """ + levels, codes = _compute_levels_and_codes(data) + return cls._simple_new( + data=ColumnAccessor(data), + levels=levels, + codes=codes, + names=pd.core.indexes.frozen.FrozenList(data.keys()), + name=name, + ) + + @classmethod + def _simple_new( + cls, + data: ColumnAccessor, + levels: list[cudf.Index], + codes: list[column.ColumnBase], + names: pd.core.indexes.frozen.FrozenList, + name: Any = None, + ) -> Self: + """ + Use when you have a ColumnAccessor-like mapping, codes, and levels. 
+ """ + mi = object.__new__(cls) + mi._data = data + mi._levels = levels + mi._codes = codes + mi._names = names + mi._name = name + return mi @property # type: ignore @_performance_tracking @@ -421,18 +444,17 @@ def copy( 2020-08-28 AMZN 3401.80 MSFT 228.91 """ - - mi = MultiIndex._from_data(self._data.copy(deep=deep)) - if self._levels is not None: - mi._levels = [idx.copy(deep=deep) for idx in self._levels] - if self._codes is not None: - mi._codes = self._codes.copy(deep) if names is not None: - mi.names = names - elif self.names is not None: - mi.names = self.names.copy() - - return mi + names = pd.core.indexes.frozen.FrozenList(names) + else: + names = self.names + return type(self)._simple_new( + data=self._data.copy(deep=deep), + levels=[idx.copy(deep=deep) for idx in self._levels], + codes=[code.copy(deep=deep) for code in self._codes], + names=names, + name=name, + ) @_performance_tracking def __repr__(self): @@ -478,14 +500,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property - def _codes_frame(self): - if self._codes is None: - self._compute_levels_and_codes() - return self._codes - @property # type: ignore - @_external_only_api("Use ._codes_frame instead") + @_external_only_api("Use ._codes instead") @_performance_tracking def codes(self): """ @@ -505,7 +521,7 @@ def codes(self): FrozenList([[0, 1, 2], [0, 1, 2]]) """ return pd.core.indexes.frozen.FrozenList( - col.values for col in self._codes_frame._columns + col.values for col in self._codes ) def get_slice_bound(self, label, side, kind=None): @@ -519,13 +535,13 @@ def nlevels(self): @property # type: ignore @_performance_tracking - def levels(self): + def levels(self) -> list[cudf.Index]: """ Returns list of levels in the MultiIndex Returns ------- - List of Series objects + List of Index objects Examples -------- @@ -545,9 +561,9 @@ def levels(self): >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] """ # noqa: E501 - if self._levels is None: - self._compute_levels_and_codes() - return self._levels + return [ + idx.rename(name) for idx, name in zip(self._levels, self.names) + ] @property # type: ignore @_performance_tracking @@ -566,11 +582,10 @@ def _get_level_label(self, level): else if level is index of the level, then level label will be returned as per the index. 
""" - - if level in self._data.names: + if level in self.names: return level else: - return self._data.names[level] + return self.names[level] @_performance_tracking def isin(self, values, level=None): @@ -671,20 +686,6 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_performance_tracking - def _compute_levels_and_codes(self): - levels = [] - - codes = {} - for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() - cats.name = name - codes[name] = code.astype(np.int64) - levels.append(cats) - - self._levels = levels - self._codes = cudf.DataFrame._from_data(codes) - @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" @@ -823,7 +824,7 @@ def _index_and_downcast(self, result, index, index_key): result.names = index.names[size:] index = MultiIndex( levels=index.levels[size:], - codes=index._codes_frame.iloc[:, size:], + codes=index._codes[size:], names=index.names[size:], ) @@ -933,28 +934,29 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, abc.Sequence)): - index = np.array(index) - elif isinstance(index, slice): + if isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.as_column(range(start, stop, step)) - result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take( - index - ), - names=self.names, + idx = range(start, stop, step) + elif is_scalar(index): + idx = [index] + else: + idx = index + + indexer = column.as_column(idx) + ca = self._data._from_columns_like_self( + (col.take(indexer) for col in self._columns), verify=False + ) + codes = [code.take(indexer) for code in self._codes] + result = type(self)._simple_new( + data=ca, codes=codes, levels=self._levels, names=self.names ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: if flatten: return result.to_pandas()[0] - - if self._codes_frame is not None: - result._codes = self._codes_frame.take(index) - if self._levels is not None: - result._levels = self._levels - return result + else: + return result @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): @@ -1270,25 +1272,12 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ('NJ', 'Precip')], names=['state', 'observation']) """ - obj = cls.__new__(cls) - super(cls, obj).__init__() - - source_data = df.copy(deep=False) - source_data.reset_index(drop=True, inplace=True) - if isinstance(source_data, pd.DataFrame): - source_data = cudf.DataFrame.from_pandas(source_data) - - names = names if names is not None else source_data._data.names - # if names are unique - # try using those as the source_data column names: - if len(dict.fromkeys(names)) == len(names): - source_data.columns = names - obj._name = None - obj._data = source_data._data - obj.names = names - obj._codes = None - obj._levels = None - return obj + if isinstance(df, pd.DataFrame): + source_data = cudf.DataFrame.from_pandas(df) + else: + source_data = df + names = names if names is not None else source_data._column_names + return cls.from_arrays(source_data._columns, names=names) @classmethod @_performance_tracking @@ -1436,7 +1425,7 @@ def _poplevels(self, level): # update self self.names = names - self._compute_levels_and_codes() + self._levels, self._codes = 
_compute_levels_and_codes(self._data) return popped @@ -1560,13 +1549,19 @@ def to_pandas( ) -> pd.MultiIndex: # cudf uses np.iinfo(size_type_dtype).min as missing code # pandas uses -1 as missing code - pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) + pd_codes = ( + code.find_and_replace( + column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(-1, length=1), + ) + for code in self._codes + ) return pd.MultiIndex( levels=[ level.to_pandas(nullable=nullable, arrow_type=arrow_type) for level in self.levels ], - codes=[col.values_host for col in pd_codes._columns], + codes=[col.values_host for col in pd_codes], names=self.names, ) @@ -1741,13 +1736,9 @@ def _clean_nulls_from_index(self): @_performance_tracking def memory_usage(self, deep=False): - usage = sum(col.memory_usage for col in self._data.columns) - if self.levels: - for level in self.levels: - usage += level.memory_usage(deep=deep) - if self._codes_frame: - for col in self._codes_frame._data.columns: - usage += col.memory_usage + usage = sum(col.memory_usage for col in self._columns) + usage += sum(level.memory_usage(deep=deep) for level in self._levels) + usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking @@ -2043,7 +2034,7 @@ def _union(self, other, sort=None): ignore_index=True, ) - midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data) midx.names = self.names if self.names == other.names else None if sort in {None, True} and len(other): return midx.sort_values() @@ -2067,7 +2058,8 @@ def _intersection(self, other, sort=None): self_df.columns = col_names result_df = cudf.merge(self_df, other_df, how="inner") - midx = self.__class__.from_frame(result_df, names=res_name) + midx = type(self)._from_data(result_df._data) + midx.names = res_name if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -2077,6 +2069,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names + self._levels, self._codes = _compute_levels_and_codes(res._data) return res @_performance_tracking diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 07c2e9c3fcf..1941eec91eb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -832,25 +832,17 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi1._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi2._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi1._codes._data.items() - ] - rptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi2._codes._data.items() - ] + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 193d64a9e7f..a013745f71e 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ 
b/python/cudf/cudf/tests/test_repr.py @@ -186,13 +186,11 @@ def test_MI(): } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = cudf.DataFrame( - { - "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - } - ) + codes = [ + [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + ] pd.options.display.max_rows = 999 pd.options.display.max_columns = 0 gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) From 954ce6d5a64190b7d71cc6f94e7fa4a87ae34598 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 12 Jul 2024 18:23:27 -0500 Subject: [PATCH 055/101] Add low memory JSON reader for `cudf.pandas` (#16204) Fixes: #16122 This PR introduces low-memory JSON reading for `cudf.pandas` `read_json`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Muhammad Haseeb (https://github.com/mhaseeb123) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/16204 --- cpp/src/io/json/read_json.cu | 3 +- cpp/src/io/utilities/datasource.cpp | 2 +- python/cudf/cudf/_lib/json.pyx | 63 ++++--- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 11 ++ python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 176 +++++++++++++++++--- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_json.py | 16 ++ 8 files changed, 228 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 74001e5e01a..9cd39038348 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -193,7 +193,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, - "Invalid offsetting"); + "Invalid offsetting", + std::invalid_argument); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? 
total_source_size - chunk_offset : chunk_size; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index c8a438fc40b..91be154e09d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -217,7 +217,7 @@ class memory_mapped_source : public file_source { void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 9c646e3357b..853dd431099 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -10,6 +10,7 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types +from cudf._lib.column cimport Column from cudf._lib.io.utils cimport add_df_col_struct_names from cudf._lib.pylibcudf.io.types cimport compression_type from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t @@ -17,7 +18,7 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_pylibcudf_io +from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import cudf._lib.pylibcudf as plc @@ -98,28 +99,48 @@ cpdef read_json(object filepaths_or_buffers, else: raise TypeError("`dtype` must be 'list like' or 'dict'") - table_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - lines, - byte_range_offset = byte_range[0] if byte_range is not None else 0, - byte_range_size = byte_range[1] if byte_range is not None else 0, - keep_quotes = keep_quotes, - mixed_types_as_string = mixed_types_as_string, - prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) - ) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - table_w_meta + if cudf.get_option("mode.pandas_compatible") and lines: + res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in res_cols], + column_names=res_col_names, + index_names=None + ) + ) + add_df_col_struct_names(df, res_child_names) + return df + else: + table_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + lines, + byte_range_offset = byte_range[0] if byte_range is not None else 0, + byte_range_size = byte_range[1] if byte_range is not None else 0, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + table_w_meta + ) ) - ) - # Post-processing to add in struct column names - add_df_col_struct_names(df, table_w_meta.child_names) - return df + # Post-processing to add in struct 
column names
+        add_df_col_struct_names(df, table_w_meta.child_names)
+        return df
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
index f7f733a493d..2e0e92a054f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
@@ -35,3 +35,14 @@ cpdef void write_json(
     str true_value = *,
     str false_value = *
 )
+
+cpdef tuple chunked_read_json(
+    SourceInfo source_info,
+    list dtypes = *,
+    compression_type compression = *,
+    bool keep_quotes = *,
+    bool mixed_types_as_string = *,
+    bool prune_columns = *,
+    json_recovery_mode_t recovery_mode = *,
+    int chunk_size= *,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
index 354cb4981de..2710ee60075 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -6,6 +6,7 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from cudf._lib.pylibcudf.concatenate cimport concatenate
 from cudf._lib.pylibcudf.io.types cimport (
     SinkInfo,
     SourceInfo,
@@ -50,6 +51,144 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes):
     return schema_map
 
 
+cdef json_reader_options _setup_json_reader_options(
+    SourceInfo source_info,
+    list dtypes,
+    compression_type compression,
+    bool lines,
+    size_type byte_range_offset,
+    size_type byte_range_size,
+    bool keep_quotes,
+    bool mixed_types_as_string,
+    bool prune_columns,
+    json_recovery_mode_t recovery_mode):
+
+    cdef vector[data_type] types_vec
+    cdef json_reader_options opts = move(
+        json_reader_options.builder(source_info.c_obj)
+        .compression(compression)
+        .lines(lines)
+        .byte_range_offset(byte_range_offset)
+        .byte_range_size(byte_range_size)
+        .recovery_mode(recovery_mode)
+        .build()
+    )
+
+    if dtypes is not None:
+        if isinstance(dtypes[0], tuple):
+            opts.set_dtypes(move(_generate_schema_map(dtypes)))
+        else:
+            for dtype in dtypes:
+                types_vec.push_back((<DataType>dtype).c_obj)
+            opts.set_dtypes(types_vec)
+
+    opts.enable_keep_quotes(keep_quotes)
+    opts.enable_mixed_types_as_string(mixed_types_as_string)
+    opts.enable_prune_columns(prune_columns)
+    return opts
+
+
+cpdef tuple chunked_read_json(
+    SourceInfo source_info,
+    list dtypes = None,
+    compression_type compression = compression_type.AUTO,
+    bool keep_quotes = False,
+    bool mixed_types_as_string = False,
+    bool prune_columns = False,
+    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    int chunk_size=100_000_000,
+):
+    """Reads a JSON file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the JSON file from.
+    dtypes : list, default None
+        Set data types for the columns in the JSON file.
+
+        Each element of the list has the format
+        (column_name, column_dtype, list of child dtypes), where
+        the list of child dtypes is an empty list if the child is not
+        a nested type (list or struct dtype), and is of format
+        (column_child_name, column_child_type, list of grandchild dtypes).
+    compression: CompressionType, default CompressionType.AUTO
+        The compression format of the JSON source.
+    keep_quotes : bool, default False
+        Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False`, parsing mixed type columns will throw an error.
+    prune_columns : bool, default False
+        Whether to only read columns specified in dtypes.
+    recovery_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
+        Whether to raise an error or set corresponding values to null
+        when encountering an invalid JSON line.
+    chunk_size : int, default 100_000_000
+        The number of bytes to read per chunk; this should be at least
+        as large as any single row in the source.
+
+    Returns
+    -------
+    tuple
+        A tuple of (columns, column_names, child_names).
+    """
+    cdef size_type c_range_size = (
+        chunk_size if chunk_size is not None else 0
+    )
+    cdef json_reader_options opts = _setup_json_reader_options(
+        source_info=source_info,
+        dtypes=dtypes,
+        compression=compression,
+        lines=True,
+        byte_range_offset=0,
+        byte_range_size=0,
+        keep_quotes=keep_quotes,
+        mixed_types_as_string=mixed_types_as_string,
+        prune_columns=prune_columns,
+        recovery_mode=recovery_mode,
+    )
+
+    # Read JSON
+    cdef table_with_metadata c_result
+
+    final_columns = []
+    meta_names = None
+    child_names = None
+    i = 0
+    while True:
+        opts.set_byte_range_offset(c_range_size * i)
+        opts.set_byte_range_size(c_range_size)
+
+        try:
+            with nogil:
+                c_result = move(cpp_read_json(opts))
+        except (ValueError, OverflowError):
+            break
+        if meta_names is None:
+            meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
+        if child_names is None:
+            child_names = TableWithMetadata._parse_col_names(
+                c_result.metadata.schema_info
+            )
+        new_chunk = [
+            col for col in TableWithMetadata.from_libcudf(
+                c_result).columns
+        ]
+
+        if len(final_columns) == 0:
+            final_columns = new_chunk
+        else:
+            for col_idx in range(len(meta_names)):
+                final_columns[col_idx] = concatenate(
+                    [final_columns[col_idx], new_chunk[col_idx]]
+                )
+                # Must drop any residual GPU columns to save memory
+                new_chunk[col_idx] = None
+        i += 1
+    return (final_columns, meta_names, child_names)
+
+
 cpdef TableWithMetadata read_json(
     SourceInfo source_info,
     list dtypes = None,
@@ -76,7 +215,7 @@ cpdef TableWithMetadata read_json(
         the list of child dtypes is an empty list if the child is not
         a nested type (list or struct dtype), and is of format
         (column_child_name, column_child_type, list of grandchild dtypes).
-    compression_type: CompressionType, default CompressionType.AUTO
+    compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
     byte_range_offset : size_type, default 0
         Number of bytes to skip from source start.
@@ -84,6 +223,9 @@ cpdef TableWithMetadata read_json(
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False`, parsing mixed type columns will throw an error.
     prune_columns : bool, default False
         Whether to only read columns specified in dtypes.
     recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
@@ -95,29 +237,19 @@ cpdef TableWithMetadata read_json(
     TableWithMetadata
         The Table and its corresponding metadata (column names) that were
         read in.
""" - cdef vector[data_type] types_vec - cdef json_reader_options opts = move( - json_reader_options.builder(source_info.c_obj) - .compression(compression) - .lines(lines) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .recovery_mode(recovery_mode) - .build() + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=lines, + byte_range_offset=byte_range_offset, + byte_range_size=byte_range_size, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, ) - if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in dtypes: - types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - # Read JSON cdef table_with_metadata c_result diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 99850d549a1..1d55f7218dc 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -19,3 +19,4 @@ cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) cdef columns_from_table_view(table_view tv, object owners) cdef columns_from_pylibcudf_table(tbl) +cdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 09617306606..a22a627523f 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1191,7 +1191,7 @@ def test_csv_reader_byte_range_type_corner_case(tmpdir): ).to_csv(fname, chunksize=100000) byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): + with pytest.raises(OverflowError, match="Offset is past end of file"): cudf.read_csv(fname, byte_range=byte_range, header=None) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 9222f6d23db..7771afd692f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1428,3 +1428,19 @@ def test_json_reader_on_bad_lines(on_bad_lines): orient="records", on_bad_lines=on_bad_lines, ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 9_00_00_00, + "b": list(range(0, 9_00_00_00)), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) From c4ee4a7a8f7513dc31dd29124bbbf797f0d5c8fc Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 15 Jul 2024 08:26:22 -0500 Subject: [PATCH 056/101] Add multi-file support to `dask_cudf.read_json` (#16057) Dask cuDF often benefits from a larger partition sizes than pandas-backed Dask DataFrame. This motivates the ability to easily "aggregate" multiple json files into each partition using `dask_cudf.read_json`. This PR introduces the `aggregate_files` argument (defaults to `True`) to make it easier to accomplish multi-file DataFrame partitions. 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/cudf/pull/16057 --- python/dask_cudf/dask_cudf/backends.py | 15 +- python/dask_cudf/dask_cudf/io/json.py | 146 +++++++++++++++++- .../dask_cudf/dask_cudf/io/tests/test_json.py | 29 ++++ 3 files changed, 173 insertions(+), 17 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 1f55a59ea55..4bdb5d921ec 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -667,17 +667,10 @@ def from_dict( ) @staticmethod - def read_json(*args, engine="auto", **kwargs): - return _default_backend( - dd.read_json, - *args, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - **kwargs, - ) + def read_json(*args, **kwargs): + from dask_cudf.io.json import read_json as read_json_impl + + return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 2a6ad603414..8705d98e9d6 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,15 +1,71 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from functools import partial +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + import dask +from dask.utils import parse_bytes import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem from dask_cudf.backends import _default_backend -def read_json(url_path, engine="auto", **kwargs): +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): """Read JSON data into a :class:`.DataFrame`. This function wraps :func:`dask.dataframe.read_json`, and passes @@ -30,7 +86,13 @@ def read_json(url_path, engine="auto", **kwargs): data. The default value is "auto", so that ``engine=partial(cudf.read_json, engine="auto")`` will be passed to :func:`dask.dataframe.read_json` by default. - + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. 
If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. **kwargs : Key-word arguments to pass through to :func:`dask.dataframe.read_json`. @@ -60,9 +122,77 @@ def read_json(url_path, engine="auto", **kwargs): """ - # TODO: Add optimized code path to leverage the - # `byte_range` argument in `cudf.read_json` for - # local storage (see `dask_cudf.read_csv`) + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".' + ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines" + "input (orient='records', lines=True)." + ) + + inputs = [] + if aggregate_files and blocksize or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json return _default_backend( dask.dataframe.read_json, url_path, @@ -71,5 +201,9 @@ def read_json(url_path, engine="auto", **kwargs): if isinstance(engine, str) else engine ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, **kwargs, ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index dc780478794..abafbffd197 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,5 +1,6 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +import math import os import pandas as pd @@ -97,3 +98,31 @@ def test_read_json_nested(tmp_path): # Ensure not passing kwargs also reads the file. 
actual = dask_cudf.read_json(f) dd.assert_eq(actual, actual_pd) + + +def test_read_json_aggregate_files(tmp_path): + df1 = dask.datasets.timeseries( + dtypes={"x": int, "y": int}, freq="120s" + ).reset_index(drop=True) + json_path = str(tmp_path / "data-*.json") + df1.to_json(json_path) + + df2 = dask_cudf.read_json(json_path, aggregate_files=2) + assert df2.npartitions == math.ceil(df1.npartitions / 2) + dd.assert_eq(df1, df2, check_index=False) + + df2 = dask_cudf.read_json( + json_path, aggregate_files=True, blocksize="1GiB" + ) + assert df2.npartitions == 1 + dd.assert_eq(df1, df2, check_index=False) + + for include_path_column, name in [(True, "path"), ("file", "file")]: + df2 = dask_cudf.read_json( + json_path, + aggregate_files=2, + include_path_column=include_path_column, + ) + assert name in df2.columns + assert len(df2[name].compute().unique()) == df1.npartitions + dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False) From 1889c7c0f517c95143016a6e391275144a034f7a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jul 2024 20:32:15 +0200 Subject: [PATCH 057/101] MAINT: Adapt to NumPy 2 promotion changes (#16141) Splitting out the non API changes from gh-15897, the Scalar API change is required for the tests to pass with NumPy 2, but almost all changes should be relatively straight forward here on their own. (I will add inline comments.) --- This PR does not fix integer comparisons, there are currently no tests that run into these. xref: https://github.com/rapidsai/build-planning/issues/38 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16141 --- python/cudf/cudf/core/_internals/where.py | 24 +++++++++++------- python/cudf/cudf/core/column/categorical.py | 4 ++- python/cudf/cudf/core/column/numerical.py | 27 ++++++++++++++++----- python/cudf/cudf/tests/test_binops.py | 21 +++++++++++++--- python/cudf/cudf/tests/test_doctests.py | 13 +++++++++- python/cudf/cudf/tests/test_dtypes.py | 1 - 6 files changed, 69 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..f3183e6029d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -54,13 +54,17 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) + if isinstance(other, float) and not np.isnan(other): + try: + is_safe = source_dtype.type(other) == other + except OverflowError: + is_safe = False + + if not is_safe: + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {source_dtype.name}" + ) if cudf.utils.utils.is_na_like(other): return _normalize_categorical( @@ -84,8 +88,10 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype + if ( + _is_non_decimal_numeric_dtype(source_dtype) + and not other_is_scalar # can-cast fails for Python scalars + and _can_cast(other, source_dtype) ): common_dtype = source_dtype elif ( diff --git a/python/cudf/cudf/core/column/categorical.py 
b/python/cudf/cudf/core/column/categorical.py index f763d3b4b0c..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a0550bff72b..b8fa00e9643 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -301,15 +301,28 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): if self.dtype == other.dtype: return other + # expensive device-host transfer just to # adjust the dtype other = other.value + + # NumPy 2 needs a Python scalar to do weak promotion, but + # pandas forces weak promotion always + # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. + if other.dtype.kind in "ifc": + other = other.item() + elif not isinstance(other, (int, float, complex)): + # Go via NumPy to get the value + other = np.array(other) + if other.dtype.kind in "ifc": + other = other.item() + # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self. TODO: - # When NEP50 is accepted, this might want changed or - # simplified. - # This is not at all simple: - # np.result_type(np.int64(0), np.uint8) + # dtype via the _value_ of other, and the dtype of self on NumPy 1.x + # with NumPy 2, we force weak promotion even for our/NumPy scalars + # to match pandas 2.2. + # Weak promotion is not at all simple: + # np.result_type(0, np.uint8) # => np.uint8 # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) # => np.int64 @@ -626,7 +639,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare - if (self.min() >= min_) and (self.max() <= max_): + # Use Python floats, which have precise comparison for float64. + # NOTE(seberg): it would make sense to limit to the mantissa range. 
+        if (float(self.min()) >= min_) and (float(self.max()) <= max_):
             filled = self.fillna(0)
             return (cudf.Series(filled) % 1 == 0).all()
         else:

diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 7d8c3b53115..5265278db4c 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class):
     if obj_class == "Index":
         gs = Index(gs)

-    gs_result = func(gs)
+    try:
+        gs_result = func(gs)
+    except OverflowError:
+        # An error is fine, if pandas raises the same error:
+        with pytest.raises(OverflowError):
+            func(random_series)
+
+        return

     # class typing
     if obj_class == "Index":
@@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     if obj_class == "Index":
         gs = Index(gs)

-    gs_result = gpu_func(gs)
+    try:
+        gs_result = gpu_func(gs)
+    except OverflowError:
+        # An error is fine, if pandas raises the same error:
+        with pytest.raises(OverflowError):
+            cpu_func(random_series)
+
+        return

     # class typing
     if obj_class == "Index":
@@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar(
         fill_value=fill_value,
     )
     pdf_series_result = getattr(pdf_series, func)(
-        scalar, fill_value=fill_value
+        np.array(scalar)[()] if use_cudf_scalar else scalar,
+        fill_value=fill_value,
     )

     assert_eq(pdf_series_result, gdf_series_result)

diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py
index 0da5c6b04d6..794660cffcb 100644
--- a/python/cudf/cudf/tests/test_doctests.py
+++ b/python/cudf/cudf/tests/test_doctests.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 import contextlib
 import doctest
 import inspect
@@ -8,6 +8,7 @@
 import numpy as np
 import pytest
+from packaging import version

 import cudf
@@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path):
         yield
         os.chdir(original_directory)

+    @pytest.fixture(autouse=True)
+    def prinoptions(cls):
+        # TODO: NumPy now prints scalars as `np.int8(1)`, etc. this should
+        # be adapted evantually.
+        if version.parse(np.__version__) >= version.parse("2.0"):
+            with np.printoptions(legacy="1.25"):
+                yield
+        else:
+            yield
+
     @pytest.mark.parametrize(
         "docstring",
         itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]),

diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index edb534a3618..c62b5889fdd 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect):
         np.complex128,
         complex,
         "S",
-        "a",
         "V",
         "float16",
         np.float16,

From 128f0c917bbc3342f9eca12ca2bf714c88206256 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Mon, 15 Jul 2024 20:34:14 +0200
Subject: [PATCH 058/101] API: Check for integer overflows when creating
 scalar from python int (#16140)

This aligns with NumPy, which has deprecated this for a while and raises
an error now on NumPy 2, for example for `Scalar(-1, dtype=np.uint8)`.
Since it aligns with NumPy, the DeprecationWarning of earlier NumPy
versions is inherited for those.

This (or similar handling) is required to be compatible with NumPy
2/pandas, since the default needs to be to reject the operation when
values are out of bounds, e.g. for `uint8_series + 1000`, where the 1000
should not be silently cast to a `uint8`.
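For illustration, a short sketch of the resulting behavior (the
value/dtype pairs come from the new test; the exact exception assumes
NumPy 2, while older NumPy versions keep the DeprecationWarning):

    import numpy as np
    import cudf

    np.uint8(1000)              # OverflowError on NumPy 2
    cudf.Scalar(1000, "uint8")  # now raises OverflowError to match
    cudf.Scalar(-1, "uint16")   # also out of bounds, also raises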
--- Split from gh-15897 xref: https://github.com/rapidsai/build-planning/issues/38 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16140 --- python/cudf/cudf/tests/test_scalar.py | 17 +++++++++++++++++ python/cudf/cudf/tests/test_unaops.py | 5 ++++- python/cudf/cudf/utils/dtypes.py | 14 ++++++++------ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..195231e9960 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version import rmm @@ -253,6 +254,22 @@ def test_generic_null_scalar_construction_fails(value): cudf.Scalar(value) +@pytest.mark.parametrize( + "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_scalar_out_of_bounds_pyint_fails(value, dtype): + # Test that we align with NumPy on scalar creation behavior from + # Python integers. + if version.parse(np.__version__) >= version.parse("2.0"): + with pytest.raises(OverflowError): + cudf.Scalar(value, dtype) + else: + # NumPy allowed this, but it gives a DeprecationWarning on newer + # versions (which cudf did not used to do). + assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index dbbf4fba3a6..5f5d79c1dce 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - slr_device = cudf.Scalar(slr, dtype=dtype) + # The scalar may be out of bounds, so go via array force-cast + # NOTE: This is a change in behavior + slr = np.array(slr).astype(dtype)[()] + slr_device = cudf.Scalar(slr) expect = op(slr_host) got = op(slr_device) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..0dec857ea96 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - if dtype is not None: - if isinstance(val, str) and np.dtype(dtype).kind == "M": + dtype = np.dtype(dtype) + if isinstance(val, str) and dtype.kind == "M": # pd.Timestamp can handle str, but not np.str_ val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) else: - val = val.astype(dtype) + # At least datetimes cannot be converted to scalar via dtype.type: + val = np.array(val, dtype)[()] + else: + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) From ceb73d91c090882ec69642a78b7d791a1bf220fe Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 15 Jul 2024 12:45:51 -0700 Subject: [PATCH 059/101] Make nvcomp 
adapter compatible with new version macros (#16245) New nvcomp version changed the names of the version macros. This PR adds "aliasing" to the old names so rest of the code is not affected. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - MithunR (https://github.com/mythrocks) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16245 --- cpp/src/io/comp/nvcomp_adapter.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 0e34c96debd..5d0c6a8c83b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -37,6 +37,13 @@ #include NVCOMP_ZSTD_HEADER #endif +// When building with nvcomp 4.0 or newer, map the new version macros to the old ones +#ifndef NVCOMP_MAJOR_VERSION +#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR +#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR +#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH +#endif + #define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) #define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) From 04330f2e9e73ac71a86666c55d0fe7248eaf8db6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:23:07 -1000 Subject: [PATCH 060/101] Fix convert_dtypes with convert_integer=False/convert_floating=True (#15964) If `convert_integer=False`, there should be no attempt to convert to integer Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15964 --- python/cudf/cudf/core/indexed_frame.py | 34 +++++++++++-------- .../cudf/cudf/tests/series/test_conversion.py | 13 +++++++ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 63fa96d0db0..30b68574960 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6235,13 +6235,13 @@ def rank( def convert_dtypes( self, - infer_objects=True, - convert_string=True, - convert_integer=True, - convert_boolean=True, - convert_floating=True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend=None, - ): + ) -> Self: """ Convert columns to the best possible nullable dtypes. @@ -6252,17 +6252,21 @@ def convert_dtypes( All other dtypes are always returned as-is as all dtypes in cudf are nullable. 
""" - result = self.copy() - - if convert_floating: - # cast any floating columns to int64 if - # they are all integer data: - for name, col in result._data.items(): + if not (convert_floating and convert_integer): + return self.copy() + else: + cols = [] + for col in self._columns: if col.dtype.kind == "f": col = col.fillna(0) - if cp.allclose(col, col.astype("int64")): - result._data[name] = col.astype("int64") - return result + as_int = col.astype("int64") + if cp.allclose(col, as_int): + cols.append(as_int) + continue + cols.append(col) + return self._from_data_like_self( + self._data._from_columns_like_self(cols, verify=False) + ) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index e1dd359e1ba..1d680d7860d 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype): assert_eq(expect, got) +def test_convert_integer_false_convert_floating_true(): + data = [1.000000000000000000000000001, 1] + expected = pd.Series(data).convert_dtypes( + convert_integer=False, convert_floating=True + ) + result = ( + cudf.Series(data) + .convert_dtypes(convert_integer=False, convert_floating=True) + .to_pandas(nullable=True) + ) + assert_eq(result, expected) + + # Now write the same test, but construct a DataFrame # as input instead of parametrizing: From dba46e7a8957b8389b69e820485e319a1d314017 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:21:50 -1000 Subject: [PATCH 061/101] Replace is_datetime/timedelta_dtype checks with .kind checks (#16262) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16262 --- python/cudf/cudf/_fuzz_testing/utils.py | 3 +- python/cudf/cudf/core/column/datetime.py | 7 +-- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/dataframe.py | 7 ++- python/cudf/cudf/core/scalar.py | 8 +--- python/cudf/cudf/core/tools/numeric.py | 9 +--- python/cudf/cudf/tests/test_binops.py | 7 +-- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_list.py | 7 +-- python/cudf/cudf/tests/test_scalar.py | 11 +---- python/cudf/cudf/utils/dtypes.py | 56 ++++++++--------------- 11 files changed, 37 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index e6dfe2eae62..8ce92e1c0f6 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df): col for col in df.columns if df[col].dtype in pandas_dtypes_to_np_dtypes - or pd.api.types.is_datetime64_dtype(df[col].dtype) - or pd.api.types.is_timedelta64_dtype(df[col].dtype) + or df[col].dtype.kind in "mM" ] for record in records: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 214e84028d2..409c44f6eee 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,7 +18,6 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types 
import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -565,10 +564,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # We check this on `other` before reflection since we already know the # dtype of `self`. - other_is_timedelta = is_timedelta64_dtype(other.dtype) - other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype( - other.dtype - ) + other_is_timedelta = other.dtype.kind == "m" + other_is_datetime64 = other.dtype.kind == "M" lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2cbed9212de..36d7d9f9614 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this: ColumnBinaryOperand = self out_dtype = None - if is_timedelta64_dtype(other.dtype): + if other.dtype.kind == "m": # TODO: pandas will allow these operators to work but return false # when comparing to non-timedelta dtypes. We should do the same. if op in { diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f110b788789..2aa1b95e2d1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -33,7 +33,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, - is_datetime_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -6113,7 +6112,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): else: filtered = self.copy(deep=False) - is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) + is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) common_dtype = find_common_type(filtered.dtypes) if ( @@ -6510,7 +6509,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): cudf.utils.dtypes.get_min_float_dtype( prepared._data[col] ) - if not is_datetime_dtype(common_dtype) + if common_dtype.kind != "M" else cudf.dtype("float64") ) .fillna(np.nan) @@ -6537,7 +6536,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_dtype = ( common_dtype if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + or (common_dtype is not None and common_dtype.kind == "M") else None ) result = column.as_column(result, dtype=result_dtype) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29460d8c67e..f6331aa1f49 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand @@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA - ) + value = NaT if dtype.kind in "mM" else 
NA return value, dtype diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6b86a04a7..466d46f7dca 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,12 +8,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import strings as libstrings -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_datetime_dtype, - is_string_dtype, - is_timedelta_dtype, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): col = as_column(arg) dtype = col.dtype - if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): + if dtype.kind in "mM": col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 5265278db4c..503b1a975b4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1694,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(result.dtype) - or cudf.api.types.is_timedelta64_dtype(result.dtype) - else cudf.NA - ) + assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..7ccf83e424c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - if not numeric_only and not all( - cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes - ): + if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes): with pytest.raises(TypeError): got = getattr(gdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index ec9d7995b05..36bcaa66d7d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -694,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(slr.dtype) - or cudf.api.types.is_timedelta64_dtype(slr.dtype) - else cudf.NA - ) + assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 195231e9960..f2faf4343b6 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -212,9 +212,7 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( - dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + if s.dtype.kind in "mM": assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -369,12 +367,7 @@ def 
test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls( - pd.NaT - if cudf.api.types.is_datetime64_dtype(dtype) - or cudf.api.types.is_timedelta64_dtype(dtype) - else pd.NA - ) + cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 0dec857ea96..59e5ec1df04 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -424,9 +424,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( - dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -527,16 +525,14 @@ def find_common_type(dtypes): return cudf.dtype("O") # Aggregate same types - dtypes = set(dtypes) + dtypes = {cudf.dtype(dtype) for dtype in dtypes} + if len(dtypes) == 1: + return dtypes.pop() if any( isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes ): - if all( - cudf.api.types.is_decimal_dtype(dtype) - or cudf.api.types.is_numeric_dtype(dtype) - for dtype in dtypes - ): + if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): return _find_common_type_decimal( [ dtype @@ -546,40 +542,28 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). - raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" - ) - if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" - ) + elif any( + isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) + for dtype in dtypes + ): + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). 
+ raise NotImplementedError( + "Finding a common type for `ListDtype` or `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) From 47a0a87db454cc767ab5f74beb2198a480d6f2c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:13:29 -1000 Subject: [PATCH 062/101] Type & reduce cupy usage (#16277) There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. IMO we should prefer using CPU data/the existing data structure/Column ops over cupy when possible closes https://github.com/rapidsai/cudf/issues/12133 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16277 --- python/cudf/cudf/core/_base_index.py | 4 ++- python/cudf/cudf/core/column/column.py | 8 +++--- python/cudf/cudf/core/column/datetime.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 10 ++----- python/cudf/cudf/core/cut.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 18 +++++++----- python/cudf/cudf/core/frame.py | 6 ++-- python/cudf/cudf/core/groupby/groupby.py | 23 ++++++++------- python/cudf/cudf/core/index.py | 34 ++++++++++++----------- python/cudf/cudf/core/multiindex.py | 13 +++++---- python/cudf/cudf/core/tools/datetimes.py | 9 +++--- python/cudf/cudf/tests/test_datetime.py | 15 ++-------- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..9ba2d161619 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -38,6 +38,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -2001,7 +2003,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..fd3664ecac4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +735,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 409c44f6eee..004a059af95 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b8fa00e9643..7f05a5f91a1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,7 +12,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa1b95e2d1..2121e623c1c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -429,7 +429,7 @@ def 
_setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. indexed_shape = columns_df.loc[key[0]].shape @@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -2199,8 +2199,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. Parameters @@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..802751e47ad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..8659d7c2392 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. 
@@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1713,7 +1716,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..4164f981fca 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index @@ -118,13 +118,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +260,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -753,15 +755,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. 
- if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1320,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1332,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1341,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -2076,7 +2078,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6503dae6ff5..3ed72ff812a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1926,17 +1926,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1961,7 +1962,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. 
Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 064e8fc667d..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 092e9790c63..7ab9ff2ef23 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods): ) -def test_date_range_start_end_freq(request, start, end, freq): - request.applymarker( - pytest.mark.xfail( - condition=( - start == "1831-05-08 15:23:21" - and end == "1996-11-21 04:05:30" - and freq == "110546789ms" - ), - reason="https://github.com/rapidsai/cudf/issues/12133", - ) - ) - +def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(request, start, freq, periods): +def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq else: From beda22ed28030bbed2faaa5a49509255f11976aa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:29:05 -1000 Subject: [PATCH 063/101] Replace is_bool_type with checking .dtype.kind (#16255) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16255 --- python/cudf/cudf/core/_base_index.py | 9 +++------ python/cudf/cudf/core/_internals/where.py | 8 ++------ python/cudf/cudf/core/column/column.py | 7 +++---- python/cudf/cudf/core/column/numerical.py | 5 ++--- python/cudf/cudf/core/dataframe.py | 13 ++++++------- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- python/cudf/cudf/core/indexing_utils.py | 3 +-- python/cudf/cudf/core/multiindex.py | 4 ---- python/cudf/cudf/core/series.py | 11 +++++------ 
python/cudf/cudf/core/single_column_frame.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 5 ++--- 12 files changed, 28 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9ba2d161619..479f87bb78b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -20,7 +20,6 @@ from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( - is_bool_dtype, is_integer, is_integer_dtype, is_list_like, @@ -610,10 +609,8 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. @@ -2154,7 +2151,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f3183e6029d..4a36be76b6d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,11 +7,7 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -112,7 +108,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd3664ecac4..dbdf501e022 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -619,7 +618,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +643,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -1083,7 +1082,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py 
index 7f05a5f91a1..cea68c88c90 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,7 +13,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import ( - is_bool_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -159,7 +158,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -264,7 +263,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2121e623c1c..b3d938829c9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,7 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -171,7 +170,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -320,7 +319,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -3678,8 +3677,8 @@ def agg(self, aggs, axis=None): """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -6305,8 +6304,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8659d7c2392..d2c75715be2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -1534,7 +1534,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..9c81b0eb607 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -10,7 +10,6 @@ import cudf from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, ) @@ -230,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3ed72ff812a..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -841,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] | list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..e12cc3d52fb 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,7 +22,6 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, is_integer_dtype, @@ -221,10 +220,10 @@ def __setitem__(self, key, value): f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -3221,7 +3220,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..04c7db7a53c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,7 +11,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, is_numeric_dtype, @@ -361,7 +360,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = cudf.core.column.column_empty(0, dtype="int32") if is_integer_dtype(arg.dtype): return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ccf83e424c..2009fc49ce5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ 
b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..9eba6122d26 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, ) From 669db3ea4a0c24a343c5619dd00904ad22ea215b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 14:24:58 +0100 Subject: [PATCH 064/101] Fix logic in to_arrow for empty list column (#16279) An empty list column need not have empty children; it just needs to have zero length. In this case, the offsets array will have zero length, and we need to create a temporary buffer. Now that this branch runs, fix two errors in the construction of the arrow array: 1. The element type, if there are children, should be taken from the child array; 2. If the child arrays are empty, we must make an empty null array, rather than passing a null pointer as the values array; otherwise we hit a segfault inside arrow. The previous fix in #16201 correctly handled the empty children case (except for point two), but not the first case, which we do here. Since we weren't previously going down this code path (child_arrays was never empty), we never hit the latent segfault from point two. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16279 --- cpp/src/interop/to_arrow.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 8c4be1b50a5..622a3aba4bb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -378,13 +378,11 @@ std::shared_ptr dispatch_to_arrow::operator()( auto children_meta = metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty()) { - // Empty list will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); - memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); - - return std::make_shared( - arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr); + if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { + auto element_type = child_arrays.empty() ?
arrow::null() : child_arrays[1]->type(); + auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); + return result.ValueUnsafe(); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; From a6de6cc23702ed71b80625f461a90e910a33642f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 15:42:12 +0100 Subject: [PATCH 065/101] Introduce version file so we can conditionally handle things in tests (#16280) We decided we would attempt to support a range of versions back to 1.0. We'll test with the oldest and newest versions we support. To facilitate this, introduce some versioning constants. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16280 --- python/cudf_polars/cudf_polars/dsl/ir.py | 7 ++++- .../cudf_polars/cudf_polars/utils/versions.py | 28 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 5 ++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/cudf_polars/utils/versions.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 5e6544ef77c..cce0c4a3d94 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -313,7 +313,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if row_index is not None: + if ( + row_index is not None + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + and row_index[0] in self.schema + ): name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py new file mode 100644 index 00000000000..a9ac14c25aa --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0 + +"""Version utilities so that cudf_polars supports a range of polars versions.""" + +# ruff: noqa: SIM300 +from __future__ import annotations + +from packaging.version import parse + +from polars import __version__ + +POLARS_VERSION = parse(__version__) + +POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") +POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") +POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") +POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") +POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") + +POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") +POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") +POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") +POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") + +if POLARS_VERSION < parse("1.0"): # pragma: no cover + raise ImportError("cudf_polars requires py-polars v1.0 or greater.") diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index c41a94da14b..d0c41090433 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,6 +10,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture( @@ -97,6 +98,10 @@ def test_scan_unsupported_raises(tmp_path): assert_ir_translation_raises(q, NotImplementedError) +@pytest.mark.xfail( + versions.POLARS_VERSION_LT_11, + reason="https://github.com/pola-rs/polars/issues/15730", +) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) From 3418f915d1a1ff82a72918d978924dfad2645a5a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 16 Jul 2024 12:38:47 -0500 Subject: [PATCH 066/101] Introduce dedicated options for low memory readers (#16289) This PR disables the low-memory readers by default in `cudf.pandas` and instead provides dedicated options to enable them.
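As a rough usage sketch (option names are taken from this diff; the file paths are hypothetical):

    import cudf

    # Opt in to the chunked, low-memory parquet reader explicitly.
    with cudf.option_context("io.parquet.low_memory", True):
        df = cudf.read_parquet("data.parquet")  # hypothetical path

    # The chunked JSON path only applies to JSON-lines input (lines=True).
    with cudf.option_context("io.json.low_memory", True):
        jdf = cudf.read_json("data.jsonl", lines=True)  # hypothetical path

These code paths were previously gated on "mode.pandas_compatible"; they are now opt-in via the dedicated options above.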
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16289 --- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/options.py | 26 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 2 +- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 853dd431099..03bf9ed8b75 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -99,7 +99,7 @@ cpdef read_json(object filepaths_or_buffers, else: raise TypeError("`dtype` must be 'list like' or 'dict'") - if cudf.get_option("mode.pandas_compatible") and lines: + if cudf.get_option("io.json.low_memory") and lines: res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fd0792b5edb..02b26ea1c01 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -916,7 +916,7 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - if cudf.get_option("mode.pandas_compatible"): + if cudf.get_option("io.parquet.low_memory"): return libparquet.ParquetReader( filepaths_or_buffers, columns=columns, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f539e7f266..94e73021cec 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -325,6 +325,32 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "io.parquet.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire parquet in one go. + If set to `True`, reads parquet file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + +_register_option( + "io.json.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire json in one go. + If set to `True`, reads json file in chunks. + \tValid values are True or False. Default is False. 
+ """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7771afd692f..c81c2d1d94b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1441,6 +1441,6 @@ def test_chunked_json_reader(): df.to_json(buf, lines=True, orient="records", engine="cudf") buf.seek(0) df = df.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.json.low_memory", True): gdf = cudf.read_json(buf, lines=True) assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ff0c9040737..ecb7fd44422 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3772,6 +3772,6 @@ def test_parquet_reader_pandas_compatibility(): ) buffer = BytesIO() df.to_parquet(buffer) - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) From e2b7e4370c8513811e9c72b30f499a5614b49f7c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 16 Jul 2024 14:20:00 -0400 Subject: [PATCH 067/101] Build and test with CUDA 12.5.1 (#16259) This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1. Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16259 --- .../cuda12.2-conda/devcontainer.json | 8 ++-- .devcontainer/cuda12.2-pip/devcontainer.json | 10 ++-- .github/workflows/build.yaml | 20 ++++---- .github/workflows/pandas-tests.yaml | 4 +- .github/workflows/pr.yaml | 48 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 22 ++++----- CONTRIBUTING.md | 2 +- README.md | 2 +- ..._64.yaml => all_cuda-125_arch-x86_64.yaml} | 4 +- dependencies.yaml | 6 ++- 11 files changed, 68 insertions(+), 64 deletions(-) rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (97%) diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 05bf9173d25..fadce01d060 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p 
${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 74420214726..026eb540952 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..937080572ad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a8643923a4d..1516cb09449 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,9 +17,9 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ceee9074b93..1fe64e7f318 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,10 +174,10 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,9 +194,9 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. 
@@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ca971dc28d..2a8ebd30993 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..73f8d726e77 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fbc28fa6e1..f9cdde7c2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below. 
# create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 17d2df9a936..1ab6a2d7457 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 97% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index c32d21c5d36..3f5fae49cbb 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 @@ -96,4 +96,4 @@ dependencies: - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 27621ff9a3f..67ed3773b44 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - build_base @@ -402,6 +402,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda From 05ea7c9cf6a0fd39384e2044b4c9b46f543d4ad0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:51:20 -0700 Subject: [PATCH 068/101] Fix tests for polars 1.2 (#16292) Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16292 --- python/cudf_polars/tests/test_groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 50adca01950..b07d8e38217 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture @@ -100,7 +101,7 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): with pytest.raises(AssertionError): # https://github.com/pola-rs/polars/issues/17556 assert_gpu_result_equal(q, check_exact=False) - if schema[sort_keys[1]] == pl.Boolean(): + if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): # https://github.com/pola-rs/polars/issues/17557 with pytest.raises(AssertionError): assert_gpu_result_equal(qsorted, check_exact=False) From 62191103032706371d76ce83c6ec59d13376b231 Mon Sep 17 00:00:00 2001 From: Matthew Murray 
<41342305+Matt711@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:49 -0400 Subject: [PATCH 069/101] [BUG] Make name attr of Index fast slow attrs (#16270) Debugging the spike in failures from #16234 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16270 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 36 ++++++++------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index d3a3488081a..59a243dd7c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,18 +260,14 @@ def Index__new__(cls, *args, **kwargs): return self -def name(self): - return self._fsproxy_wrapped._name - - def Index__setattr__(self, name, value): if name.startswith("_"): object.__setattr__(self, name, value) return if name == "name": - setattr(self._fsproxy_wrapped, "_name", value) + setattr(self._fsproxy_wrapped, "name", value) if name == "names": - setattr(self._fsproxy_wrapped, "_names", value) + setattr(self._fsproxy_wrapped, "names", value) return _FastSlowAttribute("__setattr__").__get__(self, type(self))( name, value ) @@ -300,7 +296,7 @@ def Index__setattr__(self, name, value): "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -314,7 +310,7 @@ def Index__setattr__(self, name, value): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -345,7 +341,7 @@ def Index__setattr__(self, name, value): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -375,10 +371,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -412,10 +408,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -470,10 +466,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -508,10 +504,6 @@ def Index__setattr__(self, name, value): ) -def names(self): - return self._fsproxy_wrapped._names - - MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -522,7 +514,7 @@ def names(self): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(names), + "names": _FastSlowAttribute("names"), }, ) @@ -709,10 +701,10 
@@ def names(self): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) From 6a954e299d97f69a62fd184529fa7d5f29c0e09f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:02:20 -0700 Subject: [PATCH 070/101] Migrate expressions to pylibcudf (#16056) xref #15162 Migrates expressions to use pylibcudf. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16056 --- .../api_docs/pylibcudf/datetime.rst | 6 +- .../api_docs/pylibcudf/expressions.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/expressions.pyx | 156 -------------- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 1 + .../cudf/_lib/{ => pylibcudf}/expressions.pxd | 29 ++- .../cudf/cudf/_lib/pylibcudf/expressions.pyx | 195 ++++++++++++++++++ .../_lib/pylibcudf/libcudf/CMakeLists.txt | 4 +- .../_lib/pylibcudf/libcudf/expressions.pxd | 103 ++++----- .../_lib/pylibcudf/libcudf/expressions.pyx | 0 python/cudf/cudf/_lib/transform.pyx | 2 +- .../cudf/cudf/core/_internals/expressions.py | 11 +- .../cudf/pylibcudf_tests/test_expressions.py | 50 +++++ 18 files changed, 335 insertions(+), 237 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst delete mode 100644 python/cudf/cudf/_lib/expressions.pyx rename python/cudf/cudf/_lib/{ => pylibcudf}/expressions.pxd (50%) create mode 100644 python/cudf/cudf/_lib/pylibcudf/expressions.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_expressions.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst index ebf5fab3052..558268ea495 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -1,6 +1,6 @@ -======= -copying -======= +======== +datetime +======== .. automodule:: cudf._lib.pylibcudf.datetime :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst new file mode 100644 index 00000000000..03f769ee861 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst @@ -0,0 +1,6 @@ +=========== +expressions +=========== + +.. automodule:: cudf._lib.pylibcudf.expressions + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 5899d272160..505765bba0f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -15,6 +15,7 @@ This page provides API documentation for pylibcudf.
concatenate copying datetime + expressions filling gpumemoryview groupby diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5a067e84f56..38b7e9ebe04 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources copying.pyx csv.pyx datetime.pyx - expressions.pyx filling.pyx groupby.pyx hash.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 18b95f5f2e1..34c0e29d0b1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,7 +8,6 @@ copying, csv, datetime, - expressions, filling, groupby, hash, diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx deleted file mode 100644 index 3fb29279ed7..00000000000 --- a/python/cudf/cudf/_lib/expressions.pyx +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from enum import Enum - -import numpy as np - -from cython.operator cimport dereference -from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_us, -) - -# Necessary for proper casting, see below. -ctypedef int32_t underlying_type_ast_operator - - -# Aliases for simplicity -ctypedef unique_ptr[libcudf_exp.expression] expression_ptr - - -class ASTOperator(Enum): - ADD = libcudf_exp.ast_operator.ADD - SUB = libcudf_exp.ast_operator.SUB - MUL = libcudf_exp.ast_operator.MUL - DIV = libcudf_exp.ast_operator.DIV - TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV - MOD = libcudf_exp.ast_operator.MOD - PYMOD = libcudf_exp.ast_operator.PYMOD - POW = libcudf_exp.ast_operator.POW - EQUAL = libcudf_exp.ast_operator.EQUAL - NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL - LESS = libcudf_exp.ast_operator.LESS - GREATER = libcudf_exp.ast_operator.GREATER - LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR - # Unary operators - IDENTITY = libcudf_exp.ast_operator.IDENTITY - IS_NULL = libcudf_exp.ast_operator.IS_NULL - SIN = libcudf_exp.ast_operator.SIN - COS = libcudf_exp.ast_operator.COS - TAN = libcudf_exp.ast_operator.TAN - ARCSIN = libcudf_exp.ast_operator.ARCSIN - ARCCOS = libcudf_exp.ast_operator.ARCCOS - ARCTAN = libcudf_exp.ast_operator.ARCTAN - SINH = libcudf_exp.ast_operator.SINH - COSH = libcudf_exp.ast_operator.COSH - TANH = libcudf_exp.ast_operator.TANH - ARCSINH = libcudf_exp.ast_operator.ARCSINH - ARCCOSH = libcudf_exp.ast_operator.ARCCOSH - ARCTANH = libcudf_exp.ast_operator.ARCTANH - EXP = libcudf_exp.ast_operator.EXP - LOG = libcudf_exp.ast_operator.LOG - SQRT 
= libcudf_exp.ast_operator.SQRT - CBRT = libcudf_exp.ast_operator.CBRT - CEIL = libcudf_exp.ast_operator.CEIL - FLOOR = libcudf_exp.ast_operator.FLOOR - ABS = libcudf_exp.ast_operator.ABS - RINT = libcudf_exp.ast_operator.RINT - BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT - NOT = libcudf_exp.ast_operator.NOT - - -class TableReference(Enum): - LEFT = libcudf_exp.table_reference.LEFT - RIGHT = libcudf_exp.table_reference.RIGHT - - -# Note that this function only currently supports numeric literals. libcudf -# expressions don't really support other types yet though, so this isn't -# restrictive at the moment. -cdef class Literal(Expression): - def __cinit__(self, value): - if isinstance(value, int): - self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, float): - self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, str): - self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, np.datetime64): - scale, _ = np.datetime_data(value.dtype) - int_value = value.astype(np.int64) - if scale == "ms": - self.c_scalar.reset(new timestamp_scalar[timestamp_ms]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif scale == "us": - self.c_scalar.reset(new timestamp_scalar[timestamp_us]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - else: - raise NotImplementedError( - f"Unhandled datetime scale {scale=}" - ) - else: - raise NotImplementedError( - f"Don't know how to make literal with type {type(value)}" - ) - - -cdef class ColumnReference(Expression): - def __cinit__(self, size_type index): - self.c_obj = move(make_unique[libcudf_exp.column_reference]( - index - )) - - -cdef class Operation(Expression): - def __cinit__(self, op, Expression left, Expression right=None): - cdef libcudf_exp.ast_operator op_value = ( - op.value - ) - - if right is None: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj) - )) - else: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) - )) - -cdef class ColumnNameReference(Expression): - def __cinit__(self, string name): - self.c_obj = \ - move(make_unique[libcudf_exp.column_name_reference](name)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 158fb6051c3..e7959d21e01 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt 
b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index a2d11bbea6e..0800fa18e94 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + expressions.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index da2b7806203..26e89b818d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( concatenate, copying, datetime, + expressions, filling, groupby, join, diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index acbc84d7177..e89a5ed9f96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + expressions, filling, groupby, interop, diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd similarity index 50% rename from python/cudf/cudf/_lib/expressions.pxd rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd index 4a20c5fc545..64825b89d9f 100644 --- a/python/cudf/cudf/_lib/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd @@ -1,36 +1,31 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t +# Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._lib.pylibcudf.libcudf.expressions cimport ( - column_reference, + ast_operator, expression, - literal, - operation, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - numeric_scalar, - scalar, - string_scalar, - timestamp_scalar, + table_reference, ) +from .scalar cimport Scalar + cdef class Expression: cdef unique_ptr[expression] c_obj - cdef class Literal(Expression): - cdef unique_ptr[scalar] c_scalar - + # Hold on to input scalar so it doesn't get gc'ed + cdef Scalar scalar cdef class ColumnReference(Expression): pass - cdef class Operation(Expression): - pass + # Hold on to the input expressions so + # they don't get gc'ed + cdef Expression right + cdef Expression left cdef class ColumnNameReference(Expression): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx new file mode 100644 index 00000000000..38de11406ad --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -0,0 +1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from cudf._lib.pylibcudf.libcudf.expressions import \
+    ast_operator as ASTOperator  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.expressions import \
+    table_reference as TableReference  # no-cython-lint
+
+from cython.operator cimport dereference
+from libc.stdint cimport int32_t, int64_t
+from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+    duration_scalar,
+    numeric_scalar,
+    string_scalar,
+    timestamp_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id
+from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport (
+    duration_ms,
+    duration_ns,
+    duration_s,
+    duration_us,
+)
+from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
+    timestamp_ms,
+    timestamp_ns,
+    timestamp_s,
+    timestamp_us,
+)
+
+from .scalar cimport Scalar
+from .traits cimport is_chrono, is_numeric
+from .types cimport DataType
+
+# Aliases for simplicity
+ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
+
+cdef class Literal(Expression):
+    """
+    A literal value used in an abstract syntax tree.
+
+    For details, see :cpp:class:`cudf::ast::literal`.
+
+    Parameters
+    ----------
+    value : Scalar
+        The Scalar value of the Literal.
+        Must be either numeric, string, or a timestamp/duration scalar.
+    """
+    def __cinit__(self, Scalar value):
+        self.scalar = value
+        cdef DataType typ = value.type()
+        cdef type_id tid = value.type().id()
+        if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING):
+            raise ValueError(
+                "Only numeric, string, or timestamp/duration scalars are accepted"
+            )
+        # TODO: Accept type-erased scalar in AST C++ code
+        # Then a lot of this code can be deleted
+        if tid == type_id.INT64:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int64_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.INT32:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int32_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.FLOAT64:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[double] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.FLOAT32:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[float] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.STRING:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <string_scalar &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_NANOSECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_ns] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_MICROSECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_us] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_MILLISECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_SECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_s] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_NANOSECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_ns] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_MICROSECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_us] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_MILLISECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_SECONDS:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_s] &>dereference(self.scalar.c_obj)
+            ))
+        else:
+            raise NotImplementedError(
+                f"Don't know how to make literal with type id {tid}"
+            )
+
+cdef class ColumnReference(Expression):
+    """
+    An expression referring to data from a column in a table.
+
+    For details, see :cpp:class:`cudf::ast::column_reference`.
+
+    Parameters
+    ----------
+    index : size_type
+        The index of this column in the table
+        (provided when the expression is evaluated).
+    table_source : TableReference, default TableReference.LEFT
+        Which table to use in cases with two tables (e.g. joins)
+    """
+    def __cinit__(
+        self,
+        size_type index,
+        table_reference table_source=table_reference.LEFT
+    ):
+        self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.column_reference](
+            index, table_source
+        ))
+
+
+cdef class Operation(Expression):
+    """
+    An operation expression holds an operator and zero or more operands.
+
+    For details, see :cpp:class:`cudf::ast::operation`.
+
+    Parameters
+    ----------
+    op : ASTOperator
+    left : Expression
+        Left input expression (left operand)
+    right : Expression, default None
+        Right input expression (right operand).
+        You should only pass this if the operation is a binary operation.
+    """
+    def __cinit__(self, ast_operator op, Expression left, Expression right=None):
+        self.left = left
+        self.right = right
+        if right is None:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.operation](
+                op, dereference(left.c_obj)
+            ))
+        else:
+            self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.operation](
+                op, dereference(left.c_obj), dereference(right.c_obj)
+            ))
+
+cdef class ColumnNameReference(Expression):
+    """
+    An expression referring to a column in a table by name.
+
+    For details, see :cpp:class:`cudf::ast::column_name_reference`.
+
+    Parameters
+    ----------
+    column_name : str
+        Name of this column in the table metadata
+        (provided when the expression is evaluated).
+    """
+    def __cinit__(self, str name):
+        self.c_obj = <expression_ptr> \
+            move(make_unique[libcudf_exp.column_name_reference](
+                <string>(name.encode("utf-8"))
+            ))
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index 699e85ce567..b04e94f1546 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx
-    stream_compaction.pyx types.pyx unary.pyx
+set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx
+    round.pyx stream_compaction.pyx types.pyx unary.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
index 279d969db50..427e16d4ff8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: - ctypedef enum ast_operator: + cpdef enum class ast_operator(int32_t): # Binary operators - ADD "cudf::ast::ast_operator::ADD" - SUB "cudf::ast::ast_operator::SUB" - MUL "cudf::ast::ast_operator::MUL" - DIV "cudf::ast::ast_operator::DIV" - TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" - FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" - MOD "cudf::ast::ast_operator::MOD" - PYMOD "cudf::ast::ast_operator::PYMOD" - POW "cudf::ast::ast_operator::POW" - EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" - NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" - LESS "cudf::ast::ast_operator::LESS" - GREATER "cudf::ast::ast_operator::GREATER" - LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" - BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" - BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" - BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" - LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" - LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + EQUAL + NULL_EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + BITWISE_AND + BITWISE_OR + BITWISE_XOR + NULL_LOGICAL_AND + LOGICAL_AND + NULL_LOGICAL_OR + LOGICAL_OR # Unary operators - IDENTITY "cudf::ast::ast_operator::IDENTITY" - IS_NULL "cudf::ast::ast_operator::IS_NULL" - SIN "cudf::ast::ast_operator::SIN" - COS "cudf::ast::ast_operator::COS" - TAN "cudf::ast::ast_operator::TAN" - ARCSIN "cudf::ast::ast_operator::ARCSIN" - ARCCOS "cudf::ast::ast_operator::ARCCOS" - ARCTAN "cudf::ast::ast_operator::ARCTAN" - SINH "cudf::ast::ast_operator::SINH" - COSH "cudf::ast::ast_operator::COSH" - TANH "cudf::ast::ast_operator::TANH" - ARCSINH "cudf::ast::ast_operator::ARCSINH" - ARCCOSH "cudf::ast::ast_operator::ARCCOSH" - ARCTANH "cudf::ast::ast_operator::ARCTANH" - EXP "cudf::ast::ast_operator::EXP" - LOG "cudf::ast::ast_operator::LOG" - SQRT "cudf::ast::ast_operator::SQRT" - CBRT "cudf::ast::ast_operator::CBRT" - CEIL "cudf::ast::ast_operator::CEIL" - FLOOR "cudf::ast::ast_operator::FLOOR" - ABS "cudf::ast::ast_operator::ABS" - RINT "cudf::ast::ast_operator::RINT" - BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" - NOT "cudf::ast::ast_operator::NOT" + IDENTITY + IS_NULL + SIN + COS + TAN + ARCSIN + ARCCOS + ARCTAN + SINH + COSH + TANH + ARCSINH + ARCCOSH + ARCTANH + EXP + LOG + SQRT + CBRT + CEIL + FLOOR + ABS + RINT + BIT_INVERT + NOT cdef cppclass expression: pass - ctypedef enum table_reference: - LEFT "cudf::ast::table_reference::LEFT" - RIGHT "cudf::ast::table_reference::RIGHT" + cpdef enum class table_reference(int32_t): + LEFT + RIGHT cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 86a4a60eef1..622725e06a3 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ 
b/python/cudf/cudf/_lib/transform.pyx @@ -19,8 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.pylibcudf cimport transform as plc_transform +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 393a68dd844..63714a78572 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -4,7 +4,10 @@ import ast import functools -from cudf._lib.expressions import ( +import pyarrow as pa + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, @@ -122,7 +125,9 @@ def visit_Constant(self, node): f"Unsupported literal {repr(node.value)} of type " "{type(node.value).__name__}" ) - self.stack.append(Literal(node.value)) + self.stack.append( + Literal(plc.interop.from_arrow(pa.scalar(node.value))) + ) def visit_UnaryOp(self, node): self.visit(node.operand) @@ -132,7 +137,7 @@ def visit_UnaryOp(self, node): # operand, so there's no way to know whether this should be a float # or an int. We should maybe see what Spark does, and this will # probably require casting. - self.nodes.append(Literal(-1)) + self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) op = ASTOperator.MUL self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) elif isinstance(node.op, ast.UAdd): diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py new file mode 100644 index 00000000000..f661512caad --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + +# We can't really evaluate these expressions, so just make sure +# construction works properly + + +def test_literal_construction_invalid(): + with pytest.raises(ValueError): + plc.expressions.Literal( + plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64()))) + ) + + +@pytest.mark.parametrize( + "tableref", + [ + plc.expressions.TableReference.LEFT, + plc.expressions.TableReference.RIGHT, + ], +) +def test_columnref_construction(tableref): + plc.expressions.ColumnReference(1.0, tableref) + + +def test_columnnameref_construction(): + plc.expressions.ColumnNameReference("abc") + + +@pytest.mark.parametrize( + "kwargs", + [ + # Unary op + { + "op": plc.expressions.ASTOperator.IDENTITY, + "left": plc.expressions.ColumnReference(1), + }, + # Binop + { + "op": plc.expressions.ASTOperator.ADD, + "left": plc.expressions.ColumnReference(1), + "right": plc.expressions.ColumnReference(2), + }, + ], +) +def test_astoperation_construction(kwargs): + plc.expressions.Operation(**kwargs) From 2f8d514b1687164a94bbe89da1dab8eb37682b35 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:15:25 -0400 Subject: [PATCH 071/101] Remove xml from sort_ninja_log.py utility (#16274) Removes xml support from the `sort_ninja_log.py` utility. 
The xml support was experimental for possible use with Jenkins reporting that never materialized. This script is used in build.sh generally when running local builds. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Srinivas Yadav (https://github.com/srinivasyadav18) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16274 --- cpp/scripts/sort_ninja_log.py | 58 ++++++----------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 3fe503f749e..42f84e4d0c7 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # import argparse import os @@ -9,14 +9,12 @@ from xml.dom import minidom parser = argparse.ArgumentParser() -parser.add_argument( - "log_file", type=str, default=".ninja_log", help=".ninja_log file" -) +parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file") parser.add_argument( "--fmt", type=str, default="csv", - choices=["csv", "xml", "html"], + choices=["csv", "html"], help="output format (to stdout)", ) parser.add_argument( @@ -37,6 +35,7 @@ output_fmt = args.fmt cmp_file = args.cmp_log + # build a map of the log entries def build_log_map(log_file): entries = {} @@ -68,37 +67,6 @@ def build_log_map(log_file): return entries -# output results in XML format -def output_xml(entries, sorted_list, args): - root = ET.Element("testsuites") - testsuite = ET.Element( - "testsuite", - attrib={ - "name": "build-time", - "tests": str(len(sorted_list)), - "failures": str(0), - "errors": str(0), - }, - ) - root.append(testsuite) - for name in sorted_list: - entry = entries[name] - build_time = float(entry[1] - entry[0]) / 1000 - item = ET.Element( - "testcase", - attrib={ - "classname": "BuildTime", - "name": name, - "time": str(build_time), - }, - ) - testsuite.append(item) - - tree = ET.ElementTree(root) - xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") - print(xmlstr) - - # utility converts a millisecond value to a column width in pixels def time_to_width(value, end): # map a value from (0,end) to (0,1000) @@ -282,9 +250,7 @@ def output_html(entries, sorted_list, cmp_entries, args): # output detail table in build-time descending order print("
") - print( - "", "", "", sep="" - ) + print("", "", "", sep="") if cmp_entries: print("", sep="") print("") @@ -303,9 +269,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", sep="", end="") print("", sep="", end="") # output diff column - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff_time_str = format_build_time(diff_time) @@ -353,7 +317,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print( "time change < 20%% or build time < 1 minute", + ">time change < 20% or build time < 1 minute", ) print("
FileCompile timeSize
FileCompile timeSizet-cmp
", build_time_str, "", file_size_str, "
") @@ -370,9 +334,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None print(build_time, file_size, name, sep=",", end="") if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) @@ -396,9 +358,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): # load the comparison build log if available cmp_entries = build_log_map(cmp_file) if cmp_file else None -if output_fmt == "xml": - output_xml(entries, sorted_list, args) -elif output_fmt == "html": +if output_fmt == "html": output_html(entries, sorted_list, cmp_entries, args) else: output_csv(entries, sorted_list, cmp_entries, args) From 093bcc94ccf156a7e39339a7c4bb7e86543187de Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:16:07 -0400 Subject: [PATCH 072/101] Update cudf::detail::grid_1d to use thread_index_type (#16276) Updates the `cudf::detail::grid_1d` to use `thread_index_type` instead of `int` and `size_type` for the number threads and blocks. This has become important for launching kernels with more threads than max `size_type` total bytes for warp-per-row and thread-per-byte algorithms. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16276 --- cpp/include/cudf/detail/utilities/cuda.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index f1775c6d6d7..5007af7f9f1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - int const num_threads_per_block; - int const num_blocks; + thread_index_type const num_threads_per_block; + thread_index_type const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -55,9 +55,9 @@ class grid_1d { * than a single element; this affects the number of threads the grid must * contain */ - grid_1d(cudf::size_type overall_num_elements, - cudf::size_type num_threads_per_block, - cudf::size_type elements_per_thread = 1) + grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread = 1) : num_threads_per_block(num_threads_per_block), num_blocks(util::div_rounding_up_safe(overall_num_elements, elements_per_thread * num_threads_per_block)) From aa466aaf91bc329cc4fced9b9a3426d79bfe7ffc Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 17 Jul 2024 10:48:17 -0400 Subject: [PATCH 073/101] Move kernel vis over to CUDF_HIDDEN (#16165) Use CUDF_HIDDEN instead of the raw `__attribute__((visibility("hidden")))` for symbol visibility controls on the CUDA kernels that we call from multiple TUs. 
This is primarily a style change so that we have consistent visibility markup across the entire project Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16165 --- cpp/src/join/mixed_join_kernel.cuh | 3 ++- cpp/src/join/mixed_join_kernels_semi.cu | 3 ++- cpp/src/join/mixed_join_size_kernel.cuh | 28 ++++++++++++------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 0fc1c3718b1..ea59f23c77f 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 01e3fe09b38..1f31eaa7878 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 618e7a9082e..00a90f8273f 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -35,20 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest

From 9db6723f2f2fe3451f0a5b81b7a43597358913ea Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Wed, 17 Jul 2024 09:54:09 -0700
Subject: [PATCH 074/101] Rename `.devcontainer`s for CUDA 12.5 (#16293)

Follow up to PR: https://github.com/rapidsai/cudf/pull/16259

Partially addresses issue: https://github.com/rapidsai/build-planning/issues/73

Renames the `.devcontainer`s for CUDA 12.5

Authors:
  - https://github.com/jakirkham

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Paul Taylor (https://github.com/trxcllnt)

URL: https://github.com/rapidsai/cudf/pull/16293
---
 .../{cuda12.2-conda => cuda12.5-conda}/devcontainer.json       | 0
 .devcontainer/{cuda12.2-pip => cuda12.5-pip}/devcontainer.json | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename .devcontainer/{cuda12.2-conda => cuda12.5-conda}/devcontainer.json (100%)
 rename .devcontainer/{cuda12.2-pip => cuda12.5-pip}/devcontainer.json (100%)

diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
similarity index 100%
rename from .devcontainer/cuda12.2-conda/devcontainer.json
rename to .devcontainer/cuda12.5-conda/devcontainer.json
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
similarity index 100%
rename from .devcontainer/cuda12.2-pip/devcontainer.json
rename to .devcontainer/cuda12.5-pip/devcontainer.json

From 1dd63ea8b28339c3b4a351b82dd81d425d985ba3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 17 Jul 2024 08:47:32 -1000
Subject: [PATCH 075/101] Short circuit some Column methods (#16246)

Adds some short-circuiting, possibly cached checks (e.g. all values unique, no NAs, monotonicity) to `dropna`, `isnull`, `notnull`, `argsort`, `unique` and `sort_values`, allowing these ops to simply copy or return a "simplified" result.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16246
---
 python/cudf/cudf/_lib/column.pyx       | 12 ++++---
 python/cudf/cudf/core/column/column.py | 50 ++++++++++++++++++++------
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 7155017b7af..e030147fdd3 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -202,11 +202,13 @@ cdef class Column:
 
     def _clear_cache(self):
         self._distinct_count = {}
-        try:
-            del self.memory_usage
-        except AttributeError:
-            # `self.memory_usage` was never called before, So ignore.
-            pass
+        attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing")
+        for attr in attrs:
+            try:
+                delattr(self, attr)
+            except AttributeError:
+                # attr was not called yet, so ignore.
+ pass self._null_count = None def set_mask(self, value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dbdf501e022..9467bbeed15 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -274,7 +274,10 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self) -> Self: - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + if self.has_nulls(): + return drop_nulls([self])[0]._with_type_metadata(self.dtype) + else: + return self.copy() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -699,6 +702,9 @@ def fillna( def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(False, length=len(self)) + result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -710,6 +716,9 @@ def isnull(self) -> ColumnBase: def notnull(self) -> ColumnBase: """Identify non-missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(True, length=len(self)) + result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -922,15 +931,16 @@ def as_mask(self) -> Buffer: @property def is_unique(self) -> bool: + # distinct_count might already be cached return self.distinct_count(dropna=False) == len(self) - @property + @cached_property def is_monotonic_increasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None @@ -941,6 +951,10 @@ def sort_values( ascending: bool = True, na_position: str = "last", ) -> ColumnBase: + if (not ascending and self.is_monotonic_decreasing) or ( + ascending and self.is_monotonic_increasing + ): + return self.copy() return libcudf.sort.sort( [self], column_order=[ascending], null_precedence=[na_position] )[0] @@ -1090,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: str = "last" - ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + self, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> cudf.core.column.NumericalColumn: + if (ascending and self.is_monotonic_increasing) or ( + not ascending and self.is_monotonic_decreasing + ): + return as_column(range(len(self))) + elif (ascending and self.is_monotonic_decreasing) or ( + not ascending and self.is_monotonic_increasing + ): + return as_column(range(len(self) - 1, -1, -1)) + else: + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1157,9 +1182,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + if self.is_unique: + return self.copy() + else: + return drop_duplicates([self], keep="first")[ + 0 + ]._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: From 8b767e5c237840e0a35848bff7ed479ec5c56bb1 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:33:45 -0600 Subject: [PATCH 076/101] Remove 
decimal/floating 64/128bit switches due to register pressure (#16287) The decimal <--> floating conversion PR reduced the performance of some of the AST and BINARYOP kernels due to register pressure. This removes the switches that are the primary source of the register pressure, falling back to the old ipow() method for 64bit and 128bit integers. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16287 --- cpp/include/cudf/fixed_point/fixed_point.hpp | 4 +- .../cudf/fixed_point/floating_conversion.hpp | 138 +----------------- 2 files changed, 6 insertions(+), 136 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6c3c3b4da07..c9cbc603226 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -84,8 +84,8 @@ template && - is_supported_representation_type())>* = nullptr> -CUDF_HOST_DEVICE inline Rep ipow(T exponent) + cuda::std::is_integral_v)>* = nullptr> +CUDF_HOST_DEVICE inline constexpr Rep ipow(T exponent) { cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index c64ae8877d4..f12177c6a4b 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -392,30 +392,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10) template )> CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. - switch (pow10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - default: return 0; - } + return value / ipow(pow10); } /** @@ -429,49 +406,7 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10) template )> CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10) { - // See comments in divide_power10_32bit() for an introduction. 
- switch (pow10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - case 20: return value / large_power_of_10<20>(); - case 21: return value / large_power_of_10<21>(); - case 22: return value / large_power_of_10<22>(); - case 23: return value / large_power_of_10<23>(); - case 24: return value / large_power_of_10<24>(); - case 25: return value / large_power_of_10<25>(); - case 26: return value / large_power_of_10<26>(); - case 27: return value / large_power_of_10<27>(); - case 28: return value / large_power_of_10<28>(); - case 29: return value / large_power_of_10<29>(); - case 30: return value / large_power_of_10<30>(); - case 31: return value / large_power_of_10<31>(); - case 32: return value / large_power_of_10<32>(); - case 33: return value / large_power_of_10<33>(); - case 34: return value / large_power_of_10<34>(); - case 35: return value / large_power_of_10<35>(); - case 36: return value / large_power_of_10<36>(); - case 37: return value / large_power_of_10<37>(); - case 38: return value / large_power_of_10<38>(); - default: return 0; - } + return value / ipow<__uint128_t, Radix::BASE_10>(pow10); } /** @@ -512,30 +447,7 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10) template )> CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. - switch (pow10) { - case 0: return value; - case 1: return value * 10U; - case 2: return value * 100U; - case 3: return value * 1000U; - case 4: return value * 10000U; - case 5: return value * 100000U; - case 6: return value * 1000000U; - case 7: return value * 10000000U; - case 8: return value * 100000000U; - case 9: return value * 1000000000U; - case 10: return value * 10000000000ULL; - case 11: return value * 100000000000ULL; - case 12: return value * 1000000000000ULL; - case 13: return value * 10000000000000ULL; - case 14: return value * 100000000000000ULL; - case 15: return value * 1000000000000000ULL; - case 16: return value * 10000000000000000ULL; - case 17: return value * 100000000000000000ULL; - case 18: return value * 1000000000000000000ULL; - case 19: return value * 10000000000000000000ULL; - default: return 0; - } + return value * ipow(pow10); } /** @@ -549,49 +461,7 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10) template )> CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10) { - // See comments in divide_power10_128bit() for discussion. 
-  switch (pow10) {
-    case 0: return value;
-    case 1: return value * 10U;
-    case 2: return value * 100U;
-    case 3: return value * 1000U;
-    case 4: return value * 10000U;
-    case 5: return value * 100000U;
-    case 6: return value * 1000000U;
-    case 7: return value * 10000000U;
-    case 8: return value * 100000000U;
-    case 9: return value * 1000000000U;
-    case 10: return value * 10000000000ULL;
-    case 11: return value * 100000000000ULL;
-    case 12: return value * 1000000000000ULL;
-    case 13: return value * 10000000000000ULL;
-    case 14: return value * 100000000000000ULL;
-    case 15: return value * 1000000000000000ULL;
-    case 16: return value * 10000000000000000ULL;
-    case 17: return value * 100000000000000000ULL;
-    case 18: return value * 1000000000000000000ULL;
-    case 19: return value * 10000000000000000000ULL;
-    case 20: return value * large_power_of_10<20>();
-    case 21: return value * large_power_of_10<21>();
-    case 22: return value * large_power_of_10<22>();
-    case 23: return value * large_power_of_10<23>();
-    case 24: return value * large_power_of_10<24>();
-    case 25: return value * large_power_of_10<25>();
-    case 26: return value * large_power_of_10<26>();
-    case 27: return value * large_power_of_10<27>();
-    case 28: return value * large_power_of_10<28>();
-    case 29: return value * large_power_of_10<29>();
-    case 30: return value * large_power_of_10<30>();
-    case 31: return value * large_power_of_10<31>();
-    case 32: return value * large_power_of_10<32>();
-    case 33: return value * large_power_of_10<33>();
-    case 34: return value * large_power_of_10<34>();
-    case 35: return value * large_power_of_10<35>();
-    case 36: return value * large_power_of_10<36>();
-    case 37: return value * large_power_of_10<37>();
-    case 38: return value * large_power_of_10<38>();
-    default: return 0;
-  }
+  return value * ipow<__uint128_t, Radix::BASE_10>(pow10);
 }
 
 /**

From 34dea6fe40fc20966b48257853865111df4a687f Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty
Date: Wed, 17 Jul 2024 15:27:40 -0700
Subject: [PATCH 077/101] Add TPC-H inspired examples for Libcudf (#16088)

This PR adds a suite of `libcudf` examples with queries inspired by the TPC-H benchmarks. This PR also adds some reusable helper functions to perform operations such as joins, groupbys, and orderbys for a cleaner and modular implementation of the queries.
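As a rough illustration of the kind of reusable helper this adds — the name `inner_join_left_rows` and its signature are invented for this sketch and are not the PR's actual `utils.hpp` API — an inner-join helper can be built from libcudf's gather-map join plus a gather:

```cpp
// Sketch only: join two tables on key columns and gather the matching rows of
// the left table, using cudf::inner_join's gather-map overload.
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/span.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::table> inner_join_left_rows(cudf::table_view const& left,
                                                  cudf::table_view const& right,
                                                  std::vector<cudf::size_type> const& left_keys,
                                                  std::vector<cudf::size_type> const& right_keys)
{
  // Row indices in `left` and `right` that satisfy the equi-join on the keys.
  auto const [left_map, right_map] =
    cudf::inner_join(left.select(left_keys), right.select(right_keys));
  // View the device vector of indices as a column and gather the joined rows.
  auto const indices =
    cudf::column_view{cudf::device_span<cudf::size_type const>{*left_map}};
  return cudf::gather(left, indices);
}
```

The query files below use helpers of this general shape (`apply_inner_join`, `apply_groupby`, `apply_orderby`) so that each query reads close to its SQL.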
# Queries implemented so far: - [x] Query 1 - [X] Query 5 - [X] Query 6 - [X] Query 9 Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16088 --- cpp/examples/build.sh | 1 + cpp/examples/parquet_io/parquet_io.cpp | 4 +- cpp/examples/parquet_io/parquet_io.hpp | 31 -- cpp/examples/tpch/CMakeLists.txt | 32 ++ cpp/examples/tpch/README.md | 38 ++ cpp/examples/tpch/q1.cpp | 174 ++++++++++ cpp/examples/tpch/q5.cpp | 169 +++++++++ cpp/examples/tpch/q6.cpp | 137 ++++++++ cpp/examples/tpch/q9.cpp | 182 ++++++++++ cpp/examples/tpch/utils.hpp | 457 +++++++++++++++++++++++++ cpp/examples/utilities/timer.hpp | 54 +++ 11 files changed, 1247 insertions(+), 32 deletions(-) create mode 100644 cpp/examples/tpch/CMakeLists.txt create mode 100644 cpp/examples/tpch/README.md create mode 100644 cpp/examples/tpch/q1.cpp create mode 100644 cpp/examples/tpch/q5.cpp create mode 100644 cpp/examples/tpch/q6.cpp create mode 100644 cpp/examples/tpch/q9.cpp create mode 100644 cpp/examples/tpch/utils.hpp create mode 100644 cpp/examples/utilities/timer.hpp diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index bde6ef7d69c..dce81fb1677 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -57,6 +57,7 @@ build_example() { } build_example basic +build_example tpch build_example strings build_example nested_types build_example parquet_io diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 8be17db3781..274a2599189 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -16,6 +16,8 @@ #include "parquet_io.hpp" +#include "../utilities/timer.hpp" + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -140,7 +142,7 @@ int main(int argc, char const** argv) << page_stat_string << ".." << std::endl; // `timer` is automatically started here - Timer timer; + cudf::examples::timer timer; write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats); timer.print_elapsed_millis(); diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp index d2fc359a2fe..e27cbec4fce 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/parquet_io.hpp @@ -124,34 +124,3 @@ std::shared_ptr create_memory_resource(bool is_ return std::nullopt; } - -/** - * @brief Light-weight timer for parquet reader and writer instrumentation - * - * Timer object constructed from std::chrono, instrumenting at microseconds - * precision. Can display elapsed durations at milli and micro second - * scales. Timer starts at object construction. 
- */ -class Timer { - public: - using micros = std::chrono::microseconds; - using millis = std::chrono::milliseconds; - - Timer() { reset(); } - void reset() { start_time = std::chrono::high_resolution_clock::now(); } - auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); } - void print_elapsed_micros() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "us\n\n"; - } - void print_elapsed_millis() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "ms\n\n"; - } - - private: - using time_point_t = std::chrono::time_point; - time_point_t start_time; -}; diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt new file mode 100644 index 00000000000..1b91d07e148 --- /dev/null +++ b/cpp/examples/tpch/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +include(../set_cuda_architecture.cmake) + +rapids_cuda_init_architectures(tpch_example) +rapids_cuda_set_architectures(RAPIDS) + +project( + tpch_example + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +add_executable(tpch_q1 q1.cpp) +target_link_libraries(tpch_q1 PRIVATE cudf::cudf) +target_compile_features(tpch_q1 PRIVATE cxx_std_17) + +add_executable(tpch_q5 q5.cpp) +target_link_libraries(tpch_q5 PRIVATE cudf::cudf) +target_compile_features(tpch_q5 PRIVATE cxx_std_17) + +add_executable(tpch_q6 q6.cpp) +target_link_libraries(tpch_q6 PRIVATE cudf::cudf) +target_compile_features(tpch_q6 PRIVATE cxx_std_17) + +add_executable(tpch_q9 q9.cpp) +target_link_libraries(tpch_q9 PRIVATE cudf::cudf) +target_compile_features(tpch_q9 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md new file mode 100644 index 00000000000..1ea71ae9824 --- /dev/null +++ b/cpp/examples/tpch/README.md @@ -0,0 +1,38 @@ +# TPC-H Inspired Examples + +Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format. + +## Requirements + +- Rust + +## Generating the Dataset + +1. Clone the datafusion repository. +```bash +git clone git@github.com:apache/datafusion.git +``` + +2. Run the data generator. The data will be placed in a `data/` subdirectory. +```bash +cd datafusion/benchmarks/ +./bench.sh data tpch + +# for scale factor 10, +./bench.sh data tpch10 +``` + +## Running Queries + +1. Build the examples. +```bash +cd cpp/examples +./build.sh +``` +The TPC-H query binaries would be built inside `examples/tpch/build`. + +2. Execute the queries. +```bash +./tpch/build/tpch_q1 +``` +A parquet file named `q1.parquet` would be generated holding the results of the query. diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp new file mode 100644 index 00000000000..1bdf039da4a --- /dev/null +++ b/cpp/examples/tpch/q1.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q1.cpp + * @brief Implement query 1 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * l_returnflag, + * l_linestatus, + * sum(l_quantity) as sum_qty, + * sum(l_extendedprice) as sum_base_price, + * sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + * sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + * avg(l_quantity) as avg_qty, + * avg(l_extendedprice) as avg_price, + * avg(l_discount) as avg_disc, + * count(*) as count_order + * from + * lineitem + * where + * l_shipdate <= date '1998-09-02' + * group by + * l_returnflag, + * l_linestatus + * order by + * l_returnflag, + * l_linestatus; + */ + +/** + * @brief Calculate the discount price column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +[[nodiscard]] std::unique_ptr calc_disc_price( + cudf::column_view const& discount, + cudf::column_view const& extendedprice, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const disc_price_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto disc_price = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + disc_price_type, + stream, + mr); + return disc_price; +} + +/** + * @brief Calculate the charge column + * + * @param tax The tax column + * @param disc_price The discount price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_charge( + cudf::column_view const& tax, + cudf::column_view const& disc_price, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_plus_tax = + cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); + auto const charge_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto charge = cudf::binary_operation( + disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, charge_type, stream, mr); + return charge; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projections and filter predicate for `lineitem` table + std::vector const lineitem_cols = {"l_returnflag", + "l_linestatus", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_shipdate", + "l_orderkey", + "l_tax"}; + auto const shipdate_ref = cudf::ast::column_reference(std::distance( + lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate"))); + auto shipdate_upper = + cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); + auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); + auto lineitem_pred = std::make_unique( + cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); + + // Read out the `lineitem` table from parquet file + auto lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + + // Calculate the discount price and charge columns and append to lineitem table + auto disc_price = + calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); + auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view()); + (*lineitem).append(disc_price, "disc_price").append(charge, "charge"); + + // Perform the group by operation + auto const groupedby_table = apply_groupby( + lineitem, + groupby_context_t{ + {"l_returnflag", "l_linestatus"}, + { + {"l_extendedprice", + {{cudf::aggregation::Kind::SUM, "sum_base_price"}, + {cudf::aggregation::Kind::MEAN, "avg_price"}}}, + {"l_quantity", + {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}}, + {"l_discount", + { + {cudf::aggregation::Kind::MEAN, "avg_disc"}, + }}, + {"disc_price", + { + {cudf::aggregation::Kind::SUM, "sum_disc_price"}, + }}, + {"charge", + {{cudf::aggregation::Kind::SUM, "sum_charge"}, + {cudf::aggregation::Kind::COUNT_ALL, "count_order"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = apply_orderby(groupedby_table, + {"l_returnflag", "l_linestatus"}, + {cudf::order::ASCENDING, cudf::order::ASCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q1.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp new file mode 100644 index 00000000000..e56850b94d6 --- /dev/null +++ b/cpp/examples/tpch/q5.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q5.cpp + * @brief Implement query 5 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * create view region as select * from '/tables/scale-1/region.parquet'; + * + * select + * n_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue + * from + * customer, + * orders, + * lineitem, + * supplier, + * nation, + * region + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' + * group by + * n_name + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1995, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + // Define the column projection and filter predicate for the `region` table + std::vector const region_cols = {"r_regionkey", "r_name"}; + auto const r_name_ref = cudf::ast::column_reference(std::distance( + region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); + auto r_name_value = cudf::string_scalar("ASIA"); + auto const r_name_literal = cudf::ast::literal(r_name_value); + auto region_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = + read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); + auto const supplier = + read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + auto const nation = + read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"}); + auto const region = + read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred)); + + // Perform the joins + auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); + auto const join_b = apply_inner_join(join_a, customer, {"n_nationkey"}, 
{"c_nationkey"}); + auto const join_c = apply_inner_join(join_b, orders, {"c_custkey"}, {"o_custkey"}); + auto const join_d = apply_inner_join(join_c, lineitem, {"o_orderkey"}, {"l_orderkey"}); + auto joined_table = + apply_inner_join(supplier, join_d, {"s_suppkey", "s_nationkey"}, {"l_suppkey", "n_nationkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = + apply_groupby(joined_table, + groupby_context_t{{"n_name"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q5.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp new file mode 100644 index 00000000000..f11b3d6ab3b --- /dev/null +++ b/cpp/examples/tpch/q6.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q6.cpp + * @brief Implement query 6 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * sum(l_extendedprice * l_discount) as revenue + * from + * lineitem + * where + * l_shipdate >= date '1994-01-01' + * and l_shipdate < date '1995-01-01' + * and l_discount >= 0.05 + * and l_discount <= 0.07 + * and l_quantity < 24; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& discount,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto revenue            = cudf::binary_operation(
+    extendedprice, discount, cudf::binary_operator::MUL, revenue_type, stream, mr);
+  return revenue;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Read out the `lineitem` table from the parquet file
+  std::vector<std::string> const lineitem_cols = {
+    "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"};
+  auto const shipdate_ref = cudf::ast::column_reference(std::distance(
+    lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate")));
+  auto shipdate_lower =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1994, 1, 1), true);
+  auto const shipdate_lower_literal = cudf::ast::literal(shipdate_lower);
+  auto shipdate_upper =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1995, 1, 1), true);
+  auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper);
+  auto const shipdate_pred_a        = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal);
+  auto const shipdate_pred_b =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal);
+  auto lineitem_pred = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b);
+  auto lineitem =
+    read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));
+
+  // Cast the discount and quantity columns to float32 and append to the lineitem table
+  auto discount_float =
+    cudf::cast(lineitem->column("l_discount"), cudf::data_type{cudf::type_id::FLOAT32});
+  auto quantity_float =
+    cudf::cast(lineitem->column("l_quantity"), cudf::data_type{cudf::type_id::FLOAT32});
+
+  (*lineitem)
+    .append(discount_float, "l_discount_float")
+    .append(quantity_float, "l_quantity_float");
+
+  // Apply the filters
+  auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float"));
+  auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float"));
+
+  auto discount_lower               = cudf::numeric_scalar<float>(0.05);
+  auto const discount_lower_literal = cudf::ast::literal(discount_lower);
+  auto discount_upper               = cudf::numeric_scalar<float>(0.07);
+  auto const discount_upper_literal = cudf::ast::literal(discount_upper);
+  auto quantity_upper               = cudf::numeric_scalar<float>(24);
+  auto const quantity_upper_literal = cudf::ast::literal(quantity_upper);
+
+  auto const discount_pred_a = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, discount_ref, discount_lower_literal);
+
+  auto const discount_pred_b =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, discount_ref, discount_upper_literal);
+  auto const discount_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred_a, discount_pred_b);
+  auto const quantity_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, quantity_ref, quantity_upper_literal);
+  auto const discount_quantity_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred, quantity_pred);
+  auto const
filtered_table = apply_filter(lineitem, discount_quantity_pred); + + // Calculate the `revenue` column + auto revenue = + calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount")); + + // Sum the `revenue` column + auto const revenue_view = revenue->view(); + auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue"); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + result_table->to_parquet("q6.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q9.cpp b/cpp/examples/tpch/q9.cpp new file mode 100644 index 00000000000..d3c218253f9 --- /dev/null +++ b/cpp/examples/tpch/q9.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include +#include +#include + +/** + * @file q9.cpp + * @brief Implement query 9 of the TPC-H benchmark. + * + * create view part as select * from '/tables/scale-1/part.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view partsupp as select * from '/tables/scale-1/partsupp.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * nation, + * o_year, + * sum(amount) as sum_profit + * from + * ( + * select + * n_name as nation, + * extract(year from o_orderdate) as o_year, + * l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + * from + * part, + * supplier, + * lineitem, + * partsupp, + * orders, + * nation + * where + * s_suppkey = l_suppkey + * and ps_suppkey = l_suppkey + * and ps_partkey = l_partkey + * and p_partkey = l_partkey + * and o_orderkey = l_orderkey + * and s_nationkey = n_nationkey + * and p_name like '%green%' + * ) as profit + * group by + * nation, + * o_year + * order by + * nation, + * o_year desc; + */ + +/** + * @brief Calculate the amount column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param supplycost The supply cost column + * @param quantity The quantity column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
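+ *
+ * Computes amount = l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity,
+ * matching the `amount` expression in the query above.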
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_amount(
+  cudf::column_view const& discount,
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& supplycost,
+  cudf::column_view const& quantity,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount =
+    cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream);
+  auto const extendedprice_discounted_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto const extendedprice_discounted      = cudf::binary_operation(extendedprice,
+                                                               one_minus_discount->view(),
+                                                               cudf::binary_operator::MUL,
+                                                               extendedprice_discounted_type,
+                                                               stream,
+                                                               mr);
+  auto const supplycost_quantity_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto const supplycost_quantity      = cudf::binary_operation(
+    supplycost, quantity, cudf::binary_operator::MUL, supplycost_quantity_type, stream);
+  auto amount = cudf::binary_operation(extendedprice_discounted->view(),
+                                       supplycost_quantity->view(),
+                                       cudf::binary_operator::SUB,
+                                       extendedprice_discounted->type(),
+                                       stream,
+                                       mr);
+  return amount;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Read out the tables from parquet files
+  auto const lineitem = read_parquet(
+    args.dataset_dir + "/lineitem.parquet",
+    {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"});
+  auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"});
+  auto const orders =
+    read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"});
+  auto const part     = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"});
+  auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet",
+                                     {"ps_suppkey", "ps_partkey", "ps_supplycost"});
+  auto const supplier =
+    read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"});
+
+  // Generating the `profit` table
+  // Filter the part table using `p_name like '%green%'`
+  auto const p_name = part->table().column(1);
+  auto const mask =
+    cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%"));
+  auto const part_filtered = apply_mask(part, mask);
+
+  // Perform the joins
+  auto const join_a = apply_inner_join(supplier, nation, {"s_nationkey"}, {"n_nationkey"});
+  auto const join_b = apply_inner_join(partsupp, join_a, {"ps_suppkey"}, {"s_suppkey"});
+  auto const join_c = apply_inner_join(lineitem, part_filtered, {"l_partkey"}, {"p_partkey"});
+  auto const join_d = apply_inner_join(orders, join_c, {"o_orderkey"}, {"l_orderkey"});
+  auto const joined_table =
+    apply_inner_join(join_d, join_b, {"l_suppkey", "l_partkey"}, {"s_suppkey", "ps_partkey"});
+
+  // Calculate the `nation`, `o_year`, and `amount` columns
+  auto n_name = std::make_unique<cudf::column>(joined_table->column("n_name"));
+  auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate"));
+  auto amount = calc_amount(joined_table->column("l_discount"),
+                            joined_table->column("l_extendedprice"),
+                            joined_table->column("ps_supplycost"),
+                            joined_table->column("l_quantity"));
+
+  // Put together the `profit` table
+  std::vector<std::unique_ptr<cudf::column>> profit_columns;
+  profit_columns.push_back(std::move(n_name));
+  profit_columns.push_back(std::move(o_year));
+  profit_columns.push_back(std::move(amount));
+
+  auto profit_table = std::make_unique<cudf::table>(std::move(profit_columns));
+  auto const profit = std::make_unique<table_with_names>(
+    std::move(profit_table), std::vector<std::string>{"nation", "o_year", "amount"});
+
+  // Perform the groupby operation
+  auto const groupedby_table = apply_groupby(
+    profit,
+    groupby_context_t{{"nation", "o_year"},
+                      {{"amount", {{cudf::groupby_aggregation::SUM, "sum_profit"}}}}});
+
+  // Perform the orderby operation
+  auto const orderedby_table = apply_orderby(
+    groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING});
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  orderedby_table->to_parquet("q9.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp
new file mode 100644
index 00000000000..e586da2c802
--- /dev/null
+++ b/cpp/examples/tpch/utils.hpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// RMM memory resource creation utilities
+inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+inline auto make_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_cuda(), rmm::percent_of_free_device_memory(50));
+}
+inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
+inline auto make_managed_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_managed(), rmm::percent_of_free_device_memory(50));
+}
+inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+  std::string const& mode)
+{
+  if (mode == "cuda") return make_cuda();
+  if (mode == "pool") return make_pool();
+  if (mode == "managed") return make_managed();
+  if (mode == "managed_pool") return make_managed_pool();
+  CUDF_FAIL("Unknown rmm_mode parameter: " + mode +
+            "\nExpecting: cuda, pool, managed, or managed_pool");
+}
+
+/**
+ * @brief A class to represent a table with column names attached
+ */
+class table_with_names {
+ public:
+  table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
+    : tbl(std::move(tbl)), col_names(col_names)
+  {
+  }
+  /**
+   * @brief Return the table view
+   */
+  [[nodiscard]] cudf::table_view table() const { return tbl->view(); }
+  /**
+   * @brief Return the column view for a given column name
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::column_view column(std::string const& col_name) const
+  {
+    return tbl->view().column(col_id(col_name));
+  }
+  /**
+   * @brief Return the column names of the table
+   */
+  [[nodiscard]] std::vector<std::string> column_names() const { return col_names; }
+  /**
+   * @brief Translate a column name to a column index
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::size_type col_id(std::string const& col_name) const
+  {
+    CUDF_FUNC_RANGE();
+    auto it = std::find(col_names.begin(), col_names.end(), col_name);
+    if (it == col_names.end()) { throw std::runtime_error("Column not found"); }
+    return std::distance(col_names.begin(), it);
+  }
+  /**
+   * @brief Append a column to the table
+   *
+   * @param col The column to append
+   * @param col_name The name of the appended column
+   */
+  table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name)
+  {
+    CUDF_FUNC_RANGE();
+    auto cols = tbl->release();
+    cols.push_back(std::move(col));
+    tbl = std::make_unique<cudf::table>(std::move(cols));
+    col_names.push_back(col_name);
+    return (*this);
+  }
+  /**
+   * @brief Select a subset of columns from the table
+   *
+   * @param col_names The names of the columns to select
+   */
+  [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const
+  {
+    CUDF_FUNC_RANGE();
+    std::vector<cudf::size_type> col_indices;
+    for (auto const& col_name : col_names) {
+      col_indices.push_back(col_id(col_name));
+    }
+    return tbl->select(col_indices);
+  }
+  /**
+   * @brief Write the table to a parquet file
+   *
+   * @param filepath The path to the parquet file
+   */
+  void to_parquet(std::string const& filepath) const
+  {
+    CUDF_FUNC_RANGE();
+    auto const sink_info = cudf::io::sink_info(filepath);
+    cudf::io::table_metadata metadata;
+    metadata.schema_info =
+      std::vector<cudf::io::column_name_info>(col_names.begin(), col_names.end());
+    auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
+    auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view());
+    builder.metadata(table_input_metadata);
+    auto const options = builder.build();
+    cudf::io::write_parquet(options);
+  }
+
+ private:
+  std::unique_ptr<cudf::table> tbl;
+  std::vector<std::string> col_names;
+};
+
+/**
+ * @brief Concatenate two vectors
+ *
+ * @param lhs The left vector
+ * @param rhs The right vector
+ */
+template <typename T>
+std::vector<T> concat(std::vector<T> const& lhs, std::vector<T> const& rhs)
+{
+  std::vector<T> result;
+  result.reserve(lhs.size() + rhs.size());
+  std::copy(lhs.begin(), lhs.end(), std::back_inserter(result));
+  std::copy(rhs.begin(), rhs.end(), std::back_inserter(result));
+  return result;
+}
+
+/**
+ * @brief Inner join two tables and gather the result
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather(
+  cudf::table_view const& left_input,
+  cudf::table_view const& right_input,
+  std::vector<cudf::size_type> const& left_on,
+  std::vector<cudf::size_type> const& right_on,
+  cudf::null_equality compare_nulls)
+{
+  CUDF_FUNC_RANGE();
+  constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK;
+  auto const left_selected  = left_input.select(left_on);
+  auto const right_selected = right_input.select(right_on);
+  auto const [left_join_indices, right_join_indices] = cudf::inner_join(
+    left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource());
+
+  auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
+  auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
+
+  auto const left_indices_col  = cudf::column_view{left_indices_span};
+  auto const right_indices_col = cudf::column_view{right_indices_span};
+
+  auto const left_result  = cudf::gather(left_input, left_indices_col, oob_policy);
+  auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy);
+
+  auto joined_cols = left_result->release();
+  auto right_cols  = right_result->release();
+  joined_cols.insert(joined_cols.end(),
+                     std::make_move_iterator(right_cols.begin()),
+                     std::make_move_iterator(right_cols.end()));
+  return std::make_unique<cudf::table>(std::move(joined_cols));
+}
+
+/**
+ * @brief Apply an inner join operation to two tables
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join(
+  std::unique_ptr<table_with_names> const& left_input,
+  std::unique_ptr<table_with_names> const& right_input,
+  std::vector<std::string> const& left_on,
+  std::vector<std::string> const& right_on,
+  cudf::null_equality compare_nulls = cudf::null_equality::EQUAL)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::size_type> left_on_indices;
+  std::vector<cudf::size_type> right_on_indices;
+  std::transform(
+    left_on.begin(), left_on.end(), std::back_inserter(left_on_indices),
+    [&](auto const& col_name) { return left_input->col_id(col_name); });
+  std::transform(right_on.begin(),
+                 right_on.end(),
+                 std::back_inserter(right_on_indices),
+                 [&](auto const& col_name) { return right_input->col_id(col_name); });
+  auto table = join_and_gather(
+    left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls);
+  return std::make_unique<table_with_names>(
+    std::move(table), concat(left_input->column_names(), right_input->column_names()));
+}
+
+/**
+ * @brief Apply a filter predicate to a table
+ *
+ * @param table The input table
+ * @param predicate The filter predicate
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_filter(
+  std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate)
+{
+  CUDF_FUNC_RANGE();
+  auto const boolean_mask = cudf::compute_column(table->table(), predicate);
+  auto result_table       = cudf::apply_boolean_mask(table->table(), boolean_mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+/**
+ * @brief Apply a boolean mask to a table
+ *
+ * @param table The input table
+ * @param mask The boolean mask
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_mask(
+  std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask)
+{
+  CUDF_FUNC_RANGE();
+  auto result_table = cudf::apply_boolean_mask(table->table(), mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+struct groupby_context_t {
+  std::vector<std::string> keys;
+  std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>>
+    values;
+};
+
+/**
+ * @brief Apply a groupby operation to a table
+ *
+ * @param table The input table
+ * @param ctx The groupby context
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby(
+  std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx)
+{
+  CUDF_FUNC_RANGE();
+  auto const keys = table->select(ctx.keys);
+  cudf::groupby::groupby groupby_obj(keys);
+  std::vector<std::string> result_column_names;
+  result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end());
+  std::vector<cudf::groupby::aggregation_request> requests;
+  for (auto& [value_col, aggregations] : ctx.values) {
+    requests.emplace_back(cudf::groupby::aggregation_request());
+    for (auto& agg : aggregations) {
+      if (agg.first == cudf::aggregation::Kind::SUM) {
+        requests.back().aggregations.push_back(
+          cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::MEAN) {
+        requests.back().aggregations.push_back(
+          cudf::make_mean_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) {
+        requests.back().aggregations.push_back(
+          cudf::make_count_aggregation<cudf::groupby_aggregation>());
+      } else {
+        throw std::runtime_error("Unsupported aggregation");
+      }
+      result_column_names.push_back(agg.second);
+    }
+    requests.back().values = table->column(value_col);
+  }
+  auto agg_results = groupby_obj.aggregate(requests);
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (size_t i = 0; i < agg_results.first->num_columns(); i++) {
+    auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i));
+    result_columns.push_back(std::move(col));
+  }
+  for (size_t i = 0; i < agg_results.second.size(); i++) {
+    for (size_t j = 0; j < agg_results.second[i].results.size(); j++) {
+      result_columns.push_back(std::move(agg_results.second[i].results[j]));
+    }
+  }
+  auto result_table = std::make_unique<cudf::table>(std::move(result_columns));
+  return std::make_unique<table_with_names>(std::move(result_table), result_column_names);
+}
+
+/**
+ * @brief Apply an order by operation to a table
+ *
+ * @param table The input table
+ * @param sort_keys The sort keys
+ * @param sort_key_orders The sort key orders
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby(
+  std::unique_ptr<table_with_names> const& table,
+  std::vector<std::string> const& sort_keys,
+  std::vector<cudf::order> const& sort_key_orders)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::column_view> column_views;
+  for (auto& key : sort_keys) {
+    column_views.push_back(table->column(key));
+  }
+  auto result_table =
+    cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders);
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+/**
+ * @brief Apply a reduction operation to a column
+ *
+ * @param column The input column
+ * @param agg_kind The aggregation kind
+ * @param col_name The name of the output column
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction(
+  cudf::column_view const& column,
+  cudf::aggregation::Kind const& agg_kind,
+  std::string const& col_name)
+{
+  CUDF_FUNC_RANGE();
+  auto const agg    = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto const result = cudf::reduce(column, *agg, column.type());
+  cudf::size_type const len = 1;
+  auto col = cudf::make_column_from_scalar(*result, len);
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(col));
+  auto result_table = std::make_unique<cudf::table>(std::move(columns));
+  std::vector<std::string> col_names = {col_name};
+  return std::make_unique<table_with_names>(std::move(result_table), col_names);
+}
+
+/**
+ * @brief Read a parquet file into a table
+ *
+ * @param filename The path to the parquet file
+ * @param columns The columns to read
+ * @param predicate The filter predicate to pushdown
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> read_parquet(
+  std::string const& filename,
+  std::vector<std::string> const& columns = {},
+  std::unique_ptr<cudf::ast::operation> const& predicate = nullptr)
+{
+  CUDF_FUNC_RANGE();
+  auto const source = cudf::io::source_info(filename);
+  auto builder      = cudf::io::parquet_reader_options_builder(source);
+  if (!columns.empty()) { builder.columns(columns); }
+  if (predicate) { builder.filter(*predicate); }
+  auto const options       = builder.build();
+  auto table_with_metadata = cudf::io::read_parquet(options);
+  std::vector<std::string> column_names;
+  for (auto const& col_info : table_with_metadata.metadata.schema_info) {
+    column_names.push_back(col_info.name);
+  }
+  return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names);
+}
+
+/**
+ * @brief Generate the `std::tm` structure from year, month, and day
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+std::tm make_tm(int year, int month, int day)
+{
+  std::tm tm{};
+  tm.tm_year = year - 1900;
+  tm.tm_mon  = month - 1;
+  tm.tm_mday = day;
+  return tm;
+}
+
+/**
+ * @brief Calculate the number of days since the UNIX epoch
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+int32_t days_since_epoch(int year, int month, int day)
+{
+  std::tm tm             = make_tm(year, month, day);
+  std::tm epoch          = make_tm(1970, 1, 1);
+  std::time_t time       = std::mktime(&tm);
+  std::time_t epoch_time = std::mktime(&epoch);
+  double diff            = std::difftime(time, epoch_time) / (60 * 60 * 24);
+  return static_cast<int32_t>(diff);
+}
+
+struct tpch_example_args {
+  std::string dataset_dir;
+  std::string memory_resource_type;
+};
+
+/**
+ * @brief Parse command line arguments into a struct
+ *
+ * @param argc The number of command line arguments
+ * @param argv The command line arguments
+ */
+tpch_example_args parse_args(int argc, char const** argv)
+{
+  if (argc < 3) {
+    std::string usage_message = "Usage: " + std::string(argv[0]) +
+                                " <dataset_dir> <memory_resource_type>\n The query result will be "
+                                "saved to a parquet file named q{query_no}.parquet in the current "
+                                "working directory ";
+    throw std::runtime_error(usage_message);
+  }
+  tpch_example_args args;
+  args.dataset_dir          = argv[1];
+  args.memory_resource_type = argv[2];
+  return args;
+}
diff --git a/cpp/examples/utilities/timer.hpp b/cpp/examples/utilities/timer.hpp
new file mode 100644
index 00000000000..65fa92e74cf
--- /dev/null
+++ b/cpp/examples/utilities/timer.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <chrono>
+#include <iostream>
+
+namespace cudf {
+namespace examples {
+/**
+ * @brief Light-weight timer for measuring elapsed time.
+ *
+ * A timer object constructed from std::chrono, instrumenting at microseconds
+ * precision. Can display elapsed durations at milli and micro second
+ * scales. The timer starts at object construction.
+ */
+class timer {
+ public:
+  using micros = std::chrono::microseconds;
+  using millis = std::chrono::milliseconds;
+
+  timer() { reset(); }
+  void reset() { start_time = std::chrono::high_resolution_clock::now(); }
+  auto elapsed() const { return (std::chrono::high_resolution_clock::now() - start_time); }
+  void print_elapsed_micros() const
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<micros>(elapsed()).count()
+              << "us\n\n";
+  }
+  void print_elapsed_millis() const
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<millis>(elapsed()).count()
+              << "ms\n\n";
+  }
+
+ private:
+  using time_point_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
+  time_point_t start_time;
+};
+
+}  // namespace examples
+};  // namespace cudf
From c4471c4ee81ed967f1818bc03c5f7829b15cfe56 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Jul 2024 10:44:04 -0400
Subject: [PATCH 078/101] Fix split_record for all empty strings column
 (#16291)

Fixes `cudf::strings::split_record` handling of an all-empty strings column. Previously this caused a kernel launch with no threads, which eventually reported a CUDA error. A new gtest checks this condition and includes tests for `rsplit_record` as well.
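
A minimal Python-level sketch of the user-visible failure (illustrative only, not part of the patch; it assumes `Series.str.split` with the default `expand=False`, which is backed by `split_record`):

    import cudf

    s = cudf.Series(["", "", "", ""])
    # Before this fix, splitting an all-empty strings column could launch
    # a kernel with zero threads and later surface a CUDA error.
    s.str.split("s")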
Closes #16284

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16291
---
 cpp/src/strings/split/split.cuh   |  6 ++++++
 cpp/tests/strings/split_tests.cpp | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 23614ac0733..4d7096c02ca 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -357,6 +357,12 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
   auto const chars_bytes =
     get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
     get_offset_value(input.offsets(), input.offset(), stream);
+  if (chars_bytes == 0) {
+    auto offsets = cudf::make_column_from_scalar(
+      numeric_scalar<size_type>(0, true, stream), strings_count + 1, stream, mr);
+    auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
+    return std::pair{std::move(offsets), std::move(tokens)};
+  }
 
   auto const d_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index d53c64ed539..4c020cb4c29 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRecordAllEmpty)
+{
+  auto input     = cudf::test::strings_column_wrapper({"", "", "", ""});
+  auto sv        = cudf::strings_column_view(input);
+  auto delimiter = cudf::string_scalar("s");
+  auto empty     = cudf::string_scalar("");
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});
+  auto result = cudf::strings::split_record(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  result = cudf::strings::split_record(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+
+  result = cudf::strings::rsplit_record(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  result = cudf::strings::rsplit_record(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, MultiByteDelimiters)
 {
   // Overlapping delimiters
From faddc8c3d37e5cf8ec69341118218c245e087c26 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 18 Jul 2024 08:03:55 -0700
Subject: [PATCH 079/101] Migrate CSV reader to pylibcudf (#16011)

xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16011
---
 .../user_guide/api_docs/pylibcudf/io/csv.rst  |   6 +
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 python/cudf/cudf/_lib/csv.pyx                 | 436 ++++++------------
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |   6 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   1 +
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   2 +-
 python/cudf/cudf/_lib/pylibcudf/io/csv.pyx    | 264 +++++++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |   3 +
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  |   2 +-
 python/cudf/cudf/_lib/types.pyx               |   3 +
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  43 +-
 python/cudf/cudf/pylibcudf_tests/conftest.py  |  14 +
 .../cudf/cudf/pylibcudf_tests/io/test_csv.py  | 280 +++++++++++
 13 files changed, 751 insertions(+), 310
deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/csv.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_csv.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst new file mode 100644 index 00000000000..5a2276f8b2d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst @@ -0,0 +1,6 @@ +=== +CSV +=== + +.. automodule:: cudf._lib.pylibcudf.io.csv + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index bde6d8094ce..697bce739de 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -16,4 +16,5 @@ I/O Functions :maxdepth: 1 avro + csv json diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 9fecff5f5f6..099b61d62ae 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.map cimport map from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -9,8 +8,12 @@ from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.types cimport data_type -from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import errno +import os +from collections import abc +from io import BytesIO, StringIO import numpy as np import pandas as pd @@ -18,65 +21,24 @@ import pandas as pd import cudf from cudf.core.buffer import acquire_spill_lock -from cudf._lib.pylibcudf.libcudf.types cimport size_type - -import errno -import os -from collections import abc -from enum import IntEnum -from io import BytesIO, StringIO - -from libc.stdint cimport int32_t from libcpp cimport bool -from cudf._lib.io.utils cimport make_sink_info, make_source_info +from cudf._lib.io.utils cimport make_sink_info from cudf._lib.pylibcudf.libcudf.io.csv cimport ( - csv_reader_options, csv_writer_options, - read_csv as cpp_read_csv, write_csv as cpp_write_csv, ) from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - quote_style, - sink_info, - source_info, - table_with_metadata, -) +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table from pyarrow.lib import NativeFile +import cudf._lib.pylibcudf as plc from cudf.api.types import is_hashable -ctypedef int32_t underlying_type_t_compression - - -class Compression(IntEnum): - INFER = ( - compression_type.AUTO - ) - SNAPPY = ( - compression_type.SNAPPY - ) - GZIP = ( - compression_type.GZIP - ) - BZ2 = ( - compression_type.BZIP2 - ) - BROTLI = ( - compression_type.BROTLI - ) - ZIP = ( - compression_type.ZIP - ) - XZ = ( - compression_type.XZ - ) - +from cudf._lib.pylibcudf.types cimport DataType CSV_HEX_TYPE_MAP = { "hex": np.dtype("int64"), @@ -84,234 +46,6 @@ CSV_HEX_TYPE_MAP = { "hex32": 
np.dtype("int32") } -cdef csv_reader_options make_csv_reader_options( - object datasource, - object lineterminator, - object quotechar, - int quoting, - bool doublequote, - object header, - bool mangle_dupe_cols, - object usecols, - object delimiter, - bool delim_whitespace, - bool skipinitialspace, - object names, - object dtype, - int skipfooter, - int skiprows, - bool dayfirst, - object compression, - object thousands, - object decimal, - object true_values, - object false_values, - object nrows, - object byte_range, - bool skip_blank_lines, - object parse_dates, - object comment, - object na_values, - bool keep_default_na, - bool na_filter, - object prefix, - object index_col, -) except *: - cdef source_info c_source_info = make_source_info([datasource]) - cdef compression_type c_compression - cdef vector[string] c_names - cdef size_t c_byte_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_t c_byte_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef vector[int] c_use_cols_indexes - cdef vector[string] c_use_cols_names - cdef size_type c_nrows = nrows if nrows is not None else -1 - cdef quote_style c_quoting - cdef vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[string] c_hex_col_names - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - cdef vector[int] c_hex_col_indexes - cdef vector[string] c_true_values - cdef vector[string] c_false_values - cdef vector[string] c_na_values - - # Reader settings - if compression is None: - c_compression = compression_type.NONE - else: - compression = str(compression) - compression = Compression[compression.upper()] - c_compression = ( - compression - ) - - if quoting == 1: - c_quoting = quote_style.ALL - elif quoting == 2: - c_quoting = quote_style.NONNUMERIC - elif quoting == 3: - c_quoting = quote_style.NONE - else: - # Default value - c_quoting = quote_style.MINIMAL - - cdef csv_reader_options csv_reader_options_c = move( - csv_reader_options.builder(c_source_info) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(c_byte_range_offset) - .byte_range_size(c_byte_range_size) - .nrows(c_nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(c_quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - csv_reader_options_c.set_header(-1) - else: - csv_reader_options_c.set_header(header) - - c_names.reserve(len(names)) - for name in names: - c_names.push_back(str(name).encode()) - csv_reader_options_c.set_names(c_names) - else: - if header is None: - csv_reader_options_c.set_header(-1) - elif header == 'infer': - csv_reader_options_c.set_header(0) - else: - csv_reader_options_c.set_header(header) - - if prefix is not None: - csv_reader_options_c.set_prefix(prefix.encode()) - - if usecols is not None: - all_int = all(isinstance(col, int) for col in usecols) - if all_int: - c_use_cols_indexes.reserve(len(usecols)) - c_use_cols_indexes = usecols - csv_reader_options_c.set_use_cols_indexes(c_use_cols_indexes) - else: - c_use_cols_names.reserve(len(usecols)) - for col_name in usecols: - 
c_use_cols_names.push_back( - str(col_name).encode() - ) - csv_reader_options_c.set_use_cols_names(c_use_cols_names) - - if delimiter is not None: - csv_reader_options_c.set_delimiter(ord(delimiter)) - - if thousands is not None: - csv_reader_options_c.set_thousands(ord(thousands)) - - if comment is not None: - csv_reader_options_c.set_comment(ord(comment)) - - if parse_dates is not None: - if isinstance(parse_dates, abc.Mapping): - raise NotImplementedError( - "`parse_dates`: dictionaries are unsupported") - if not isinstance(parse_dates, abc.Iterable): - raise NotImplementedError( - "`parse_dates`: an iterable is required") - for col in parse_dates: - if isinstance(col, str): - c_parse_dates_names.push_back(str(col).encode()) - elif isinstance(col, int): - c_parse_dates_indexes.push_back(col) - else: - raise NotImplementedError( - "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_parse_dates(c_parse_dates_names) - csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - c_hex_col_names.push_back(str(k).encode()) - - c_dtypes_map[str(k).encode()] = \ - _get_cudf_data_type_from_dtype( - cudf.dtype(col_type)) - csv_reader_options_c.set_dtypes(c_dtypes_map) - csv_reader_options_c.set_parse_hex(c_hex_col_names) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - c_dtypes_list.reserve(1) - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - c_hex_col_indexes.push_back(0) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Collection): - c_dtypes_list.reserve(len(dtype)) - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - c_hex_col_indexes.push_back(index) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(col_dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - - if true_values is not None: - c_true_values.reserve(len(true_values)) - for tv in true_values: - c_true_values.push_back(tv.encode()) - csv_reader_options_c.set_true_values(c_true_values) - - if false_values is not None: - c_false_values.reserve(len(false_values)) - for fv in false_values: - c_false_values.push_back(fv.encode()) - csv_reader_options_c.set_false_values(c_false_values) - - if na_values is not None: - c_na_values.reserve(len(na_values)) - for nv in na_values: - c_na_values.push_back(nv.encode()) - csv_reader_options_c.set_na_values(c_na_values) - - return csv_reader_options_c - def validate_args( object delimiter, @@ -381,7 +115,6 @@ def read_csv( bool na_filter=True, object prefix=None, object index_col=None, - **kwargs, ): """ Cython function to call into libcudf API, see `read_csv`. 
@@ -413,23 +146,120 @@ def read_csv( if delimiter is None: delimiter = sep - cdef csv_reader_options read_csv_options_c = make_csv_reader_options( - datasource, lineterminator, quotechar, quoting, doublequote, - header, mangle_dupe_cols, usecols, delimiter, delim_whitespace, - skipinitialspace, names, dtype, skipfooter, skiprows, dayfirst, - compression, thousands, decimal, true_values, false_values, nrows, - byte_range, skip_blank_lines, parse_dates, comment, na_values, - keep_default_na, na_filter, prefix, index_col) + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = compression_type.NONE + else: + compression_map = { + "infer": compression_type.AUTO, + "gzip": compression_type.GZIP, + "bz2": compression_type.BZIP2, + "zip": compression_type.ZIP, + } + c_compression = compression_map[compression] - cdef table_with_metadata c_result - with nogil: - c_result = move(cpp_read_csv(read_csv_options_c)) + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == 'infer': + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == 'infer': + header = 0 - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) + hex_cols = [] + + new_dtypes = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = dict() + for k, v in dtype.items(): + col_type = v + if is_hashable(v) and v in CSV_HEX_TYPE_MAP: + col_type = CSV_HEX_TYPE_MAP[v] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif ( + cudf.api.types.is_scalar(dtype) or + isinstance(dtype, ( + np.dtype, pd.api.extensions.ExtensionDtype, type + )) + ): + if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: + dtype = CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + new_dtypes.append( + _get_plc_data_type_from_dtype(dtype) + ) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: + col_dtype = CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append( + _get_plc_data_type_from_dtype(col_dtype) + ) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + + lineterminator = str(lineterminator) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + plc.io.csv.read_csv( + plc.io.SourceInfo([datasource]), + lineterminator=lineterminator, + quotechar = quotechar, + quoting = quoting, + doublequote = doublequote, + header = header, + mangle_dupe_cols = mangle_dupe_cols, + usecols = usecols, + delimiter = delimiter, + delim_whitespace = delim_whitespace, + skipinitialspace = skipinitialspace, + col_names = names, + dtypes = new_dtypes, + skipfooter = skipfooter, + skiprows = skiprows, + dayfirst = dayfirst, + compression = c_compression, + thousands = thousands, + decimal = decimal, + true_values = true_values, + false_values = false_values, + nrows = nrows if nrows is not None else -1, + byte_range_offset = byte_range[0], + byte_range_size = byte_range[1], + skip_blank_lines = skip_blank_lines, + parse_dates = parse_dates, + parse_hex = hex_cols, + comment = comment, + na_values = na_values, + keep_default_na = keep_default_na, + na_filter = na_filter, 
+ prefix = prefix, + ) + ) + ) if dtype is not None: if isinstance(dtype, abc.Mapping): @@ -459,7 +289,7 @@ def read_csv( index_col_name = df._data.select_by_index(index_col).names[0] df = df.set_index(index_col_name) if isinstance(index_col_name, str) and \ - names is None and header in ("infer",): + names is None and orig_header == "infer": if index_col_name.startswith("Unnamed:"): # TODO: Try to upstream it to libcudf # csv reader in future @@ -550,7 +380,7 @@ def write_csv( ) -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: +cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 @@ -561,36 +391,36 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: if isinstance(dtype, str): if str(dtype) == "date32": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_DAYS ) elif str(dtype) in ("date", "date64"): - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[us]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MICROSECONDS ) elif str(dtype) == "timestamp[s]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_SECONDS ) elif str(dtype) == "timestamp[ms]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[ns]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_NANOSECONDS ) dtype = cudf.dtype(dtype) - return dtype_to_data_type(dtype) + return dtype_to_pylibcudf_type(dtype) def columns_apply_na_rep(column_names, na_rep): diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 084b341ec48..8dd08d11dc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,7 +21,7 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json - pylibcudf_io_types +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource + pylibcudf_io_json pylibcudf_io_types ) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index ef4c65b277e..5b3272d60e0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +# CSV is removed since it is def not cpdef (to force kw-only arguments) from . 
cimport avro, datasource, json, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index fb4e4c7e4bb..e17deaa4663 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, datasource, json, types +from . import avro, csv, datasource, json, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx new file mode 100644 index 00000000000..e9efb5befee --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx @@ -0,0 +1,264 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.map cimport map +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.csv cimport ( + csv_reader_options, + read_csv as cpp_read_csv, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef tuple _process_parse_dates_hex(list cols): + cdef vector[string] str_cols + cdef vector[int] int_cols + for col in cols: + if isinstance(col, str): + str_cols.push_back(col.encode()) + else: + int_cols.push_back(col) + return str_cols, int_cols + +cdef vector[string] _make_str_vector(list vals): + cdef vector[string] res + for val in vals: + res.push_back((val).encode()) + return res + + +def read_csv( + SourceInfo source_info, + *, + compression_type compression = compression_type.AUTO, + size_t byte_range_offset = 0, + size_t byte_range_size = 0, + list col_names = None, + str prefix = "", + bool mangle_dupe_cols = True, + list usecols = None, + size_type nrows = -1, + size_type skiprows = 0, + size_type skipfooter = 0, + size_type header = 0, + str lineterminator = "\n", + str delimiter = None, + str thousands = None, + str decimal = ".", + str comment = None, + bool delim_whitespace = False, + bool skipinitialspace = False, + bool skip_blank_lines = True, + quote_style quoting = quote_style.MINIMAL, + str quotechar = '"', + bool doublequote = True, + list parse_dates = None, + list parse_hex = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + object dtypes = None, + list true_values = None, + list false_values = None, + list na_values = None, + bool keep_default_na = True, + bool na_filter = True, + bool dayfirst = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # bool detect_whitespace_around_quotes = False, + # DataType timestamp_type = DataType(type_id.EMPTY), +): + """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo to read the CSV file from. + compression : compression_type, default CompressionType.AUTO + The compression format of the CSV source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. 
+ col_names : list, default None + The column names to use. + prefix : string, default '' + The prefix to apply to the column names. + mangle_dupe_cols : bool, default True + If True, rename duplicate column names. + usecols : list, default None + Specify the string column names/integer column indices of columns to be read. + nrows : size_type, default -1 + The number of rows to read. + skiprows : size_type, default 0 + The number of rows to skip from the start before reading + skipfooter : size_type, default 0 + The number of rows to skip from the end + header : size_type, default 0 + The index of the row that will be used for header names. + Pass -1 to use default column names. + lineterminator : str, default '\\n' + The character used to determine the end of a line. + delimiter : str, default "," + The character used to separate fields in a row. + thousands : str, default None + The character used as the thousands separator. + Cannot match delimiter. + decimal : str, default '.' + The character used as the decimal separator. + Cannot match delimiter. + comment : str, default None + The character used to identify the start of a comment line. + (which will be skipped by the reader) + delim_whitespace : bool, default False + If True, treat whitespace as the field delimiter. + skipinitialspace : bool, default False + If True, skip whitespace after the delimiter. + skip_blank_lines : bool, default True + If True, ignore empty lines (otherwise line values are parsed as null). + quoting : QuoteStyle, default QuoteStyle.MINIMAL + The quoting style used in the input CSV data. One of + { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } + quotechar : str, default '"' + The character used to indicate quoting. + doublequote : bool, default True + If True, a quote inside a value is double-quoted. + parse_dates : list, default None + A list of integer column indices/string column names + of columns to read as datetime. + parse_hex : list, default None + A list of integer column indices/string column names + of columns to read as hexadecimal. + dtypes : Union[Dict[str, DataType], List[DataType]], default None + A list of data types or a dictionary mapping column names + to a DataType. + true_values : List[str], default None + A list of additional values to recognize as True. + false_values : List[str], default None + A list of additional values to recognize as False. + na_values : List[str], default None + A list of additional values to recognize as null. + keep_default_na : bool, default True + Whether to keep the built-in default N/A values. + na_filter : bool, default True + Whether to detect missing values. If False, can + improve performance. + dayfirst : bool, default False + If True, interpret dates as being in the DD/MM format. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. 
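+
+    Examples
+    --------
+    A minimal sketch; the file name ``example.csv`` is hypothetical:
+
+    >>> import cudf._lib.pylibcudf as plc
+    >>> tbl_w_meta = plc.io.csv.read_csv(
+    ...     plc.io.SourceInfo(["example.csv"]),
+    ...     delimiter=",",
+    ... )
+    >>> tbl_w_meta.column_names()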
+ """ + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes + cdef vector[int] c_parse_hex_names + cdef vector[int] c_parse_hex_indexes + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map + + cdef csv_reader_options options = move( + csv_reader_options.builder(source_info.c_obj) + .compression(compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .nrows(nrows) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(ord(lineterminator)) + .quotechar(ord(quotechar)) + .decimal(ord(decimal)) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if col_names is not None: + options.set_names([str(name).encode() for name in col_names]) + + if prefix is not None: + options.set_prefix(prefix.encode()) + + if usecols is not None: + if all([isinstance(col, int) for col in usecols]): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name).encode() for name in usecols]) + + if delimiter is not None: + options.set_delimiter(ord(delimiter)) + + if thousands is not None: + options.set_thousands(ord(thousands)) + + if comment is not None: + options.set_comment(ord(comment)) + + if parse_dates is not None: + if not all([isinstance(col, (str, int)) for col in parse_dates]): + raise NotImplementedError( + "`parse_dates`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_dates_names, c_parse_dates_indexes = \ + _process_parse_dates_hex(parse_dates) + options.set_parse_dates(c_parse_dates_names) + options.set_parse_dates(c_parse_dates_indexes) + + if parse_hex is not None: + if not all([isinstance(col, (str, int)) for col in parse_hex]): + raise NotImplementedError( + "`parse_hex`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) + options.set_parse_hex(c_parse_hex_names) + options.set_parse_hex(c_parse_hex_indexes) + + if isinstance(dtypes, list): + for dtype in dtypes: + c_dtypes_list.push_back((dtype).c_obj) + options.set_dtypes(c_dtypes_list) + elif isinstance(dtypes, dict): + # dtypes_t is dict + for k, v in dtypes.items(): + c_dtypes_map[str(k).encode()] = (v).c_obj + options.set_dtypes(c_dtypes_map) + elif dtypes is not None: + raise TypeError("dtypes must either by a list/dict") + + if true_values is not None: + options.set_true_values(_make_str_vector(true_values)) + + if false_values is not None: + options.set_false_values(_make_str_vector(false_values)) + + if na_values is not None: + options.set_na_values(_make_str_vector(na_values)) + + cdef table_with_metadata c_result + with nogil: + c_result = move(cpp_read_csv(options)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index ab223c16a72..0094bf6032c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -38,6 +38,9 @@ cdef class TableWithMetadata: cdef class SourceInfo: cdef source_info c_obj + # Keep the bytes converted from stringio alive + # (otherwise 
we end up with a use after free when they get gc'ed) + cdef list byte_sources cdef class SinkInfo: # This vector just exists to keep the unique_ptrs to the sinks alive diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index df0b729b711..68498ff88f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -178,7 +178,7 @@ cdef class SourceInfo: raise ValueError("All sources must be of the same type!") new_sources.append(buffer.read().encode()) sources = new_sources - + self.byte_sources = sources if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 895e1afc502..fc672caa574 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -239,6 +239,9 @@ cdef dtype_from_column_view(column_view cv): ] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: + # Note: This function is to be phased out in favor of + # dtype_to_pylibcudf_type which will return a pylibcudf + # DataType object cdef libcudf_types.type_id tid if isinstance(dtype, cudf.ListDtype): tid = libcudf_types.type_id.LIST diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index efb192b3251..e029edfa2ed 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -4,6 +4,7 @@ import io import os +import numpy as np import pyarrow as pa import pytest @@ -109,7 +110,10 @@ def _make_fields_nullable(typ): lhs_type = _make_fields_nullable(lhs.type) lhs = rhs.cast(lhs_type) - assert lhs.equals(rhs) + if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): + np.testing.assert_array_almost_equal(lhs, rhs) + else: + assert lhs.equals(rhs) def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: @@ -125,6 +129,8 @@ def assert_table_and_meta_eq( pa_table: pa.Table, plc_table_w_meta: plc.io.types.TableWithMetadata, check_field_nullability=True, + check_types_if_empty=True, + check_names=True, ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" @@ -135,11 +141,17 @@ def assert_table_and_meta_eq( plc_shape == pa_table.shape ), f"{plc_shape} is not equal to {pa_table.shape}" + if not check_types_if_empty and plc_table.num_rows() == 0: + return + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality - assert plc_table_w_meta.column_names() == pa_table.column_names + if check_names: + assert ( + plc_table_w_meta.column_names() == pa_table.column_names + ), f"{plc_table_w_meta.column_names()} != {pa_table.column_names}" def cudf_raises(expected_exception: BaseException, *args, **kwargs): @@ -174,6 +186,33 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 +def _convert_numeric_types_to_floating(pa_table): + """ + Useful little helper for testing the + dtypes option in I/O readers. 
+ + Returns a tuple containing the pylibcudf dtypes + and the new pyarrow schema + """ + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + return dtypes, new_fields + + def write_source_str(source, input_str): """ Write a string to the source diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 53e207f29cb..4a7194a6d8d 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -141,6 +141,20 @@ def _generate_nested_data(typ): ), pa_table +@pytest.fixture(params=[(0, 0), ("half", 0), (-1, "half")]) +def nrows_skiprows(table_data, request): + """ + Parametrized nrows fixture that accompanies table_data + """ + _, pa_table = table_data + nrows, skiprows = request.param + if nrows == "half": + nrows = len(pa_table) // 2 + if skiprows == "half": + skiprows = (len(pa_table) - nrows) // 2 + return nrows, skiprows + + @pytest.fixture( params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO], ) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py new file mode 100644 index 00000000000..95326a8b681 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py @@ -0,0 +1,280 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import io +import os +from io import StringIO + +import pandas as pd +import pyarrow as pa +import pytest +from utils import ( + _convert_numeric_types_to_floating, + assert_table_and_meta_eq, + make_source, + write_source_str, +) + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + +# Shared kwargs to pass to make_source +_COMMON_CSV_SOURCE_KWARGS = { + "format": "csv", + "index": False, +} + + +@pytest.fixture(scope="module") +def csv_table_data(table_data): + """ + Like the table_data but with nested types dropped + since the CSV reader can't handle that + uint64 is also dropped since it can get confused with int64 + """ + _, pa_table = table_data + pa_table = pa_table.drop_columns( + [ + "col_uint64", + "col_list", + "col_list>", + "col_struct", + "col_struct not null>", + ] + ) + return plc.interop.from_arrow(pa_table), pa_table + + +@pytest.mark.parametrize("delimiter", [",", ";"]) +def test_read_csv_basic( + csv_table_data, + source_or_sink, + text_compression_type, + nrows_skiprows, + delimiter, +): + _, pa_table = csv_table_data + compression_type = text_compression_type + nrows, skiprows = nrows_skiprows + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + source = make_source( + source_or_sink, + pa_table, + compression=compression_type, + sep=delimiter, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + # Rename the table (by reversing the names) to test names argument + pa_table = pa_table.rename_columns(pa_table.column_names[::-1]) + column_names = pa_table.column_names + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + delimiter=delimiter, + 
compression=compression_type, + col_names=column_names, + nrows=nrows, + skiprows=skiprows, + ) + + assert_table_and_meta_eq( + pa_table, + res, + check_types_if_empty=False, + check_names=False if skiprows > 0 and column_names is None else True, + ) + + +# Note: make sure chunk size is big enough so that dtype inference +# infers correctly +@pytest.mark.parametrize("chunk_size", [1000, 5999]) +def test_read_csv_byte_range(table_data, chunk_size, tmp_path): + _, pa_table = table_data + if len(pa_table) == 0: + # pandas writes nothing when we have empty table + # and header=None + pytest.skip("Don't test empty table case") + source = f"{tmp_path}/a.csv" + source = make_source( + source, pa_table, header=False, **_COMMON_CSV_SOURCE_KWARGS + ) + file_size = os.stat(source).st_size + tbls_w_meta = [] + for segment in range((file_size + chunk_size - 1) // chunk_size): + tbls_w_meta.append( + plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + byte_range_offset=segment * chunk_size, + byte_range_size=chunk_size, + header=-1, + col_names=pa_table.column_names, + ) + ) + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_csv(source, names=pa_table.column_names, header=None) + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("usecols", [None, ["col_int64", "col_bool"], [0, 1]]) +def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + # Adjust table for usecols + if usecols is not None: + pa_table = pa_table.select(usecols) + + dtypes, new_fields = _convert_numeric_types_to_floating(pa_table) + # Extract the dtype out of the (name, type, child_types) tuple + # (read_csv doesn't support this format since it doesn't support nested columns) + dtypes = {name: dtype for name, dtype, _ in dtypes} + + new_schema = pa.schema(new_fields) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols + ) + new_table = pa_table.cast(new_schema) + + assert_table_and_meta_eq(new_table, res) + + +@pytest.mark.parametrize("skip_blanks", [True, False]) +@pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) +@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +def test_read_csv_parse_options( + source_or_sink, decimal, quotechar, skip_blanks, lineterminator +): + lines = [ + "# first comment line", + "# third comment line", + "1,2,3,4_4,'z'", + '4,5,6,5_5,""', + "7,8,9,9_87,'123'", + "# last comment line", + "1,1,1,10_11,abc", + ] + buffer = lineterminator.join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + df = pd.read_csv( + StringIO(buffer), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("na_filter", [True, False]) +@pytest.mark.parametrize("na_values", [["n/a"], 
["NV_NAN"]]) +@pytest.mark.parametrize("keep_default_na", [True, False]) +def test_read_csv_na_values( + source_or_sink, na_filter, na_values, keep_default_na +): + lines = ["a,b,c", "n/a,NaN,NV_NAN", "1.0,2.0,3.0"] + buffer = "\n".join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + df = pd.read_csv( + StringIO(buffer), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("header", [0, 10, -1]) +def test_read_csv_header(csv_table_data, source_or_sink, header): + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), header=header + ) + if header > 0: + if header < len(pa_table): + names_row = pa_table.take([header - 1]).to_pylist()[0].values() + pa_table = pa_table.slice(header) + col_names = [str(name) for name in names_row] + pa_table = pa_table.rename_columns(col_names) + else: + pa_table = pa.table([]) + elif header < 0: + # neg header means use user-provided names (in this case nothing) + # (the original column names are now data) + tbl_dict = pa_table.to_pydict() + new_tbl_dict = {} + for i, (name, vals) in enumerate(tbl_dict.items()): + str_vals = [str(val) for val in vals] + new_tbl_dict[str(i)] = [name] + str_vals + pa_table = pa.table(new_tbl_dict) + + assert_table_and_meta_eq( + pa_table, + plc_table_w_meta, + check_types_if_empty=False, + ) + + +# TODO: test these +# str prefix = "", +# bool mangle_dupe_cols = True, +# size_type skipfooter = 0, +# str thousands = None, +# bool delim_whitespace = False, +# bool skipinitialspace = False, +# quote_style quoting = quote_style.MINIMAL, +# bool doublequote = True, +# bool detect_whitespace_around_quotes = False, +# list parse_dates = None, +# list true_values = None, +# list false_values = None, +# bool dayfirst = False, From c6c21d7f9281f295e32ff72c95f95b600470df0e Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 18 Jul 2024 12:41:21 -0700 Subject: [PATCH 080/101] Drop `{{ pin_compatible('numpy', max_pin='x') }}` (#16301) Part of issue: https://github.com/rapidsai/build-planning/issues/82 Drop `{{ pin_compatible('numpy', max_pin='x') }}` as it is no longer needed. `numpy` has its own `run_exports`, which constraints `numpy` to an API compatible version. More details in issue: https://github.com/orgs/rapidsai/projects/132 So `cudf` now uses that in its recipe builds. Also update `requirements/run` to set the `numpy` lower bound to `1.23` as required by us. Lastly add todo comments for NumPy 2 update lines. 
Authors:
  - https://github.com/jakirkham

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16301
---
 conda/recipes/cudf/meta.yaml | 4 +++-
 dependencies.yaml            | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 3cdc2050631..9137f099ad1 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -64,6 +64,7 @@ requirements:
    - rapids-build-backend >=0.3.0,<0.4.0.dev0
    - scikit-build-core >=0.7.0
    - dlpack >=0.8,<1.0
+    # TODO: Change to `2.0` for NumPy 2
    - numpy 1.23
    - pyarrow ==16.1.0.*
    - libcudf ={{ version }}
@@ -82,7 +83,8 @@ requirements:
    - pandas >=2.0,<2.2.3dev0
    - cupy >=12.0.0
    - numba >=0.57
-    - {{ pin_compatible('numpy', max_pin='x') }}
+    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
+    - numpy >=1.23,<2.0a0
    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
    - libcudf ={{ version }}
    - {{ pin_compatible('rmm', max_pin='x.x') }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 67ed3773b44..a19574b7658 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -323,6 +323,7 @@ dependencies:
      packages:
        # Hard pin the patch version used during the build.
        # Sync with conda build constraint & wheel run constraint.
+        # TODO: Change to `2.0.*` for NumPy 2
        - numpy==1.23.*
  build_python_cudf:
    common:
@@ -551,6 +552,7 @@ dependencies:
      - output_types: [conda, requirements, pyproject]
        packages:
          - fsspec>=0.6.0
+          # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0`
          - numpy>=1.23,<2.0a0
          - pandas>=2.0,<2.2.3dev0
  run_cudf:

From aeef0a1f4159d4c87f987d20225401040973d10f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Jul 2024 16:56:30 -0400
Subject: [PATCH 081/101] Remove hash_character_ngrams dependency from
 jaccard_index (#16241)

Removes the internal dependency on `nvtext::hash_character_ngrams` from `nvtext::jaccard_index`. This works around the size-type limit imposed by `hash_character_ngrams`, which returns a `list` column. It also specializes the hashing logic for the jaccard calculation. The overall algorithm has not changed. Code has moved around a bit, and internal list-columns have been replaced with plain offsets and values vectors.
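For context, the per-row result still combines the counts produced by these kernels with the usual Jaccard ratio, J(A, B) = |A ∩ B| / (|A| + |B| - |A ∩ B|). A minimal host-side sketch of that final arithmetic (the function name and signature are illustrative, not the internal API):

```cpp
// Hypothetical helper mirroring the per-row math: given the unique-hash
// counts of the two rows and the count of hashes common to both,
// compute J(A, B) = |A ∩ B| / (|A| + |B| - |A ∩ B|).
float jaccard_value(int uniques1, int uniques2, int intersects)
{
  auto const unions = uniques1 + uniques2 - intersects;  // |A ∪ B|
  return unions > 0 ? static_cast<float>(intersects) / static_cast<float>(unions) : 0.f;
}
```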
Closes #16157

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16241
---
 cpp/benchmarks/text/jaccard.cpp |   4 +-
 cpp/src/text/jaccard.cu         | 478 ++++++++++++++++++++++----------
 2 files changed, 339 insertions(+), 143 deletions(-)

diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp
index d05c195d077..d5b74da6773 100644
--- a/cpp/benchmarks/text/jaccard.cpp
+++ b/cpp/benchmarks/text/jaccard.cpp
@@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state)

 NVBENCH_BENCH(bench_jaccard)
   .set_name("jaccard")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("num_rows", {32768, 131072, 262144})
+  .add_int64_axis("row_width", {128, 512, 1024, 2048})
   .add_int64_axis("substring_width", {5, 10});
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 9cf934165f6..e465fb79c89 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -19,16 +19,19 @@
 #include
 #include
 #include
+#include
 #include
-#include
+#include
+#include
+#include
 #include
 #include
 #include
-#include
 #include
 #include
+#include
 #include
 #include

 #include
 #include
 #include
+#include
+#include
+#include
 #include

 namespace nvtext {
 namespace detail {
 namespace {

+constexpr cudf::thread_index_type block_size       = 256;
+constexpr cudf::thread_index_type bytes_per_thread = 4;
+
 /**
  * @brief Retrieve the row data (span) for the given column/row-index
  *
- * @param d_input Input lists column
+ * @param values Flat vector of all values
+ * @param offsets Offsets identifying rows within values
+ * @param row_idx Row index to retrieve
  * @return A device-span of the row values
  */
-__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx)
+__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx)
 {
-  auto const offsets =
-    d_input.child(cudf::lists_column_view::offsets_column_index).data<cudf::size_type>();
-  auto const offset = offsets[idx];
-  auto const size   = offsets[idx + 1] - offset;
-  auto const begin =
-    d_input.child(cudf::lists_column_view::child_column_index).data<uint32_t>() + offset;
+  auto const offset = offsets[row_idx];
+  auto const size   = offsets[row_idx + 1] - offset;
+  auto const begin  = values + offset;
   return cudf::device_span<uint32_t const>(begin, size);
 }

 /**
- * @brief Count the unique values within each row of the input column
+ * @brief Kernel to count the unique values within each row of the input column
+ *
+ * This is called with a warp per row.
  *
- * This is called with a warp per row
+ * @param d_values Sorted hash values to count uniqueness
+ * @param d_offsets Offsets to each set of row elements in d_values
+ * @param rows Number of rows in the output
+ * @param d_results Number of unique values in each row
  */
-struct sorted_unique_fn {
-  cudf::column_device_view const d_input;
-  cudf::size_type* d_results;
+CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values,
+                                  int64_t const* d_offsets,
+                                  cudf::size_type rows,
+                                  cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }

-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;

-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
-    auto const row      = get_row(d_input, row_idx);
-    auto const begin    = row.begin();
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const row      = get_row(d_values, d_offsets, row_idx);
+  auto const begin    = row.begin();

-    cudf::size_type count = 0;
-    for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
-      count += (itr == begin || *itr != *(itr - 1));
-    }
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+  cudf::size_type count = 0;
+  for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
+    count += (itr == begin || *itr != *(itr - 1));
   }
-};
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}

-rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view const& input,
+/**
+ * @brief Count the unique values within each row of the input column
+ *
+ * @param values Sorted hash values to count uniqueness
+ * @param offsets Offsets to each set of row elements in values
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of unique values
+ */
+rmm::device_uvector<cudf::size_type> compute_unique_counts(uint32_t const* values,
+                                                           int64_t const* offsets,
+                                                           cudf::size_type rows,
                                                            rmm::cuda_stream_view stream)
 {
-  auto const d_input = cudf::column_device_view::create(input, stream);
-  auto d_results     = rmm::device_uvector<cudf::size_type>(input.size(), stream);
-  sorted_unique_fn fn{*d_input, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size() * cudf::detail::warp_size,
-                     fn);
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_unique_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values, offsets, rows, d_results.data());
   return d_results;
 }

+/**
+ * @brief Kernel to count the number of common values within each row of the 2 input columns
+ *
+ * This is called with a warp per row.
+ *
+ * @param d_values1 Sorted hash values to check against d_values2
+ * @param d_offsets1 Offsets to each set of row elements in d_values1
+ * @param d_values2 Sorted hash values to check against d_values1
+ * @param d_offsets2 Offsets to each set of row elements in d_values2
+ * @param rows Number of rows in the output
+ * @param d_results Number of common values in each row
+ */
+CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1,
+                                     int64_t const* d_offsets1,
+                                     uint32_t const* d_values2,
+                                     int64_t const* d_offsets2,
+                                     cudf::size_type rows,
+                                     cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }
+
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
+
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  auto const needles  = get_row(d_values1, d_offsets1, row_idx);
+  auto const haystack = get_row(d_values2, d_offsets2, row_idx);
+
+  auto begin     = haystack.begin();
+  auto const end = haystack.end();
+
+  cudf::size_type count = 0;
+  for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
+       itr += cudf::detail::warp_size) {
+    if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
+    // search haystack for this needle (*itr)
+    auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
+    count += (found != end) && (*found == *itr);  // increment if found
+    begin = found;  // shorten the next lower-bound range
+  }
+  // sum up the counts across this warp
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}
+
 /**
  * @brief Count the number of common values within each row of the 2 input columns
  *
- * This is called with a warp per row
+ * @param values1 Sorted hash values to check against values2
+ * @param offsets1 Offsets to each set of row elements in values1
+ * @param values2 Sorted hash values to check against values1
+ * @param offsets2 Offsets to each set of row elements in values2
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of common values
  */
-struct sorted_intersect_fn {
-  cudf::column_device_view const d_input1;
-  cudf::column_device_view const d_input2;
-  cudf::size_type* d_results;
+rmm::device_uvector<cudf::size_type> compute_intersect_counts(uint32_t const* values1,
+                                                              int64_t const* offsets1,
+                                                              uint32_t const* values2,
+                                                              int64_t const* offsets2,
+                                                              cudf::size_type rows,
+                                                              rmm::cuda_stream_view stream)
+{
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_intersect_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values1, offsets1, values2, offsets2, rows, d_results.data());
+  return d_results;
+}

-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+/**
+ * @brief Counts the number of substrings in each row of the given strings column
+ *
+ * Each warp processes a single string.
+ * The formula is `count = max(1, str.length() - width + 1)`.
+ * If a string has fewer than width characters (but is not empty), the count is 1
+ * since the entire string is still hashed.
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_counts Output number of substrings per row of input
+ */
+CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings,
+                                         cudf::size_type width,
+                                         int64_t* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }

-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }

-    auto const needles  = get_row(d_input1, row_idx);
-    auto const haystack = get_row(d_input2, row_idx);
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_counts[str_idx] = 0;
+    return;
+  }

-    auto begin     = haystack.begin();
-    auto const end = haystack.end();
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;

-    // TODO: investigate cuCollections device-side static-map to match row values
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const lane_idx   = idx % cudf::detail::warp_size;
+  cudf::size_type count = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+}
+
+/**
+ * @brief Kernel to hash the substrings for each input row
+ *
+ * Each warp processes a single string.
+ * Substrings of string "hello world" with width=4 produce:
+ * "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld"
+ * Each of these substrings is hashed and the hash stored in d_results
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_output_offsets Offsets into d_results
+ * @param d_results Hash values for each substring
+ */
+CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings,
+                                       cudf::size_type width,
+                                       int64_t const* d_output_offsets,
+                                       uint32_t* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }

-    cudf::size_type count = 0;
-    for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
-         itr += cudf::detail::warp_size) {
-      if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
-      // search haystack for this needle (*itr)
-      auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
-      count += (found != end) && (*found == *itr);  // increment if found
-      begin = found;  // shorten the next lower-bound range
+  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ uint32_t hvs[block_size];  // temp store for hash values
+
+  auto const hasher     = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+
+  auto d_hashes = d_results + d_output_offsets[str_idx];
+  auto itr      = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    uint32_t hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width);
+      // hash only if we have the full width of characters or this is the beginning of the string
+      if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); }
     }
-    // sum up the counts across this warp
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values for this warp into d_hashes
+      auto const hashes     = &hvs[threadIdx.x];
+      auto const hashes_end = hashes + cudf::detail::warp_size;
+      d_hashes =
+        thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; });
+    }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
   }
-};
+}

-rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view const& input1,
-                                                              cudf::column_view const& input2,
-                                                              rmm::cuda_stream_view stream)
+void segmented_sort(uint32_t const* input,
+                    uint32_t* output,
+                    int64_t items,
+                    cudf::size_type segments,
+                    int64_t const* offsets,
+                    rmm::cuda_stream_view stream)
 {
-  auto const d_input1 = cudf::column_device_view::create(input1, stream);
-  auto const d_input2 = cudf::column_device_view::create(input2, stream);
-  auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input1.size() * cudf::detail::warp_size,
-                     fn);
-  return d_results;
+  rmm::device_buffer temp;
+  std::size_t temp_bytes = 0;
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+  temp = rmm::device_buffer(temp_bytes, stream);
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+}
+
+/**
+ * @brief Create hashes for each substring
+ *
+ * The hashes are sorted using a segmented-sort as setup to
+ * perform the unique and intersect operations.
+ *
+ * @param input Input strings column to hash
+ * @param width Substring width in characters
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The sorted hash values and offsets to each row
+ */
+std::pair<rmm::device_uvector<uint32_t>, rmm::device_uvector<int64_t>> hash_substrings(
+  cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream)
+{
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  // count substrings
+  auto offsets          = rmm::device_uvector<int64_t>(input.size() + 1, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+  count_substrings_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data());
+  auto const total_hashes =
+    cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream);
+
+  // hash substrings
+  rmm::device_uvector<uint32_t> hashes(total_hashes, stream);
+  substring_hash_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data(), hashes.data());
+
+  // sort hashes
+  rmm::device_uvector<uint32_t> sorted(total_hashes, stream);
+  if (total_hashes < static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max())) {
+    segmented_sort(
+      hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream);
+  } else {
+    // The CUB segmented sort can only handle max total values
+    // so this code calls it in sections.
+    auto const section_size   = std::numeric_limits<cudf::size_type>::max() / 2L;
+    auto const sort_sections  = cudf::util::div_rounding_up_safe(total_hashes, section_size);
+    auto const offset_indices = [&] {
+      // build a set of indices that point to offsets subsections
+      auto sub_offsets = rmm::device_uvector<int64_t>(sort_sections + 1, stream);
+      thrust::sequence(
+        rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size);
+      auto indices = rmm::device_uvector<int64_t>(sub_offsets.size(), stream);
+      thrust::lower_bound(rmm::exec_policy(stream),
+                          offsets.begin(),
+                          offsets.end(),
+                          sub_offsets.begin(),
+                          sub_offsets.end(),
+                          indices.begin());
+      return cudf::detail::make_std_vector_sync(indices, stream);
+    }();
+
+    // Call segmented sort with the sort sections
+    for (auto i = 0L; i < sort_sections; ++i) {
+      auto const index1 = offset_indices[i];
+      auto const index2 =
+        std::min(offset_indices[i + 1], static_cast<int64_t>(offsets.size() - 1));
+      auto const offset1 = offsets.element(index1, stream);
+      auto const offset2 = offsets.element(index2, stream);
+
+      auto const num_items    = offset2 - offset1;
+      auto const num_segments = index2 - index1;
+
+      // There is a bug in the CUB segmented sort and the workaround is to
+      // shift the offset values so the first offset is 0.
+      // This transform can be removed once the bug is fixed.
+      auto sort_offsets = rmm::device_uvector<int64_t>(num_segments + 1, stream);
+      thrust::transform(rmm::exec_policy(stream),
+                        offsets.begin() + index1,
+                        offsets.begin() + index2 + 1,
+                        sort_offsets.begin(),
+                        [offset1] __device__(auto const o) { return o - offset1; });
+
+      segmented_sort(hashes.begin() + offset1,
+                     sorted.begin() + offset1,
+                     num_items,
+                     num_segments,
+                     sort_offsets.begin(),
+                     stream);
+    }
+  }
+  return std::make_pair(std::move(sorted), std::move(offsets));
 }

 /**
@@ -186,62 +437,6 @@ struct jaccard_fn {
   }
 };

-/**
- * @brief Create hashes for each substring
- *
- * Uses the hash_character_ngrams to hash substrings of the input column.
- * This returns a lists column where each row is the hashes for the substrings
- * of the corresponding input string row.
- *
- * The hashes are then sorted using a segmented-sort as setup to
- * perform the unique and intersect operations.
- */
-std::unique_ptr<cudf::column> hash_substrings(cudf::strings_column_view const& col,
-                                              cudf::size_type width,
-                                              rmm::cuda_stream_view stream)
-{
-  auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource());
-  auto const input   = cudf::lists_column_view(hashes->view());
-  auto const offsets = input.offsets_begin();
-  auto const data    = input.child().data<uint32_t>();
-
-  rmm::device_uvector<uint32_t> sorted(input.child().size(), stream);
-
-  // this is wicked fast and much faster than using cudf::lists::detail::sort_list
-  rmm::device_buffer d_temp_storage;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-  d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-
-  auto contents = hashes->release();
-  // the offsets are taken from the hashes column since they are the same
-  // before and after the segmented-sort
-  return cudf::make_lists_column(
-    col.size(),
-    std::move(contents.children.front()),
-    std::make_unique<cudf::column>(std::move(sorted), rmm::device_buffer{}, 0),
-    0,
-    rmm::device_buffer{},
-    stream,
-    rmm::mr::get_current_device_resource());
-}
 }  // namespace

 std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& input1,
@@ -261,13 +456,14 @@ std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& input1,

   auto const [d_uniques1, d_uniques2, d_intersects] = [&] {
     // build hashes of the substrings
-    auto const hash1 = hash_substrings(input1, width, stream);
-    auto const hash2 = hash_substrings(input2, width, stream);
+    auto const [hash1, offsets1] = hash_substrings(input1, width, stream);
+    auto const [hash2, offsets2] = hash_substrings(input2, width, stream);

     // compute the unique counts in each set and the intersection counts
-    auto d_uniques1   = compute_unique_counts(hash1->view(), stream);
-    auto d_uniques2   = compute_unique_counts(hash2->view(), stream);
-    auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream);
+    auto d_uniques1 = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream);
+    auto d_uniques2 = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream);
+    auto d_intersects = compute_intersect_counts(
+      hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream);

     return std::tuple{std::move(d_uniques1), std::move(d_uniques2), std::move(d_intersects)};
  }();

From 4acca4d57303f52907aa158a2ef996c9d42a73d6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jul 2024 11:07:07 -1000
Subject: [PATCH 082/101] Use Column.can_cast_safely instead of some ad-hoc
 dtype functions in .where (#16303)

There were a couple of dedicated functions in `python/cudf/cudf/utils/dtypes.py` specific to `.where` that could be subsumed by `Column.can_cast_safely`. The minor downside is that we need to cast `.where`'s argument to a Column first, but IMO it's probably OK given the deduplication.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16303
---
 python/cudf/cudf/core/_internals/where.py | 78 ++++++++++++++----
 python/cudf/cudf/utils/dtypes.py          | 96 +----------------------
 2 files changed, 62 insertions(+), 112 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 4a36be76b6d..6003a0f6aea 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -9,12 +9,7 @@
 import cudf
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.dtypes import CategoricalDtype
-from cudf.utils.dtypes import (
-    _can_cast,
-    _dtype_can_hold_element,
-    find_common_type,
-    is_mixed_with_object_dtype,
-)
+from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype

 if TYPE_CHECKING:
     from cudf._typing import ScalarLike
@@ -44,6 +39,8 @@ def _check_and_cast_columns_with_other(
     inplace: bool,
 ) -> tuple[ColumnBase, ScalarLike | ColumnBase]:
     # Returns type-casted `source_col` & `other` based on `inplace`.
+    from cudf.core.column import as_column
+
     source_dtype = source_col.dtype
     if isinstance(source_dtype, CategoricalDtype):
         return _normalize_categorical(source_col, other)
@@ -84,17 +81,9 @@ def _check_and_cast_columns_with_other(
         )
         return _normalize_categorical(source_col, other.astype(source_dtype))

-    if (
-        _is_non_decimal_numeric_dtype(source_dtype)
-        and not other_is_scalar  # can-cast fails for Python scalars
-        and _can_cast(other, source_dtype)
-    ):
-        common_dtype = source_dtype
-    elif (
-        isinstance(source_col, cudf.core.column.NumericalColumn)
-        and other_is_scalar
-        and _dtype_can_hold_element(source_dtype, other)
-    ):
+    if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
+        other
+    ).can_cast_safely(source_dtype):
         common_dtype = source_dtype
     else:
         common_dtype = find_common_type(
@@ -130,3 +119,58 @@ def _make_categorical_like(result, column):
             ordered=column.ordered,
         )
     return result
+
+
+def _can_cast(from_dtype, to_dtype):
+    """
+    Utility function to determine if we can cast
+    from `from_dtype` to `to_dtype`. This function primarily calls
+    `np.can_cast` but with some special handling around
+    cudf specific dtypes.
+    """
+    if cudf.utils.utils.is_na_like(from_dtype):
+        return True
+    if isinstance(from_dtype, type):
+        from_dtype = cudf.dtype(from_dtype)
+    if isinstance(to_dtype, type):
+        to_dtype = cudf.dtype(to_dtype)
+
+    # TODO : Add precision & scale checking for
+    # decimal types in future
+
+    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+    elif isinstance(from_dtype, np.dtype):
+        if isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype, to_dtype)
+        elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+        elif isinstance(to_dtype, cudf.core.types.CategoricalDtype):
+            return True
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
+        # TODO: Add level based checks too once casting of
+        # list columns is supported
+        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
+            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype._categories.dtype, to_dtype)
+        else:
+            return False
+    else:
+        return np.can_cast(from_dtype, to_dtype)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 59e5ec1df04..af912bee342 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -10,8 +10,6 @@
 from pandas.core.dtypes.common import infer_dtype_from_object

 import cudf
-from cudf._typing import DtypeObj
-from cudf.api.types import is_bool, is_float, is_integer

 """Map numpy dtype to pyarrow types.
 Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
@@ -584,61 +582,6 @@ def _dtype_pandas_compatible(dtype):
     return dtype


-def _can_cast(from_dtype, to_dtype):
-    """
-    Utility function to determine if we can cast
-    from `from_dtype` to `to_dtype`. This function primarily calls
-    `np.can_cast` but with some special handling around
-    cudf specific dtypes.
- """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) - - def _maybe_convert_to_default_type(dtype): """Convert `dtype` to default if specified by user. @@ -661,44 +604,7 @@ def _maybe_convert_to_default_type(dtype): return dtype -def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: - if not len(rng): - return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) - - -def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: - if dtype.kind in {"i", "u"}: - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return True - return False - - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return True - return False - - elif dtype.kind == "f": - if is_integer(element) or is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return True - # otherwise e.g. overflow see TestCoercionFloat32 - return False - - elif dtype.kind == "b": - if is_bool(element): - return True - return False - - raise NotImplementedError(f"Unsupported dtype: {dtype}") - - -def _get_base_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: # TODO: replace the use of this function with just `dtype.base` # when Pandas 2.1.0 is the minimum version we support: # https://github.com/pandas-dev/pandas/pull/52706 From debbef0bc12f523054740432983030dd0b24f9c4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 19 Jul 2024 15:12:56 +0100 Subject: [PATCH 083/101] Update vendored thread_pool implementation (#16210) Since we introduced the vendored thread_pool in #8752, upstream has introduced some new features, and particularly now uses condition variables/notification to handle when there are no tasks in the queue. This avoids the issue described in #16209 where the thread pool by default artificially introduces a delay of 1000microseconds to all tasks whenever the task queue is emptied. 
- Closes #16209 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16210 --- cpp/CMakeLists.txt | 4 +- .../groupby/group_max_multithreaded.cpp | 10 +- .../io/orc/orc_reader_multithreaded.cpp | 26 +- .../io/parquet/parquet_reader_multithread.cpp | 26 +- cpp/cmake/thirdparty/get_thread_pool.cmake | 31 ++ cpp/include/cudf/utilities/thread_pool.hpp | 381 ------------------ cpp/src/io/utilities/file_io_utilities.cpp | 6 +- cpp/src/io/utilities/file_io_utilities.hpp | 7 +- 8 files changed, 66 insertions(+), 425 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_thread_pool.cmake delete mode 100644 cpp/include/cudf/utilities/thread_pool.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 903cff27be4..65347bd6689 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) +# find thread_pool +include(cmake/thirdparty/get_thread_pool.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -804,7 +806,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp index 3b8faba618f..bf1a1a5fcf7 100644 --- a/cpp/benchmarks/groupby/group_max_multithreaded.cpp +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -20,8 +20,8 @@ #include #include #include -#include +#include #include template @@ -58,7 +58,7 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list> requests(num_threads); for (auto& thread_requests : requests) { @@ -75,10 +75,8 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list #include #include -#include +#include #include #include @@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - 
cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..9e76ebb71ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include +#include #include #include @@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake new file mode 100644 index 00000000000..264257c7199 --- /dev/null +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -0,0 +1,31 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds rmm and sets any additional necessary environment variables. +function(find_and_configure_thread_pool) + rapids_cpm_find( + BS_thread_pool 4.1.0 + CPM_ARGS + GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git + GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 + GIT_SHALLOW TRUE + ) + if(NOT TARGET BS_thread_pool) + add_library(BS_thread_pool INTERFACE) + target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) + target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") + endif() +endfunction() + +find_and_configure_thread_pool() diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp deleted file mode 100644 index c8c3eb097c4..00000000000 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/** - * Modified from https://github.com/bshoshany/thread-pool - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. - * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move, std::swap - -namespace cudf { -namespace detail { - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a - * thread becomes available, it pops a task from the queue and executes it. Each task is - * automatically assigned a future, which can be used to wait for the task to finish executing - * and/or obtain its eventual return value. - */ -class thread_pool { - using ui32 = int; - - public: - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. 
If the argument is zero, the default value will be used - * instead. - */ - thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), - threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. - * Note that if the variable paused is set to true, then any tasks still in the queue will never - * be executed. - */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - [[nodiscard]] size_t get_tasks_queued() const - { - std::scoped_lock const lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. - */ - [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a - * thread. - * - * @return The total number of tasks. - */ - [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - [[nodiscard]] ui32 get_thread_count() const { return thread_count; } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the - * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to: - * for (T i = first_index; i <= last_index; i++) loop(i); - * - * @tparam T The type of the loop index. Should be a signed or unsigned integer. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop (inclusive). - * @param last_index The last index in the loop (inclusive). - * @param loop The function to loop through. Should take exactly one argument, the loop index. - * @param num_tasks The maximum number of tasks to split the loop into. The default is to use the - * number of threads in the pool. - */ - template - void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0) - { - if (num_tasks == 0) num_tasks = thread_count; - if (last_index < first_index) std::swap(last_index, first_index); - size_t total_size = last_index - first_index + 1; - size_t block_size = total_size / num_tasks; - if (block_size == 0) { - block_size = 1; - num_tasks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_tasks; t++) { - T start = (T)(t * block_size + first_index); - T end = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1); - blocks_running++; - push_task([start, end, &loop, &blocks_running] { - for (T i = start; i <= end; i++) - loop(i); - blocks_running--; - }); - } - while (blocks_running != 0) { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. 
-   */
-  template <typename F>
-  void push_task(F const& task)
-  {
-    tasks_total++;
-    {
-      std::scoped_lock const lock(queue_mutex);
-      tasks.push(std::function<void()>(task));
-    }
-  }
-
-  /**
-   * @brief Push a function with arguments, but no return value, into the task queue.
-   * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks
-   * in the queue must be of type std::function<void()>, so they cannot have any arguments or return
-   * value. If no arguments are provided, the other overload will be used, in order to avoid the
-   * (slight) overhead of using a lambda.
-   *
-   * @tparam F The type of the function.
-   * @tparam A The types of the arguments.
-   * @param task The function to push.
-   * @param args The arguments to pass to the function.
-   */
-  template <typename F, typename... A>
-  void push_task(F const& task, A const&... args)
-  {
-    push_task([task, args...] { task(args...); });
-  }
-
-  /**
-   * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be
-   * completed, then destroys all threads in the pool and creates a new thread pool with the new
-   * number of threads. Any tasks that were waiting in the queue before the pool was reset will then
-   * be executed by the new threads. If the pool was paused before resetting it, the new pool will
-   * be paused as well.
-   *
-   * @param _thread_count The number of threads to use. The default value is the total number of
-   * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this
-   * will be twice the number of CPU cores. If the argument is zero, the default value will be used
-   * instead.
-   */
-  void reset(ui32 const& _thread_count = std::thread::hardware_concurrency())
-  {
-    bool was_paused = paused;
-    paused          = true;
-    wait_for_tasks();
-    running = false;
-    destroy_threads();
-    thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency();
-    threads      = std::make_unique<std::thread[]>(thread_count);
-    paused       = was_paused;
-    create_threads();
-    running = true;
-  }
-
-  /**
-   * @brief Submit a function with zero or more arguments and a return value into the task queue,
-   * and get a future for its eventual returned value.
-   *
-   * @tparam F The type of the function.
-   * @tparam A The types of the zero or more arguments to pass to the function.
-   * @tparam R The return type of the function.
-   * @param task The function to submit.
-   * @param args The zero or more arguments to pass to the function.
-   * @return A future to be used later to obtain the function's returned value, waiting for it to
-   * finish its execution if needed.
-   */
-  template <typename F,
-            typename... A,
-            typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...>>
-  std::future<R> submit(F const& task, A const&... args)
-  {
-    std::shared_ptr<std::promise<R>> promise(new std::promise<R>);
-    std::future<R> future = promise->get_future();
-    push_task([task, args..., promise] {
-      try {
-        if constexpr (std::is_void_v<R>) {
-          task(args...);
-          promise->set_value();
-        } else {
-          promise->set_value(task(args...));
-        }
-      } catch (...) {
-        promise->set_exception(std::current_exception());
-      };
-    });
-    return future;
-  }
-
-  /**
-   * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those
-   * that are currently running in the threads and those that are still waiting in the queue.
-   * However, if the variable paused is set to true, this function only waits for the currently
-   * running tasks (otherwise it would wait forever). To wait for a specific task, use submit()
-   * instead, and call the wait() member function of the generated future.
-   */
-  void wait_for_tasks()
-  {
-    while (true) {
-      if (!paused) {
-        if (tasks_total == 0) break;
-      } else {
-        if (get_tasks_running() == 0) break;
-      }
-      sleep_or_yield();
-    }
-  }
-
-  /**
-   * @brief An atomic variable indicating to the workers to pause. When set to true, the workers
-   * temporarily stop popping new tasks out of the queue, although any tasks already executed will
-   * keep running until they are done. Set to false again to resume popping tasks.
-   */
-  std::atomic<bool> paused = false;
-
-  /**
-   * @brief The duration, in microseconds, that the worker function should sleep for when it cannot
-   * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will
-   * execute std::this_thread::yield() if there are no tasks in the queue. The default value is
-   * 1000.
-   */
-  ui32 sleep_duration = 1000;
-
- private:
-  /**
-   * @brief Create the threads in the pool and assign a worker to each thread.
-   */
-  void create_threads()
-  {
-    for (ui32 i = 0; i < thread_count; i++) {
-      threads[i] = std::thread(&thread_pool::worker, this);
-    }
-  }
-
-  /**
-   * @brief Destroy the threads in the pool by joining them.
-   */
-  void destroy_threads()
-  {
-    for (ui32 i = 0; i < thread_count; i++) {
-      threads[i].join();
-    }
-  }
-
-  /**
-   * @brief Try to pop a new task out of the queue.
-   *
-   * @param task A reference to the task. Will be populated with a function if the queue is not
-   * empty.
-   * @return true if a task was found, false if the queue is empty.
-   */
-  bool pop_task(std::function<void()>& task)
-  {
-    std::scoped_lock const lock(queue_mutex);
-    if (tasks.empty())
-      return false;
-    else {
-      task = std::move(tasks.front());
-      tasks.pop();
-      return true;
-    }
-  }
-
-  /**
-   * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead.
-   *
-   */
-  void sleep_or_yield()
-  {
-    if (sleep_duration)
-      std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration));
-    else
-      std::this_thread::yield();
-  }
-
-  /**
-   * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out
-   * of the queue and executes them, as long as the atomic variable running is set to true.
-   */
-  void worker()
-  {
-    while (running) {
-      std::function<void()> task;
-      if (!paused && pop_task(task)) {
-        task();
-        tasks_total--;
-      } else {
-        sleep_or_yield();
-      }
-    }
-  }
-
-  /**
-   * @brief A mutex to synchronize access to the task queue by different threads.
-   */
-  mutable std::mutex queue_mutex;
-
-  /**
-   * @brief An atomic variable indicating to the workers to keep running. When set to false, the
-   * workers permanently stop working.
-   */
-  std::atomic<bool> running = true;
-
-  /**
-   * @brief A queue of tasks to be executed by the threads.
-   */
-  std::queue<std::function<void()>> tasks;
-
-  /**
-   * @brief The number of threads in the pool.
-   */
-  ui32 thread_count;
-
-  /**
-   * @brief A smart pointer to manage the memory allocated for the threads.
-   */
-  std::unique_ptr<std::thread[]> threads;
-
-  /**
-   * @brief An atomic variable to keep track of the total number of unfinished tasks - either still
-   * in the queue, or running in a thread.
-   */
-  std::atomic<ui32> tasks_total = 0;
-};
-
-}  // namespace detail
-}  // namespace cudf
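For orientation, the contract of the header deleted above (submit a callable, receive a future, optionally drain the whole queue) is the same one exposed by Python's standard-library executors. A minimal sketch of the equivalent pattern, purely illustrative and not taken from this patch:

    from concurrent.futures import ThreadPoolExecutor, wait

    # submit() hands back a future, like thread_pool::submit above;
    # wait() plays the role of wait_for_tasks().
    with ThreadPoolExecutor(max_workers=16) as pool:
        futures = [pool.submit(pow, 2, n) for n in range(8)]
        wait(futures)
        print([f.result() for f in futures])  # [1, 2, 4, ..., 128]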
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 9fe5959436d..d7b54399f8d 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -223,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath)
     // The benefit from multithreaded read plateaus around 16 threads
     pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
 {
-  pool.sleep_duration = 10;
 }
 
 namespace {
@@ -232,14 +231,15 @@ template <typename F, typename DataT, typename ResultT = std::invoke_result_t<F, DataT*, size_t, size_t>>
 std::vector<std::future<ResultT>> make_sliced_tasks(
-  F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
+  F function, DataT* ptr, size_t offset, size_t size, BS::thread_pool& pool)
 {
   constexpr size_t default_max_slice_size = 4 * 1024 * 1024;
   static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size);
   auto const slices = make_file_io_slices(size, max_slice_size);
   std::vector<std::future<ResultT>> slice_tasks;
   std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) {
-    return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset);
+    return pool.submit_task(
+      [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); });
   });
   return slice_tasks;
 }
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 91ef41fba6e..441bede200d 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -19,8 +19,7 @@
 #ifdef CUFILE_FOUND
 #include <cufile.h>
-#include <cudf/utilities/thread_pool.hpp>
-
+#include <BS_thread_pool.hpp>
 #endif
@@ -150,7 +149,7 @@ class cufile_input_impl final : public cufile_input {
  private:
   cufile_shim const* shim = nullptr;
   cufile_registered_file const cf_file;
-  cudf::detail::thread_pool pool;
+  BS::thread_pool pool;
 };
 
 /**
@@ -167,7 +166,7 @@ class cufile_output_impl final : public cufile_output {
  private:
   cufile_shim const* shim = nullptr;
   cufile_registered_file const cf_file;
-  cudf::detail::thread_pool pool;
+  BS::thread_pool pool;
 };
 
 #else

From 8ff27ed5bcaf8fc5fc8d1f546dee30c59861c320 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 19 Jul 2024 15:15:20 +0100
Subject: [PATCH 084/101] Support Literals in groupby-agg (#16218)

To do this, we just need to collect the appropriate aggregation
information, and broadcast literals to the correct size.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16218
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 15 +++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py   |  4 ++--
 python/cudf_polars/tests/test_groupby.py   | 17 +++++++++++++++++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index f37cb3f475c..a034d55120a 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -370,6 +370,10 @@ def do_evaluate(
         # datatype of pyarrow scalar is correct by construction.
         return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
 
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 
 class LiteralColumn(Expr):
     __slots__ = ("value",)
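The literal-broadcasting behaviour described in the commit message can be exercised directly from the polars API; a small illustrative query (the frame contents are invented for the example):

    import polars as pl

    df = pl.LazyFrame({"key": [1, 1, 2], "value": [1.0, 2.0, 3.0]})
    # pl.lit(1) is a Literal: the executor evaluates it to a length-1
    # column and broadcasts it to one row per group alongside the sum.
    q = df.group_by("key").agg(pl.lit(1).alias("one"), pl.col("value").sum())
    print(q.collect())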
@@ -382,6 +386,13 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
 
+    def get_hash(self) -> int:
+        """Compute a hash of the column."""
+        # This is stricter than necessary, but we only need this hash
+        # for identity in groupby replacements so it's OK. And this
+        # way we avoid doing potentially expensive compute.
+        return hash((type(self), self.dtype, id(self.value)))
+
     def do_evaluate(
         self,
         df: DataFrame,
@@ -393,6 +404,10 @@ def do_evaluate(
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 
 class Col(Expr):
     __slots__ = ("name",)
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index cce0c4a3d94..01834ab75a5 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -514,7 +514,7 @@ def check_agg(agg: expr.Expr) -> int:
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
             return 1 + max(GroupBy.check_agg(child) for child in agg.children)
-        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)):
+        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)):
             return 0
         else:
             raise NotImplementedError(f"No handler for {agg=}")
@@ -574,7 +574,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         results = [
             req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests
         ]
-        return DataFrame([*result_keys, *results]).slice(self.options.slice)
+        return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice)
 
 
 @dataclasses.dataclass
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index b07d8e38217..b650fee5079 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -155,3 +155,20 @@ def test_groupby_nan_minmax_raises(op):
     q = df.group_by("key").agg(op(pl.col("value")))
 
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize("key", [1, pl.col("key1")])
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.lit(1).alias("value"),
+        pl.lit([[4, 5, 6]]).alias("value"),
+        pl.col("float") * (1 - pl.col("int")),
+        [pl.lit(2).alias("value"), pl.col("float") * 2],
+    ],
+)
+def test_groupby_literal_in_agg(df, key, expr):
+    # check_row_order=False doesn't work for list aggregations
+    # so just sort by the group key
+    q = df.group_by(key).agg(expr).sort(key, maintain_order=True)
+    assert_gpu_result_equal(q)

From 9a713e3adb8abb1f41de0445b8ea896fdb48c560 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 19 Jul 2024 10:34:16 -0400
Subject: [PATCH 085/101] Migrate lists/count_elements to pylibcudf (#16072)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16072
---
 python/cudf/cudf/_lib/lists.pyx               | 18 +++----------
 .../libcudf/lists/count_elements.pxd          |  2 +-
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 27 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 10 +++++++
 5 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index ceae1b148aa..76f37c3b845 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -8,9 +8,6 @@ from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
-    count_elements as cpp_count_elements,
-)
 from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
     lists_column_view,
 )
@@ -36,19 +33,10 @@ from cudf._lib.pylibcudf cimport Scalar
 
 @acquire_spill_lock()
 def count_elements(Column col):
-
-    # shared_ptr required because lists_column_view has no default
-    # ctor
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.count_elements(
+            col.to_pylibcudf(mode="read"))
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_count_elements(list_view.get()[0]))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
index 38bdd4db0bb..ba57a839fbc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
@@ -9,4 +9,4 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
 
 cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
 
-    cdef unique_ptr[column] count_elements(const lists_column_view) except +
+    cdef unique_ptr[column] count_elements(const lists_column_view&) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 38a479e4791..38eb575ee8d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -33,3 +33,5 @@ cpdef Column reverse(Column)
 cpdef Column segmented_gather(Column, Column)
 
 cpdef Column extract_list_element(Column, ColumnOrSizeType)
+
+cpdef Column count_elements(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 19c961aa014..ea469642dd5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -17,6 +17,9 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
+from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
+    count_elements as cpp_count_elements,
+)
 from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
     extract_list_element as cpp_extract_list_element,
 )
@@ -293,3 +296,27 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
         index.view() if ColumnOrSizeType is Column else index,
     ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_elements(Column input):
+    """Count the number of elements in each list
+    of the given lists column.
+    For details, see :cpp:func:`count_elements`.
+ + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of the lengths of each list element + """ + cdef ListColumnView list_view = input.list_view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_count_elements(list_view.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 07ecaed5012..7cfed884f90 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -181,3 +181,13 @@ def test_extract_list_element_column(test_data): expect = pa.array([0, None, None, 7]) assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.count_elements(plc_column) + + expect = pa.array([1, 1, 0, 3], type=pa.int32()) + + assert_column_eq(expect, res) From 2bbeee95ec338c30c0c876dc6a58376fbb0a5a06 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Fri, 19 Jul 2024 12:43:49 -0400 Subject: [PATCH 086/101] DOC: use intersphinx mapping in pandas-compat ext (#15846) ~~If https://github.com/rapidsai/cudf/pull/15704 is merged~~ This PR changes the header in the admonition (pandas compat box) to be hyperlinked to the pandas docs instead of just text. See https://raybellwaves.github.io/compatsphinxext/compat.html which is the docs of a minimal repo where I have been testing Authors: - Ray Bell (https://github.com/raybellwaves) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15846 --- .../source/developer_guide/documentation.md | 2 +- python/cudf/cudf/core/column/lists.py | 12 +++++- python/cudf/cudf/core/column/string.py | 16 ++++---- python/cudf/cudf/core/dataframe.py | 37 ++++++++++--------- python/cudf/cudf/core/frame.py | 10 ++--- python/cudf/cudf/core/groupby/groupby.py | 9 +++-- python/cudf/cudf/core/indexed_frame.py | 28 +++++++------- python/cudf/cudf/core/series.py | 14 +++---- python/cudf/cudf/core/tools/numeric.py | 2 +- python/cudf/cudf/core/window/ewm.py | 2 +- 10 files changed, 72 insertions(+), 60 deletions(-) diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index c8da689479c..4f5a57fec02 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -164,7 +164,7 @@ The directive should be used inside docstrings like so: Docstring body .. pandas-compat:: - **$API_NAME** + :meth:`pandas.DataFrame.METHOD` Explanation of differences ``` diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index cc15e78314e..46b844413f7 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -646,9 +646,17 @@ def sort_values( dtype: list .. pandas-compat:: - **ListMethods.sort_values** + `pandas.Series.list.sort_values` - The ``inplace`` and ``kind`` arguments are currently not supported. 
+ This method does not exist in pandas but it can be run + as: + + >>> import pandas as pd + >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) + >>> print(s.apply(sorted)) + 0 [1, 2, 3] + 1 [2, 3, 4] + dtype: object """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 96f9cdfd655..ec95c50f455 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -612,7 +612,7 @@ def extract( dtype: object .. pandas-compat:: - **StringMethods.extract** + :meth:`pandas.Series.str.extract` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -738,7 +738,7 @@ def contains( dtype: bool .. pandas-compat:: - **StringMethods.contains** + :meth:`pandas.Series.str.contains` The parameters `case` and `na` are not yet supported and will raise a NotImplementedError if anything other than the default @@ -974,7 +974,7 @@ def replace( dtype: object .. pandas-compat:: - **StringMethods.replace** + :meth:`pandas.Series.str.replace` The parameters `case` and `flags` are not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) .. pandas-compat:: - **StringMethods.partition** + :meth:`pandas.Series.str.partition` The parameter `expand` is not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Index([0, 0, 2, 1], dtype='int64') .. pandas-compat:: - **StringMethods.count** + :meth:`pandas.Series.str.count` - `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: dtype: list .. pandas-compat:: - **StringMethods.findall** + :meth:`pandas.Series.str.findall` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool .. pandas-compat:: - **StringMethods.endswith** + :meth:`pandas.Series.str.endswith` `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. @@ -4264,7 +4264,7 @@ def match( dtype: bool .. pandas-compat:: - **StringMethods.match** + :meth:`pandas.Series.str.match` Parameters `case` and `na` are currently not supported. The `flags` parameter currently only supports re.DOTALL and diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b3d938829c9..f06e45277e2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2750,7 +2750,7 @@ def reindex( Chrome 200 0.02 .. pandas-compat:: - **DataFrame.reindex** + :meth:`pandas.DataFrame.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -3350,7 +3350,7 @@ def diff(self, periods=1, axis=0): 5 2 5 20 .. pandas-compat:: - **DataFrame.diff** + :meth:`pandas.DataFrame.diff` Diff currently only supports numeric dtype columns. """ @@ -3555,7 +3555,7 @@ def rename( 30 3 6 .. pandas-compat:: - **DataFrame.rename** + :meth:`pandas.DataFrame.rename` * Not Supporting: level @@ -3670,7 +3670,7 @@ def agg(self, aggs, axis=None): ``DataFrame`` is returned. .. 
pandas-compat::
-            **DataFrame.agg**
+            :meth:`pandas.DataFrame.agg`
 
             * Not supporting: ``axis``, ``*args``, ``**kwargs``
 
@@ -3843,7 +3843,7 @@ def nlargest(self, n, columns, keep="first"):
         Brunei      434000    12128      BN
 
         .. pandas-compat::
-            **DataFrame.nlargest**
+            :meth:`pandas.DataFrame.nlargest`
 
             - Only a single column is supported in *columns*
         """
@@ -3915,7 +3915,7 @@ def nsmallest(self, n, columns, keep="first"):
         Nauru    337000    182      NR
 
         .. pandas-compat::
-            **DataFrame.nsmallest**
+            :meth:`pandas.DataFrame.nsmallest`
 
             - Only a single column is supported in *columns*
         """
@@ -3997,7 +3997,7 @@ def transpose(self):
         a new (ncol x nrow) dataframe. self is (nrow x ncol)
 
         .. pandas-compat::
-            **DataFrame.transpose, DataFrame.T**
+            :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T`
 
             Not supporting *copy* because default and only behavior is
            copy=True
@@ -4188,7 +4188,7 @@ def merge(
         from both sides.
 
         .. pandas-compat::
-            **DataFrame.merge**
+            :meth:`pandas.DataFrame.merge`
 
             DataFrames merges in cuDF result in non-deterministic row
            ordering.
@@ -4263,7 +4263,7 @@ def join(
         joined : DataFrame
 
         .. pandas-compat::
-            **DataFrame.join**
+            :meth:`pandas.DataFrame.join`
 
             - *other* must be a single DataFrame for now.
            - *on* is not supported yet due to lack of multi-index support.
@@ -4385,7 +4385,7 @@ def query(self, expr, local_dict=None):
         1 2018-10-08
 
         .. pandas-compat::
-            **DataFrame.query**
+            :meth:`pandas.DataFrame.query`
 
             One difference from pandas is that ``query`` currently only
            supports numeric, datetime, timedelta, or bool dtypes.
@@ -5447,10 +5447,11 @@ def from_arrow(cls, table):
         2  3  6
 
         .. pandas-compat::
-            **DataFrame.from_arrow**
+            `pandas.DataFrame.from_arrow`
 
-            - Does not support automatically setting index column(s) similar
-              to how ``to_pandas`` works for PyArrow Tables.
+            This method does not exist in pandas but it is similar to
+            how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e.
+            it does not support automatically setting index column(s).
         """
         index_col = None
         col_index_names = None
@@ -5884,7 +5885,7 @@ def quantile(
         0.5  2.5  55.0
 
         .. pandas-compat::
-            **DataFrame.quantile**
+            :meth:`pandas.DataFrame.quantile`
 
             One notable difference from Pandas is when DataFrame is of
            non-numeric types and result is expected to be a Series in case of
@@ -6174,7 +6175,7 @@ def count(self, axis=0, numeric_only=False):
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.count**
+            :meth:`pandas.DataFrame.count`
 
             Parameters currently not supported are `axis` and `numeric_only`.
         """
@@ -6412,7 +6413,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         1  2.0
 
         .. pandas-compat::
-            **DataFrame.mode**
+            :meth:`pandas.DataFrame.mode`
 
             ``axis`` parameter is currently not supported.
         """
@@ -7594,7 +7595,7 @@ def interleave_columns(self):
             The interleaved columns as a single column
 
         .. pandas-compat::
-            **DataFrame.interleave_columns**
+            `pandas.DataFrame.interleave_columns`
 
             This method does not exist in pandas but it can be run as
            ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
@@ -7696,7 +7697,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
         4  5  2   7  3
 
         .. pandas-compat::
-            **DataFrame.eval**
+            :meth:`pandas.DataFrame.eval`
 
             * Additional kwargs are not supported.
            * Bitwise and logical operators are not dtype-dependent.
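The :meth: and :attr: roles introduced above only render as hyperlinks when the Sphinx build maps the pandas (and pyarrow) domains through intersphinx. A minimal sketch of the conf.py entries this relies on, shown for orientation only since the actual configuration file is not part of this diff:

    # conf.py (illustrative; the real cudf configuration has more entries)
    extensions = ["sphinx.ext.intersphinx"]
    intersphinx_mapping = {
        "pandas": ("https://pandas.pydata.org/docs/", None),
        "pyarrow": ("https://arrow.apache.org/docs/", None),
    }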
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 802751e47ad..111225a5fc2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where** + :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1641,7 +1641,7 @@ def min( 1 .. pandas-compat:: - **DataFrame.min, Series.min** + :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1689,7 +1689,7 @@ def max( dtype: int64 .. pandas-compat:: - **DataFrame.max, Series.max** + :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1742,7 +1742,7 @@ def all(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all** + :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` Parameters currently not supported are `axis`, `bool_only`, `level`. @@ -1795,7 +1795,7 @@ def any(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any** + :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` Parameters currently not supported are `axis`, `bool_only`, `level`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d2c75715be2..3f91be71f29 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -744,7 +744,8 @@ def _reduce( Computed {op} of values within each group. .. pandas-compat:: - **{cls}.{op}** + :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, + :meth:`pandas.core.groupby.SeriesGroupBy.{op}` The numeric_only, min_count """ @@ -1482,7 +1483,8 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **GroupBy.apply** + :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, + :meth:`pandas.core.groupby.SeriesGroupBy.apply` cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -2358,7 +2360,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Object shifted within each group. .. pandas-compat:: - **GroupBy.shift** + :meth:`pandas.core.groupby.DataFrameGroupBy.shift`, + :meth:`pandas.core.groupby.SeriesGroupBy.shift` Parameter ``freq`` is unsupported. """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30b68574960..77675edc0f0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -497,7 +497,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty** + :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -831,7 +831,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace** + :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1372,7 +1372,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum** + :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1433,7 +1433,7 @@ def product( dtype: int64 .. 
pandas-compat:: - **DataFrame.product, Series.product** + :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` Parameters currently not supported are level`, `numeric_only`. """ @@ -1530,7 +1530,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median** + :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` Parameters currently not supported are `level` and `numeric_only`. """ @@ -1586,7 +1586,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std** + :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` Parameters currently not supported are `level` and `numeric_only` @@ -1645,7 +1645,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var** + :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` Parameters currently not supported are `level` and `numeric_only` @@ -1701,7 +1701,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis** + :meth:`pandas.DataFrame.kurtosis` Parameters currently not supported are `level` and `numeric_only` """ @@ -1763,7 +1763,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** + :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` The `axis` parameter is not currently supported. """ @@ -2229,7 +2229,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate** + :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -2665,7 +2665,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index** + :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` * Not supporting: kind, sort_remaining=False """ @@ -3497,7 +3497,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values** + :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` * Support axis='index' only. * Not supporting: inplace, kind @@ -4008,7 +4008,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample** + :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -4564,7 +4564,7 @@ def sample( 1 2 4 .. pandas-compat:: - **DataFrame.sample, Series.sample** + :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e12cc3d52fb..c9d24890d15 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -960,7 +960,7 @@ def reindex(self, *args, **kwargs): dtype: int64 .. pandas-compat:: - **Series.reindex** + :meth:`pandas.Series.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -1243,7 +1243,7 @@ def map(self, arg, na_action=None) -> "Series": dtype: int64 .. pandas-compat:: - **Series.map** + :meth:`pandas.Series.map` Please note map currently only supports fixed-width numeric type functions. 
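As a concrete reading of the restriction noted above, map accepts element-wise callables only over fixed-width numeric data. A hedged sketch, assuming the usual JIT-compiled apply path and with invented series contents:

    import cudf

    s = cudf.Series([1, 2, 3])
    # A numeric element-wise function is fine; callables that return
    # arbitrary Python objects are not supported.
    print(s.map(lambda x: x * 2 + 1).to_pandas().tolist())  # [3, 5, 7]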
@@ -2094,7 +2094,7 @@ def sort_values( dtype: int64 .. pandas-compat:: - **Series.sort_values** + :meth:`pandas.Series.sort_values` * Support axis='index' only. * The inplace and kind argument is currently unsupported @@ -2550,7 +2550,7 @@ def count(self): 5 .. pandas-compat:: - **Series.count** + :meth:`pandas.Series.count` Parameters currently not supported is `level`. """ @@ -2661,7 +2661,7 @@ def cov(self, other, min_periods=None): -0.015750000000000004 .. pandas-compat:: - **Series.cov** + :meth:`pandas.Series.cov` `min_periods` parameter is not yet supported. """ @@ -3422,7 +3422,7 @@ def rename(self, index=None, copy=True): 'numeric_series' .. pandas-compat:: - **Series.rename** + :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - The ``inplace`` and ``level`` is not supported @@ -4702,7 +4702,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: dtype: object .. pandas-compat:: - **series.DatetimeProperties.strftime** + :meth:`pandas.DatetimeIndex.strftime` The following date format identifiers are not yet supported: ``%c``, ``%x``,``%X`` diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 466d46f7dca..07158e4ee61 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -80,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype: float64 .. pandas-compat:: - **cudf.to_numeric** + :func:`pandas.to_numeric` An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 21693e106bd..bb153d4b549 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -56,7 +56,7 @@ class ExponentialMovingWindow(_RollingBase): the equivalent pandas method. .. pandas-compat:: - **cudf.core.window.ExponentialMovingWindow** + :meth:`pandas.DataFrame.ewm` The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` are not yet supported. Behavior is defined only for data that begins From d5ab48d4f2586d2e45234463c1bbe877ce76afe8 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 19 Jul 2024 14:32:54 -0400 Subject: [PATCH 087/101] Use workflow branch 24.08 again (#16314) After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again. 
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16314 --- .github/workflows/build.yaml | 20 ++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 44 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 22 +++++----- 5 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 937080572ad..2e5959338b0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 1516cb09449..5a937b2f362 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1fe64e7f318..d5dfc9e1ff5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,7 +174,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,7 +194,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 2a8ebd30993..8ca971dc28d 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 + uses: 
rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 73f8d726e77..36c9088d93c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" 
wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} From dc62177a64a5fb4d6521f346ff0f44c2ede740f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 19 Jul 2024 20:17:42 +0100 Subject: [PATCH 088/101] Preserve order in left join for cudf-polars (#16268) Unlike all other joins, polars provides an ordering guarantee for left joins. By default libcudf does not, so we need to order the gather maps in this case. While here, because it requires another hard-coding of `int32` for something that should be `size_type`, expose `type_to_id` in cython and plumb it through. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16268 --- python/cudf/cudf/_lib/pylibcudf/join.pyx | 15 +---- .../libcudf/utilities/type_dispatcher.pxd | 7 +++ python/cudf/cudf/_lib/pylibcudf/types.pyx | 7 ++- python/cudf/cudf/_lib/types.pyx | 4 +- .../cudf_polars/containers/column.py | 3 +- python/cudf_polars/cudf_polars/dsl/ir.py | 58 +++++++++++++++++++ python/cudf_polars/tests/test_join.py | 2 +- 7 files changed, 78 insertions(+), 18 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx index 308b1b39291..2ded84d84d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx @@ -10,12 +10,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.pylibcudf.libcudf cimport join as cpp_join from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport ( - data_type, - null_equality, - size_type, - type_id, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_equality from .column cimport Column from .table cimport Table @@ -23,15 +18,11 @@ from .table cimport Table cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column - cdef device_buffer c_empty - cdef size_type size = dereference(gather_map.get()).size() return Column.from_libcudf( move( make_unique[column]( - data_type(type_id.INT32), - size, - dereference(gather_map.get()).release(), - move(c_empty), + move(dereference(gather_map.get())), + device_buffer(), 0 ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd 
b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd new file mode 100644 index 00000000000..890fca3a662 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.types cimport type_id + + +cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil: + cdef type_id type_to_id[T]() diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 6dbb287f3c4..c45c6071bb3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,8 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip @@ -67,3 +68,7 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret + + +SIZE_TYPE = DataType(type_to_id[size_type]()) +SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index fc672caa574..253fdf7b0d9 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -21,8 +21,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -size_type_dtype = np.dtype("int32") - class TypeId(IntEnum): EMPTY = libcudf_types.type_id.EMPTY @@ -150,6 +148,8 @@ datetime_unit_map = { TypeId.TIMESTAMP_NANOSECONDS: "ns", } +size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] + class Interpolation(IntEnum): LINEAR = ( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 42aba0fcdc0..02018548b2c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -185,8 +185,7 @@ def nan_count(self) -> int: plc.reduce.reduce( plc.unary.is_nan(self.obj), plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), + plc.types.SIZE_TYPE, ) ).as_py() return 0 diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 01834ab75a5..0b14530e0ed 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -653,6 +653,59 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> list[plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. + + Parameters + ---------- + left_rows + Number of rows in left table + lg + Left gather map + left_policy + Nullify policy for left map + right_rows + Number of rows in right table + rg + Right gather map + right_policy + Nullify policy for right map + + Returns + ------- + list of reordered left and right gather maps. + + Notes + ----- + For a left join, the polars result preserves the order of the + left keys, and is stable wrt the right keys. 
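+        For example, left-joining left keys [10, 20, 30] against right keys [30, 10, 10] must emit left rows 0, 1, 2 in that order, with the two matches for key 10 kept in their original right-hand order.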
For all other + joins, there is no order obligation. + """ + dt = plc.interop.to_arrow(plc.types.SIZE_TYPE) + init = plc.interop.from_arrow(pa.scalar(0, type=dt)) + step = plc.interop.from_arrow(pa.scalar(1, type=dt)) + left_order = plc.copying.gather( + plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy + ) + right_order = plc.copying.gather( + plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy + ) + return plc.sorting.stable_sort_by_key( + plc.Table([lg, rg]), + plc.Table([*left_order.columns(), *right_order.columns()]), + [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ).columns() + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -693,6 +746,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) + if how == "left": + # Order of left table is preserved + lg, rg = self._reorder_maps( + left.num_rows, lg, left_policy, right.num_rows, rg, right_policy + ) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 89f6fd3455b..1ffbf3c0ef4 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -53,7 +53,7 @@ def test_join(how, coalesce, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") def test_cross_join(): From cb570fe6d7dc7ebdd6c8c030916ba27bef277b5e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:45:30 -1000 Subject: [PATCH 089/101] Deprecate dtype= parameter in reduction methods (#16313) In terms of pandas alignment, this argument doesn't exist in reduction ops. Additionally, the same result can be easily achieved by calling `astype` after the operation, and it appears libcudf does not support any arbitrary casting to an output type. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16313 --- python/cudf/cudf/_lib/reduce.pyx | 15 ++++++++++----- python/cudf/cudf/core/column/column.py | 11 ++++++++--- python/cudf/cudf/core/column/datetime.py | 9 +++------ python/cudf/cudf/core/column/numerical.py | 17 +++++++++-------- python/cudf/cudf/core/column/numerical_base.py | 11 +++-------- python/cudf/cudf/core/column/timedelta.py | 7 +++---- python/cudf/cudf/tests/test_reductions.py | 15 +++++++++------ 7 files changed, 45 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 56bfa0ba332..64634b7a6f9 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
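+# Illustrative migration (an editor's sketch, not code from this patch): instead of ser.sum(dtype=np.int8), compute np.int8(ser.sum()); passing dtype= still works for now but emits the FutureWarning below.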
+import warnings import cudf from cudf.core.buffer import acquire_spill_lock @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - - col_dtype = ( - dtype if dtype is not None - else incol._reduction_result_dtype(reduction_op) - ) + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be removed in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning + ) + col_dtype = dtype + else: + col_dtype = incol._reduction_result_dtype(reduction_op) # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9467bbeed15..5e77aa87e4e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: if self.has_nulls(): @@ -1305,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 004a059af95..a4538179415 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -485,13 +485,11 @@ def as_string_column(self) -> cudf.core.column.StringColumn: format = format.split(" ")[0] return self.strftime(format) - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -499,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cea68c88c90..ba080863722 100644 --- a/python/cudf/cudf/core/column/numerical.py +++
b/python/cudf/cudf/core/column/numerical.py @@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -684,15 +684,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares": - col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) + return np.result_type(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64") - return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -144,32 +144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 36d7d9f9614..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -345,12 +345,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git
a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 @@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8) From 3df4ac28423b99e4dd88570da8d55e2e5af2e1bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:46:18 -1000 Subject: [PATCH 090/101] Remove squeeze argument from groupby (#16312) In pandas, this argument was deprecated in pandas 1.x and removed in pandas 2.x. xref https://github.com/pandas-dev/pandas/pull/33218 Looks like in cudf this argument was never implemented, so to align with pandas, I think it should be OK to just remove this argument Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16312 --- python/cudf/cudf/core/dataframe.py | 2 -- python/cudf/cudf/core/indexed_frame.py | 6 ------ python/cudf/cudf/core/series.py | 2 -- 3 files changed, 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f06e45277e2..8f8baec0af4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4306,7 +4306,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -4317,7 +4316,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 77675edc0f0..576596f6f7d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5249,7 +5249,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -5259,11 +5258,6 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if squeeze is not False: - raise NotImplementedError( - "squeeze parameter is not yet implemented" - ) - if not observed: raise NotImplementedError( "observed parameter is not yet implemented" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c9d24890d15..baaa2eb46a1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3368,7 +3368,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -3379,7 +3378,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) From 18f5fe0010fd42f604a340cd025a9ca9e122c6f5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:41:39 -0700 Subject: [PATCH 
091/101] Fix polars for 1.2.1 (#16316) I think Polars made a breaking change in a patch release. At least the error we're getting looks like the error from https://github.com/pola-rs/polars/pull/17606. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16316 --- python/cudf_polars/cudf_polars/utils/versions.py | 1 + python/cudf_polars/tests/test_groupby.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index a9ac14c25aa..9807cffb384 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -15,6 +15,7 @@ POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b650fee5079..a75825ef3d3 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -157,7 +157,18 @@ def test_groupby_nan_minmax_raises(op): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.parametrize("key", [1, pl.col("key1")]) +@pytest.mark.parametrize( + "key", + [ + pytest.param( + 1, + marks=pytest.mark.xfail( + versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" + ), + ), + pl.col("key1"), + ], +) @pytest.mark.parametrize( "expr", [ From fa0d89d9b4b4152b919999b5f01b1e68407469c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:46:28 -1000 Subject: [PATCH 092/101] Clean unneeded/redundant dtype utils (#16309) * Replace `min_scalar_type` with `min_signed_type` (the former just called the latter) * Replace `numeric_normalize_types` with `find_common_type` followed by a column `astype` * Removed `_NUMPY_SCTYPES`, instead hardcoding the integer/floating types or using `np.integer`/`np.floating` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16309 --- python/cudf/cudf/core/column/column.py | 6 +++--- python/cudf/cudf/core/column/numerical.py | 12 +++++++---- python/cudf/cudf/core/dataframe.py | 22 ++++--------------- python/cudf/cudf/core/index.py | 22 +++++++++---------- python/cudf/cudf/utils/dtypes.py | 26 ++++++----------------- 5 files changed, 32 insertions(+), 56 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5e77aa87e4e..89f0f79cb7c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -71,7 +71,7 @@ get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -1356,7 +1356,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None
= None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1396,7 +1396,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ba080863722..b55284f1aff 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -29,10 +29,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -517,11 +517,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8f8baec0af4..904bd4ccb2e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -83,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -103,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. @@ -8304,7 +8290,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. 
The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4164f981fca..cd52a34e35e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -357,12 +355,10 @@ def _data(self): @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -1601,9 +1597,13 @@ def append(self, other): f"either one of them to same dtypes." ) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index af912bee342..69c268db149 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -89,10 +89,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -330,32 +320,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: 
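# For example, min_signed_type(300) skips int8 (max 127) and returns np.dtype("int16"); the np.dtype(...) wrapper below guarantees callers a dtype instance rather than a bare scalar type.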
if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype From 910989eb8fb87b2e896aa032260705c27cce71e0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jul 2024 15:48:37 -0600 Subject: [PATCH 093/101] Rename gather/scatter benchmarks to clarify coalesced behavior. (#16083) The benchmark names `coalesce_x` and `coalesce_o` are not very clear. This PR renames them to `coalesced` and `shuffled`. This was discussed with @GregoryKimball. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16083 --- cpp/benchmarks/copying/gather.cu | 6 +++--- cpp/benchmarks/copying/scatter.cu | 6 +++--- cpp/benchmarks/lists/copying/scatter_lists.cu | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index eeb0149fb3a..985166f7298 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state) ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ ->UseManualTime(); -GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +GBM_BENCHMARK_DEFINE(double_coalesced, double, true); +GBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a521dc82739..c27480b69f4 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {1, 8}}) \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index dbc3234dabf..570decf410f 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true); -SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); From 6e37afc7c9e177b307c41950e52453bd5906af44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:52:27 -1000 Subject: [PATCH 094/101] Make __bool__ raise for more cudf objects (#16311) To match pandas, this PR makes `DataFrame`, `MultiIndex` and `RangeIndex` raise on `__bool__`. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16311 --- python/cudf/cudf/core/_base_index.py | 6 ++++++ python/cudf/cudf/core/frame.py | 6 ++++++ python/cudf/cudf/core/single_column_frame.py | 6 ------ python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ python/cudf/cudf/tests/test_index.py | 9 +++++++++ python/cudf/cudf/tests/test_multiindex.py | 9 +++++++++ 7 files changed, 40 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 479f87bb78b..657acc41b18 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -62,6 +62,12 @@ def copy(self, deep: bool = True) -> Self: def __len__(self): raise NotImplementedError + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + @property def size(self): # The size of an index is always its length irrespective of dimension. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 111225a5fc2..e3a2e840902 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1587,6 +1587,12 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + # Reductions @classmethod @_performance_tracking diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 04c7db7a53c..7efe13d9b45 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -91,12 +91,6 @@ def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - @property # type: ignore @_performance_tracking def _num_columns(self) -> int: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index a22a627523f..0525b02b698 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1617,7 +1617,7 @@ def test_csv_reader_partial_dtype(dtype): StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] ) - assert names_df == header_df + assert_eq(names_df, header_df) assert all(names_df.dtypes == ["int16", "int64"]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2009fc49ce5..53ed5d728cb 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11100,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy(): data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " Date: Fri, 19 Jul 2024 11:55:40 -1000 Subject: [PATCH 095/101] Align more DataFrame APIs with pandas (#16310) I have a script that did some signature comparisons between the `pandas.DataFrame` and `cudf.DataFrame` APIs, and it appears some signatures have changed between the pandas 1.x and 2.x releases. The API changes in this PR are mostly adding implementations or adding missing keyword arguments (although they might not be implemented). The APIs affected are: * `__init__` * `__array__` * `__arrow_c_stream__` * `to_dict` * `where` * `add_prefix` * `join` * `apply` * `to_records` * `from_records` * `unstack` * `pct_change` * `sort_values` Marking as breaking since I ensured some added keywords are in the same positions as in pandas and therefore might break users who are using purely positional arguments. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16310 --- python/cudf/cudf/core/dataframe.py | 169 +++++++++++++++++++++++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 13 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 32 ++++- 5 files changed, 202 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 904bd4ccb2e..7e07078c95b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -680,8 +683,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -1524,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule.
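+        This implements the Arrow PyCapsule stream protocol, so stream consumers (for example, ``pa.table(df)`` with a recent pyarrow) can ingest the frame; as the implementation below shows, the export goes through ``to_arrow()``.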
+ + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2235,6 +2260,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2268,6 +2294,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). + Returns ------- dict, list or collections.abc.Mapping @@ -2349,7 +2382,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -3004,7 +3037,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3614,7 +3652,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -4230,6 +4270,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4243,6 +4284,16 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. 
Returns ------- @@ -4256,6 +4307,10 @@ def join( """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4404,7 +4459,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4432,6 +4496,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. Examples -------- @@ -4582,13 +4665,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs) @@ -5489,7 +5576,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table. @@ -5579,18 +5666,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." 
+ ) + elif index_dtypes is not None: + raise NotImplementedError( + "index_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members) @@ -5603,7 +5708,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame. @@ -5613,13 +5727,32 @@ def from_records( index : str, array-like The name of the index column in *data*. If None, the default index is used. + exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """ + if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." + ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" ) @@ -7344,9 +7477,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking @@ -7392,7 +7525,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -7417,6 +7555,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`.
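+            After any requested fill, the result is equivalent to ``data.diff(periods=periods) / data.shift(periods=periods)``, as the implementation below shows.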
Returns ------- @@ -7462,7 +7603,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e3a2e840902..c82e073d7b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 576596f6f7d..60cd142db4b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3464,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3479,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -3518,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
+ Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index baaa2eb46a1..b1e63806934 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2063,6 +2063,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2076,6 +2077,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2107,6 +2116,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -3429,7 +3439,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3527,7 +3539,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3552,6 +3569,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. Returns ------- @@ -3596,11 +3616,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self( From 57ed7fce6742abc96a8fd65216f032bad5937a2f Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:24:55 -0500 Subject: [PATCH 096/101] Add tests for `pylibcudf` binaryops (#15470) This PR implements a more general approach to testing binaryops that originally came up in https://github.com/rapidsai/cudf/pull/15279. This PR can possibly supersede that one. 
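(Sketch of the approach, with call shapes assumed for illustration: for each operator, compute a host-side expected result, e.g. via pyarrow compute, compare it against plc.binaryop.binary_operation(lhs, rhs, op, output_type), and use the newly exposed is_supported_operation to decide which type combinations should raise instead.)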
Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15470 --- cpp/include/cudf/binaryop.hpp | 11 + cpp/src/binaryop/binaryop.cpp | 7 +- .../binaryop/binop-verify-input-test.cpp | 4 +- python/cudf/cudf/_lib/pylibcudf/binaryop.pxd | 9 + python/cudf/cudf/_lib/pylibcudf/binaryop.pyx | 35 + .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd | 39 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 10 + .../cudf/pylibcudf_tests/test_binaryops.py | 786 ++++++++++++++++++ 8 files changed, 889 insertions(+), 12 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_binaryops.py diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 22dad11e109..c74c91e39c2 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, namespace binops { +/** + * @brief Returns true if the binary operator is supported for the given input types. + * + * @param out The output data type + * @param lhs The left-hand cudf::data_type + * @param rhs The right-hand cudf::data_type + * @param op The binary operator + * @return true if the binary operator is supported for the given input types + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + /** * @brief Computes output valid mask for op between a column and a scalar * diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 8ac1491547d..3ac8547baad 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -50,6 +50,11 @@ namespace cudf { namespace binops { +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op); +} + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -194,7 +199,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and output_type.id() == type_id::STRING and diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 1346dcd4666..def6e94452e 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) EXPECT_THROW(cudf::binary_operation( lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)), - cudf::logic_error); + std::invalid_argument); } diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd index 9a8c8e49dcf..2411e28ac66 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
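+# Illustration (hypothetical values): is_supported_operation(DataType(type_id.INT32), DataType(type_id.INT32), DataType(type_id.INT32), binary_operator.ADD) should return True, since integer ADD is a supported libcudf binary operation.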
+from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator from .column cimport Column @@ -22,3 +24,10 @@ cpdef Column binary_operation( binary_operator op, DataType output_type ) + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +) diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx index c1d669c3c1c..44d9f4ad04a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx @@ -2,6 +2,7 @@ from cython.operator import dereference +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -84,3 +85,37 @@ cpdef Column binary_operation( raise ValueError(f"Invalid arguments {lhs} and {rhs}") return Column.from_libcudf(move(result)) + + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +): + """Check if an operation is supported for the given data types. + + For details, see :cpp:func::is_supported_operation`. + + Parameters + ---------- + out : DataType + The output data type. + lhs : DataType + The left hand side data type. + rhs : DataType + The right hand side data type. + op : BinaryOperator + The operation to check. + Returns + ------- + bool + True if the operation is supported, False otherwise + """ + + return cpp_binaryop.is_supported_operation( + out.c_obj, + lhs.c_obj, + rhs.c_obj, + op + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index 0eda7d34ff9..b34fea6a775 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.exception_handler cimport cudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -19,9 +21,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: TRUE_DIV FLOOR_DIV MOD + PMOD PYMOD POW INT_POW + LOG_BASE + ATAN2 + SHIFT_LEFT + SHIFT_RIGHT + SHIFT_RIGHT_UNSIGNED + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR EQUAL NOT_EQUAL LESS @@ -29,38 +42,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: LESS_EQUAL GREATER_EQUAL NULL_EQUALS + NULL_MAX + NULL_MIN NULL_NOT_EQUALS - BITWISE_AND - BITWISE_OR - BITWISE_XOR - LOGICAL_AND - LOGICAL_OR GENERIC_BINARY + NULL_LOGICAL_AND + NULL_LOGICAL_OR + INVALID_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except + + ) except +cudf_exception_handler + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: + cdef bool is_supported_operation( + data_type output_type, + data_type lhs_type, + data_type rhs_type, + binary_operator op + ) except +cudf_exception_handler diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e029edfa2ed..ed2c5ca06c9 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -111,6 +111,16 @@ def _make_fields_nullable(typ): lhs = rhs.cast(lhs_type) if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): + lhs_nans = pa.compute.is_nan(lhs) + rhs_nans = pa.compute.is_nan(rhs) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lhs = lhs.filter(mask) + rhs = rhs.filter(mask) + np.testing.assert_array_almost_equal(lhs, rhs) else: assert lhs.equals(rhs) diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py new file mode 100644 index 00000000000..a83caf39ead --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py @@ -0,0 +1,786 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import math + +import numpy as np +import pyarrow as pa +import pytest +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def idfn(param): + ltype, rtype, outtype, plc_op, _ = param + params = (plc_op.name, ltype, rtype, outtype) + return "-".join(map(str, params)) + + +@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"]) +def nulls(request): + return request.param + + +def make_col(dtype, nulls): + if dtype == "int64": + data = [1, 2, 3, 4, 5] + pa_type = pa.int64() + elif dtype == "uint64": + data = [1, 2, 3, 4, 5] + pa_type = pa.uint64() + elif dtype == "float64": + data = [1.0, 2.0, 3.0, 4.0, 5.0] + pa_type = pa.float64() + elif dtype == "bool": + data = [True, False, True, False, True] + pa_type = pa.bool_() + elif dtype == "timestamp64[ns]": + data = [ + np.datetime64("2022-01-01"), + np.datetime64("2022-01-02"), + np.datetime64("2022-01-03"), + np.datetime64("2022-01-04"), + np.datetime64("2022-01-05"), + ] + pa_type = pa.timestamp("ns") + elif dtype == "timedelta64[ns]": + data = [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + pa_type = pa.duration("ns") + else: + raise ValueError("Unsupported dtype") + + if nulls: + data[3] = None + + return pa.array(data, type=pa_type) + + +@pytest.fixture +def pa_data(request, nulls): + ltype, rtype, outtype = request.param + values = make_col(ltype, nulls), make_col(rtype, nulls), outtype + return values + + +@pytest.fixture +def plc_data(pa_data): + lhs, rhs, outtype = pa_data + return ( + plc.interop.from_arrow(lhs), + plc.interop.from_arrow(rhs), + plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))), + ) + + +@pytest.fixture +def tests(request, nulls): + ltype, rtype, py_outtype, plc_op, py_op = request.param + pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls) + plc_lhs, plc_rhs = ( + plc.interop.from_arrow(pa_lhs), + plc.interop.from_arrow(pa_rhs), + ) + plc_dtype = plc.interop.from_arrow( + pa.from_numpy_dtype(np.dtype(py_outtype)) + ) + return ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_dtype, + py_op, + plc_op, + ) + + +def custom_pyop(func): + def wrapper(x, y): + x = x.to_pylist() + y = y.to_pylist() + + def inner(x, y): + if x is None or y is None: + return None + return func(x, y) + + return pa.array([inner(x, y) for x, y in zip(x, y)]) + + return wrapper + + +@custom_pyop +def py_floordiv(x, y): + return x // y + + +@custom_pyop +def py_pmod(x, y): + return (x % y + y) % y + + +@custom_pyop +def py_mod(x, y): + return x % y + + +@custom_pyop +def py_atan2(x, y): + return math.atan2(x, y) + + +@custom_pyop +def py_shift_right_unsigned(x, y): + unsigned_x = np.uint32(x) + result = unsigned_x >> y + return result + + +@pytest.mark.parametrize( + "tests", + [ + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + 
), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + 
"int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "float64", + 
"float64", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.GENERIC_BINARY, + None, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INVALID_BINARY, + None, + ), + ], + indirect=True, + ids=idfn, +) +def test_binaryops(tests): + ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_outtype, + py_op, + plc_op, + ) = tests + + def get_result(): + return plc.binaryop.binary_operation( + plc_lhs, + plc_rhs, + plc_op, + plc_outtype, + ) + + if not plc.binaryop.is_supported_operation( + plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op + ): + with pytest.raises(TypeError): + get_result() + else: + expect = py_op(pa_lhs, pa_rhs).cast(py_outtype) + got = get_result() + assert_column_eq(expect, got) From 7d3083254c0503b07f82af32188120f42acef860 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:48:39 -1000 Subject: [PATCH 097/101] Replace np.isscalar/issubdtype checks with is_scalar/.kind checks (#16275) * `is_scalar` also handles cudf.Scalars which should be handled internally * `issubdtype` can largely be replaced by checking the `.kind` attribute on the dtype Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16275 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/column.py | 10 +++---- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/lists.py | 9 ++++--- python/cudf/cudf/core/column/numerical.py | 28 +++++++------------- python/cudf/cudf/core/join/_join_helpers.py | 29 ++++++--------------- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/testing/testing.py | 10 +++---- python/cudf/cudf/utils/dtypes.py | 4 +-- 9 files changed, 37 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6003a0f6aea..18ab32d2c9e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ 
b/python/cudf/cudf/core/_internals/where.py @@ -47,7 +47,7 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if isinstance(other, float) and not np.isnan(other): + if isinstance(other, (float, np.floating)) and not np.isnan(other): try: is_safe = source_dtype.type(other) == other except OverflowError: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 89f0f79cb7c..da735c22c52 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1458,9 +1458,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2312,9 +2313,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a4538179415..73902789c11 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -639,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 46b844413f7..1b7cd95b3d0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b55284f1aff..5e07bbab40c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -225,25 +225,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. 
if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) - and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) - ) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__", diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,25 @@ def _match_join_keys( ) if ( - np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." 
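For reference, the `.kind` codes relied on in these hunks are NumPy's standard one-character type codes, so the rewrite is behavior-preserving once a concrete dtype object is in hand. A small self-contained check of the correspondence:

```python
import numpy as np

# NumPy one-character kind codes used throughout this patch.
pairs = [
    (np.dtype("int64"), "i"),           # signed integer
    (np.dtype("uint32"), "u"),          # unsigned integer
    (np.dtype("float64"), "f"),         # floating point
    (np.dtype("bool"), "b"),            # boolean
    (np.dtype("datetime64[ns]"), "M"),  # datetime
    (np.dtype("timedelta64[ns]"), "m"), # timedelta
]
for dtype, kind in pairs:
    assert dtype.kind == kind

# The old and new spellings agree for concrete dtype objects, e.g.:
dt = np.dtype("datetime64[ns]")
assert np.issubdtype(dt, np.datetime64) == (dt.kind == "M")
```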
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b1e63806934..eb077179562 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -213,7 +213,7 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 69c268db149..c0de5274742 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -359,10 +359,10 @@ def min_column_type(x, expected_type): if x.null_count == len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) From 4c46628eaf7ba16a2a181ceb3311f315cd4932dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:51:07 -1000 Subject: [PATCH 098/101] Mark cudf._typing as a typing module in ruff (#16318) Additionally breaks up the prior, single-line of `select` rules that are enabled. 
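Declaring `cudf._typing` a typing module lets ruff's flake8-type-checking (TCH) rules treat imports from it as annotation-only, so they can be confined to a `TYPE_CHECKING` block and skipped at runtime — exactly the pattern the `resample.py` hunk below applies. In miniature (the `resample` function here is illustrative, not real cudf API):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Needed only by the type checker; never imported at runtime.
    from cudf._typing import DataFrameOrSeries


def resample(obj: DataFrameOrSeries) -> DataFrameOrSeries:
    # With `from __future__ import annotations`, annotations are strings
    # at runtime, so the guarded import above is sufficient.
    return obj
```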
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16318 --- pyproject.toml | 64 ++++++++++++++++++++++++++++++- python/cudf/cudf/core/resample.py | 6 ++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f59864894b..e15cb7b3cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,69 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] +typing-modules = ["cudf._typing"] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # no-blank-line-before-function + "D201", + # one-blank-line-after-class + "D204", + # indent-with-spaces + "D206", + # under-indentation + "D207", + # over-indentation + "D208", + # new-line-after-last-paragraph + "D209", + # surrounding-whitespace + "D210", + # blank-line-before-class + "D211", + # section-not-over-indented + "D214", + # section-underline-not-over-indented + "D215", + # triple-single-quotes + "D300", + # escape-sequence-in-docstring + "D301", + # first-line-capitalized + "D403", + # capitalize-section-name + "D405", + # new-line-after-section-name + "D406", + # dashed-underline-after-section + "D407", + # section-underline-after-name + "D408", + # section-underline-matches-section-length + "D409", + # no-blank-line-after-section + "D410", + # no-blank-line-before-section + "D411", + # blank-lines-between-header-and-content + "D412", + # empty-docstring-section + "D414", + # overload-with-docstring + "D418", + # flake8-type-checking + "TCH", + # flake8-future-annotations + "FA", + # non-pep585-annotation + "UP006", + # non-pep604-annotation + "UP007" +] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index cdd4ec6f8e5..4e0c5bd86b9 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -13,9 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations import pickle import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,7 +25,6 @@ import cudf import cudf._lib.labeling import cudf.core.index -from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -31,6 +32,9 @@ _Grouping, ) +if TYPE_CHECKING: + from cudf._typing import DataFrameOrSeries + class _Resampler(GroupBy): grouping: "_ResampleGrouping" From 5dde41d7f7533180ecd355bac248a7ed18adcc10 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:08:36 -1000 Subject: [PATCH 099/101] Replace is_float/integer_dtype checks with .kind checks (#16261) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16261 --- python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 19 +++---------- python/cudf/cudf/core/column/column.py | 29 ++++++++++---------- python/cudf/cudf/core/column/decimal.py | 4 +-- python/cudf/cudf/core/column/numerical.py | 13 +++------ python/cudf/cudf/core/index.py | 13 +++++---- python/cudf/cudf/core/indexing_utils.py | 8 ++---- python/cudf/cudf/core/series.py | 7 ++--- python/cudf/cudf/core/single_column_frame.py | 3 +- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/utils/dtypes.py | 28 +++++++++---------- 11 files changed, 52 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index d97e9c815b6..294ae2fd985 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -90,7 +90,7 @@ def is_integer(obj): bool """ if isinstance(obj, cudf.Scalar): - return pd.api.types.is_integer_dtype(obj.dtype) + return obj.dtype.kind in "iu" return pd.api.types.is_integer(obj) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 657acc41b18..c38352009de 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,14 +19,7 @@ ) from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import ( - is_integer, - is_integer_dtype, - is_list_like, - is_scalar, - is_signed_integer_dtype, - is_unsigned_integer_dtype, -) +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.errors import MixedTypeError @@ -621,12 +614,8 @@ def union(self, other, sort=None): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. raise MixedTypeError("Cannot perform union with mixed types") - if ( - is_signed_integer_dtype(self.dtype) - and is_unsigned_integer_dtype(other.dtype) - ) or ( - is_unsigned_integer_dtype(self.dtype) - and is_signed_integer_dtype(other.dtype) + if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( + self.dtype.kind == "u" and other.dtype.kind == "i" ): # signed + unsigned types will result in # mixed type for union in pandas. @@ -2103,7 +2092,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. 
- if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index da735c22c52..32e6aade65b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2219,25 +2219,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + cudf.dtype(arbitrary.type.to_pandas_dtype()) + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index a63055ed527..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -150,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." 
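The `normalize_binop_value` change keeps the existing restriction — decimal columns may only be combined with integer numerical columns — but tests it with the dtype's `kind` instead of `is_integer_dtype`. A quick sketch of the user-visible behavior this guards (the series values are illustrative):

```python
from decimal import Decimal

import cudf

dec = cudf.Series([Decimal("1.23"), Decimal("4.56")])

dec + cudf.Series([1, 2])  # integer column: allowed
try:
    dec + cudf.Series([1.0, 2.0])  # float column: rejected
except TypeError as err:
    # "Decimal columns only support binary operations with
    #  integer numerical columns."
    print(err)
```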
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5e07bbab40c..f9404eb3b40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,12 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import ( - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -249,7 +244,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " @@ -260,8 +255,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd52a34e35e..ae20fcd5d9c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1456,18 +1456,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 9c81b0eb607..a0089242909 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -8,11 +8,7 @@ from typing_extensions import TypeAlias import cudf -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_integer, - is_integer_dtype, -) +from cudf.api.types import _is_scalar_or_zero_d_array, is_integer from cudf.core.copy_types import BooleanMask, GatherMap @@ -233,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index eb077179562..d8dbaa897e7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -24,7 +24,6 @@ _is_scalar_or_zero_d_array, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -356,12 +355,10 @@ def _loc_to_iloc(self, arg): ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to 
cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7efe13d9b45..b93528f9693 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,7 +12,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -352,7 +351,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 53ed5d728cb..e2ce5c03b70 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10833,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. expectation = contains is np.nan and (name is None or name is np.nan) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c0de5274742..b0788bcc0fc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -11,6 +13,9 @@ import cudf +if TYPE_CHECKING: + from cudf._typing import DtypeObj + """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special handling is required when converting a Boolean column into arrow. @@ -568,25 +573,18 @@ def _dtype_pandas_compatible(dtype): return dtype -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. 
""" - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype From e169e8e4273e4d317e3f27c810c5b137dd75adb3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:36:03 -0700 Subject: [PATCH 100/101] Implement read_csv in cudf-polars using pylibcudf (#16307) Replace cudf-classic with pylibcudf for CSV reading in cudf-polars Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16307 --- python/cudf_polars/cudf_polars/dsl/ir.py | 50 ++++++++++++------------ python/cudf_polars/tests/test_scan.py | 38 ++++++++++++++++++ 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 0b14530e0ed..a84fe73810e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -242,10 +242,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - dtype_map = { - name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] - for name, typ in self.schema.items() - } parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) quote = chr(parse_options["quote_char"]) @@ -280,31 +276,37 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pieces = [] for p in self.paths: skiprows = self.reader_options["skip_rows"] - # TODO: read_csv expands globs which we should not do, - # because polars will already have handled them. 
path = Path(p) with path.open() as f: while f.readline() == "\n": skiprows += 1 - pieces.append( - cudf.read_csv( - path, - sep=sep, - quotechar=quote, - lineterminator=eol, - names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtype=dtype_map, - ) + tbl_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([path]), + delimiter=sep, + quotechar=quote, + lineterminator=eol, + col_names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtypes=self.schema, + ) + pieces.append(tbl_w_meta) + tables, colnames = zip( + *( + (piece.tbl, piece.column_names(include_children=False)) + for piece in pieces ) - df = DataFrame.from_cudf(cudf.concat(pieces)) + ) + df = DataFrame.from_table( + plc.concatenate.concatenate(list(tables)), + colnames[0], + ) elif self.typ == "parquet": cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index d0c41090433..0981a96a34a 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os + import pytest import polars as pl @@ -129,6 +131,42 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): assert_gpu_result_equal(q) +@pytest.mark.parametrize( + "filename,glob", + [ + (["test1.csv", "test2.csv"], True), + ("test*.csv", True), + # Make sure we don't expand glob when + # trying to read a file like test*.csv + # when glob=False + ("test*.csv", False), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob): + with (tmp_path / "test1.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test2.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test*.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + os.chdir(tmp_path) + q = pl.scan_csv(filename, glob=glob) + + assert_gpu_result_equal(q) + + +def test_scan_csv_multi_differing_colnames(tmp_path): + with (tmp_path / "test1.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test2.csv").open("w") as f: + f.write("""abc,def,ghi\n1,2\n3,4,5""") + q = pl.scan_csv( + [tmp_path / "test1.csv", tmp_path / "test2.csv"], + ) + with pytest.raises(pl.exceptions.ComputeError): + q.explain() + + def test_scan_csv_skip_after_header_not_implemented(tmp_path): with (tmp_path / "test.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") From 535db9b26ed1a57e4275f4a6f11b04ebeee21248 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:28:14 -0700 Subject: [PATCH 101/101] Deprecate Arrow support in I/O (#16132) Contributes to https://github.com/rapidsai/cudf/issues/15193 Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16132 --- .../cudf/_lib/pylibcudf/io/datasource.pyx | 10 +- python/cudf/cudf/io/csv.py | 2 +- python/cudf/cudf/io/orc.py | 33 +++-- python/cudf/cudf/io/parquet.py | 40 ++++-- 
.../io/test_source_sink_info.py | 21 +-- python/cudf/cudf/tests/test_csv.py | 5 +- python/cudf/cudf/tests/test_gcs.py | 3 +- python/cudf/cudf/tests/test_parquet.py | 19 +-- python/cudf/cudf/tests/test_s3.py | 136 ++++++++++-------- python/cudf/cudf/utils/ioutils.py | 78 ++++++++-- python/cudf/cudf/utils/utils.py | 26 ++++ .../dask_cudf/dask_cudf/io/tests/test_s3.py | 6 +- 12 files changed, 247 insertions(+), 132 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx index aa7fa0efdaf..8f265f585de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx @@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +import warnings + cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: @@ -16,10 +18,16 @@ cdef class Datasource: cdef class NativeFileDatasource(Datasource): - def __cinit__(self, NativeFile native_file,): + def __cinit__(self, NativeFile native_file): cdef shared_ptr[CRandomAccessFile] ra_src + warnings.warn( + "Support for reading pyarrow's NativeFile is deprecated " + "and will be removed in a future release of cudf.", + FutureWarning, + ) + ra_src = native_file.get_random_access_file() self.c_datasource.reset(new arrow_io_source(ra_src)) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e909d96309e..0f2820a01e9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -50,7 +50,7 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 7082a85237a..289292b5182 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,6 +10,7 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils +from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -280,7 +281,7 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -320,6 +321,9 @@ def read_orc( ) filepaths_or_buffers = [] + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -360,17 +364,24 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + return DataFrame._from_data( + *liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, + ) ) - ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 02b26ea1c01..0f0a240b5d0 100644 --- 
a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -23,6 +24,7 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=True, + use_python_file_object=None, open_file_options=None, storage_options=None, bytes_per_thread=None, @@ -532,7 +534,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=True, + use_python_file_object=None, categorical_partitions=True, open_file_options=None, bytes_per_thread=None, @@ -615,6 +617,9 @@ def read_parquet( row_groups=row_groups, fs=fs, ) + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, @@ -662,19 +667,26 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - **kwargs, - ) + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if projected_columns: diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 287dd8f21c8..438c482b77a 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ -2,11 +2,9 @@ import io -import pyarrow as pa import pytest import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) @@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink): """ Skip invalid sinks for SinkInfo """ - if io_class is plc.io.SinkInfo and isinstance( - sink, (bytes, NativeFileDatasource) - ): - pytest.skip(f"{sink} is not a valid input for SinkInfo") + if io_class is plc.io.SinkInfo and isinstance(sink, bytes): + pytest.skip("bytes is not a valid input for SinkInfo") @pytest.mark.parametrize( @@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink): "a.txt", b"hello world", io.BytesIO(b"hello world"), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) def test_source_info_ctor(io_class, source, tmp_path): @@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path): 
@pytest.mark.parametrize( "sources", [ + ["a.txt"], + [b"hello world"], + [io.BytesIO(b"hello world")], ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - ], ], ) def test_source_info_ctor_multiple(io_class, sources, tmp_path): @@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - "awef.txt", - b"hello world", - ], ], ) def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0525b02b698..6a21cb1b9d7 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf): # Arrow FileSystem interface expect = cudf.read_csv(path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(path) as fil: + got = cudf.read_csv(fil) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index fc22d8bc0ea..28fdfb5c2f1 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -46,7 +46,8 @@ def mock_size(*args): # use_python_file_object=True, because the pyarrow # `open_input_file` command will fail (since it doesn't # use the monkey-patched `open` definition) - got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) + with pytest.warns(FutureWarning): + got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) assert_eq(pdf, got) # AbstractBufferedFile -> PythonFile conversion diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ecb7fd44422..f2820d9c112 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) with fs.open_input_file(path) as fil: - got = cudf.read_parquet(fil) + with pytest.warns(FutureWarning): + got = cudf.read_parquet(fil) assert_eq(expect, got) @@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object( fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) # Pass open fsspec file - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index a44bf791767..3ae318d3bf5 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -138,22 +138,24 @@ def 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index a44bf791767..3ae318d3bf5 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     buffer = pdf.to_csv(index=False)
 
     # Use fsspec file object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            use_python_file_object=False,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                use_python_file_object=False,
+            )
 
     assert_eq(pdf, got)
 
     # Use Arrow PythonFile object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            use_python_file_object=True,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
 
     assert_eq(pdf, got)
 
@@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
     fs = pa_fs.S3FileSystem(
         endpoint_override=s3so["client_kwargs"]["endpoint_url"],
     )
-    with fs.open_input_file(f"{bucket}/{fname}") as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(f"{bucket}/{fname}") as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(pdf, got)
 
@@ -184,17 +187,18 @@ def test_read_csv_byte_range(
 
     # Use fsspec file object
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            byte_range=(74, 73),
-            bytes_per_thread=bytes_per_thread
-            if not use_python_file_object
-            else None,
-            header=None,
-            names=["Integer", "Float", "Integer2", "String", "Boolean"],
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                byte_range=(74, 73),
+                bytes_per_thread=bytes_per_thread
+                if not use_python_file_object
+                else None,
+                header=None,
+                names=["Integer", "Float", "Integer2", "String", "Boolean"],
+                use_python_file_object=use_python_file_object,
+            )
 
     assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
 
@@ -241,18 +245,19 @@ def test_read_parquet(
     # Check direct path handling
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got1 = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            open_file_options=(
-                {"precache_options": {"method": precache}}
-                if use_python_file_object
-                else None
-            ),
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            columns=columns,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got1 = cudf.read_parquet(
+                f"s3://{bucket}/{fname}",
+                open_file_options=(
+                    {"precache_options": {"method": precache}}
+                    if use_python_file_object
+                    else None
+                ),
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                columns=columns,
+                use_python_file_object=use_python_file_object,
+            )
 
     expect = pdf[columns] if columns else pdf
     assert_eq(expect, got1)
@@ -263,12 +268,13 @@
         f"s3://{bucket}/{fname}", storage_options=s3so
     )[0]
     with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f:
-        got2 = cudf.read_parquet(
-            f,
-            bytes_per_thread=bytes_per_thread,
-            columns=columns,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got2 = cudf.read_parquet(
+                f,
+                bytes_per_thread=bytes_per_thread,
+                columns=columns,
+                use_python_file_object=use_python_file_object,
+            )
 
     assert_eq(expect, got2)
 
@@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
     pdf.to_parquet(path=buffer)
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        fs = pa_fs.S3FileSystem(
-            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
-        )
-        with fs.open_input_file(f"{bucket}/{fname}") as fil:
-            got = cudf.read_parquet(fil, columns=columns)
+        with pytest.warns(FutureWarning):
+            fs = pa_fs.S3FileSystem(
+                endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+            )
+            with fs.open_input_file(f"{bucket}/{fname}") as fil:
+                got = cudf.read_parquet(fil, columns=columns)
 
     expect = pdf[columns] if columns else pdf
     assert_eq(expect, got)
@@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):
     buffer.seek(0)
     filters = [("String", "==", "Omega")]
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            filters=filters,
-            open_file_options={"precache_options": {"method": precache}},
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_parquet(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                filters=filters,
+                open_file_options={"precache_options": {"method": precache}},
+            )
 
     # All row-groups should be filtered out
     assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True))
 
@@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns):
         buffer = f.read()
 
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_orc(
-            f"s3://{bucket}/{fname}",
-            columns=columns,
-            storage_options=s3so,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_orc(
+                f"s3://{bucket}/{fname}",
+                columns=columns,
+                storage_options=s3so,
+                use_python_file_object=use_python_file_object,
+            )
 
     if columns:
         expect = expect[columns]
@@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
         fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
        )
-        with fs.open_input_file(f"{bucket}/{fname}") as fil:
-            got = cudf.read_orc(fil, columns=columns)
+        with pytest.warns(FutureWarning):
+            with fs.open_input_file(f"{bucket}/{fname}") as fil:
+                got = cudf.read_orc(fil, columns=columns)
 
     if columns:
         expect = expect[columns]
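The test updates above all follow one recipe: the call path that now emits the deprecation is wrapped in pytest.warns(FutureWarning), which both fails the test if the warning ever disappears and keeps it out of the captured output. A self-contained sketch of the recipe (deprecated_read is a hypothetical stand-in for the cudf readers, not cudf API):

    import warnings

    import pytest

    def deprecated_read(path, use_python_file_object=None):
        # Stand-in for a cudf reader that warns when the keyword is passed.
        if use_python_file_object is not None:
            warnings.warn(
                "The 'use_python_file_object' keyword is deprecated and "
                "will be removed in a future version.",
                FutureWarning,
            )
        return path

    def test_deprecated_keyword_warns():
        # pytest.warns fails if no matching warning is raised in the block.
        with pytest.warns(FutureWarning, match="use_python_file_object"):
            deprecated_read("data.parquet", use_python_file_object=True)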
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 76c7f2bfdb8..80555750b3a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -6,6 +6,7 @@
 import warnings
 from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper
 from threading import Thread
+from typing import Callable
 
 import fsspec
 import fsspec.implementations.local
@@ -15,6 +16,7 @@
 from pyarrow import PythonFile as ArrowPythonFile
 from pyarrow.lib import NativeFile
 
+from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
@@ -24,7 +26,6 @@
 except ImportError:
     fsspec_parquet = None
 
-
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -86,7 +87,7 @@
 1  20     rapids
 2  30         ai
 """.format(remote_data_sources=_docstring_remote_sources)
-doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
+doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro)
 
 _docstring_read_parquet_metadata = """
 Read a Parquet file's metadata and schema
@@ -174,15 +175,23 @@
     columns are also loaded.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. Setting this argument to `False`
-    will require the entire file to be copied to host memory, and is highly
-    discouraged.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 open_file_options : dict, optional
     Dictionary of key-value pairs to pass to the function used to open remote
     files. By default, this will be `fsspec.parquet.open_parquet_file`. To
     deactivate optimized precaching, set the "method" to `None` under the
     "precache_options" key. Note that the `open_file_func` key can also be
     used to specify a custom file-open function.
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output cudf readers/writers in the future.
 bytes_per_thread : int, default None
     Determines the number of bytes to be allocated per thread to read the
     files in parallel. When there is a file of large size, we get slightly
@@ -468,8 +477,12 @@
     If True, use row index if available for faster seeking.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger ORC files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -934,7 +947,7 @@
 --------
 cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame.
 """
-doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf)
+doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf)
 
 _docstring_to_hdf = """
 Write the contained data to an HDF5 file using HDFStore.
@@ -1006,7 +1019,7 @@
 cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
 cudf.DataFrame.to_feather : Write out feather-format for DataFrames.
 """
-doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf)
+doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf)
 
 _docstring_read_feather = """
 Load an feather object from the file path, returning a DataFrame.
@@ -1188,8 +1201,12 @@
     the end of the range.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger CSV files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -1409,7 +1426,7 @@
 result : Series
 
 """
-doc_read_text = docfmt_partial(docstring=_docstring_text_datasource)
+doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource)
 
 
 _docstring_get_reader_filepath_or_buffer = """
@@ -1430,9 +1447,19 @@
 use_python_file_object : boolean, default False
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
     AbstractBufferedFile objects.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers.
 open_file_options : dict, optional
     Optional dictionary of keyword arguments to pass to
     `_open_remote_files` (used for remote storage only).
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output cudf readers/writers in the future.
 allow_raw_text_input : boolean, default False
     If True, this indicates the input `path_or_data` could be a raw text
     input and will not check for its existence in the filesystem. If False,
@@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer(
     mode="rb",
     fs=None,
     iotypes=(BytesIO, NativeFile),
-    use_python_file_object=False,
+    # no_default aliases to False
+    use_python_file_object=no_default,
     open_file_options=None,
     allow_raw_text_input=False,
     storage_options=None,
@@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer(
 
     path_or_data = stringify_pathlike(path_or_data)
 
+    if use_python_file_object is no_default:
+        use_python_file_object = False
+    elif use_python_file_object is not None:
+        warnings.warn(
+            "The 'use_python_file_object' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+    else:
+        # Preserve the readers (e.g. read_csv) default of True
+        # if no use_python_file_object option is specified by the user
+        # for now (note: this is different from the default for this
+        # function of False)
+        # TODO: when non-pyarrow file reading perf is good enough
+        # we can default this to False
+        use_python_file_object = True
+
+    if open_file_options is not None:
+        warnings.warn(
+            "The 'open_file_options' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+
     if isinstance(path_or_data, str):
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
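The branch added to get_reader_filepath_or_buffer above distinguishes three states: the no_default sentinel (keyword omitted, resolving to this function's default of False), an explicit True/False (deprecation warning), and None (a reader such as read_csv forwarding its legacy default, resolving to True without a warning). A condensed sketch of that sentinel pattern, independent of cudf:

    import warnings

    _NO_DEFAULT = object()  # sentinel meaning "caller did not pass the keyword"

    def get_reader(path, use_python_file_object=_NO_DEFAULT):
        if use_python_file_object is _NO_DEFAULT:
            # Keyword omitted entirely: take the new default, no warning.
            use_python_file_object = False
        elif use_python_file_object is not None:
            # Caller passed an explicit value: warn about the deprecation.
            warnings.warn(
                "The 'use_python_file_object' keyword is deprecated and "
                "will be removed in a future version.",
                FutureWarning,
            )
        else:
            # None: a reader forwarded its own legacy default of True.
            use_python_file_object = True
        return path, use_python_file_object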
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 7347ec7866a..c9b343e0f9f 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -6,6 +6,7 @@
 import os
 import traceback
 import warnings
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
     return result_col
+
+
+@contextmanager
+def maybe_filter_deprecation(
+    condition: bool, message: str, category: type[Warning]
+):
+    """Conditionally filter a warning category.
+
+    Parameters
+    ----------
+    condition
+        If true, filter the warning
+    message
+        Message to match, passed to :func:`warnings.filterwarnings`
+    category
+        Category of warning, passed to :func:`warnings.filterwarnings`
+    """
+    with warnings.catch_warnings():
+        if condition:
+            warnings.filterwarnings(
+                "ignore",
+                message,
+                category=category,
+            )
+        yield
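A short usage sketch of the new helper, assuming this patch is applied so that it is importable from cudf.utils.utils; note the message argument is a regular expression matched against the start of the warning text, per warnings.filterwarnings:

    import warnings

    from cudf.utils.utils import maybe_filter_deprecation

    # Suppressed: condition is True, so the matching FutureWarning is filtered.
    with maybe_filter_deprecation(True, message="old API", category=FutureWarning):
        warnings.warn("old API is deprecated", FutureWarning)

    # Propagates: condition is False, so the warning reaches the user as usual.
    with maybe_filter_deprecation(False, message="old API", category=FutureWarning):
        warnings.warn("old API is deprecated", FutureWarning)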
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
             storage_options=s3so,
             open_file_options=open_file_options,
         )
-        assert df.a.sum().compute() == 10
-        assert df.b.sum().compute() == 9
+        with pytest.warns(FutureWarning):
+            assert df.a.sum().compute() == 10
+        with pytest.warns(FutureWarning):
+            assert df.b.sum().compute() == 9
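In the dask_cudf test the pytest.warns blocks wrap compute() rather than the read_parquet call itself: the collection is lazy, so the deprecated open_file_options code path in cudf's reader runs when tasks execute, which is where the warning surfaces here. A toy illustration of that timing with dask.delayed (not tied to this patch):

    import warnings

    import dask

    @dask.delayed
    def deprecated_task():
        warnings.warn("open_file_options is deprecated", FutureWarning)
        return 1

    lazy = deprecated_task()   # no warning yet: only the graph is built
    result = lazy.compute()    # the FutureWarning fires here, at execution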