From 4e34a20a31fae2546f9cfbaa520d7561b80563c7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 1 Jul 2024 11:18:25 -0500 Subject: [PATCH 01/44] Backport: Fix segfault in conditional join (#16094) (#16100) Backports #16094 to 24.06 for inclusion in a hotfix release. --- cpp/src/join/conditional_join.cu | 13 +--- cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++------- 2 files changed, 70 insertions(+), 35 deletions(-) diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index f02dee5f7f5..97a06d5a923 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -48,8 +48,7 @@ std::unique_ptr> conditional_join_anti_semi( { if (right.num_rows() == 0) { switch (join_type) { - case join_kind::LEFT_ANTI_JOIN: - return std::make_unique>(left.num_rows(), stream, mr); + case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first; case join_kind::LEFT_SEMI_JOIN: return std::make_unique>(0, stream, mr); default: CUDF_FAIL("Invalid join kind."); break; @@ -96,10 +95,6 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - if (left.num_rows() == 0) { - return std::make_unique>(0, stream, mr); - } - rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -149,8 +144,7 @@ conditional_join(table_view const& left, // with a corresponding NULL from the right. case join_kind::LEFT_JOIN: case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: - return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource()); + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr); // Inner and left semi joins return empty output because no matches can exist. 
case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: @@ -169,8 +163,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = - get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource()); + auto ret_flipped = get_trivial_left_join_indices(right, stream, mr); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 79968bcd7f4..7ab4a2ea465 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::vector> expected_outputs) { auto result_size = this->join_size(left, right, predicate); - EXPECT_TRUE(result_size == expected_outputs.size()); - - auto result = this->join(left, right, predicate); - std::vector> result_pairs; - for (size_t i = 0; i < result.first->size(); ++i) { - // Note: Not trying to be terribly efficient here since these tests are - // small, otherwise a batch copy to host before constructing the tuples - // would be important. 
- result_pairs.push_back({result.first->element(i, cudf::get_default_stream()), - result.second->element(i, cudf::get_default_stream())}); - } + EXPECT_EQ(result_size, expected_outputs.size()); + + auto result = this->join(left, right, predicate); + auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream()); + auto rhs_result = + cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream()); + std::vector> result_pairs(lhs_result.size()); + std::transform(lhs_result.begin(), + lhs_result.end(), + rhs_result.begin(), + result_pairs.begin(), + [](cudf::size_type lhs, cudf::size_type rhs) { + return std::pair{lhs, rhs}; + }); std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); + EXPECT_TRUE(std::equal( + expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end())); } /* @@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty) this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); }; +TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {}); +}; + TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual) { this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}}); @@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty) this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); }; +TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, + {{}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}}); +}; + TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash) { auto [left, right] = gen_random_repeated_columns(); @@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty) 
{{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}}); }; +TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, + {{}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}}); +}; + TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, @@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); auto result_size = this->join_size(left, right, predicate); - EXPECT_TRUE(result_size == expected_outputs.size()); - - auto result = this->join(left, right, predicate); - std::vector resulting_indices; - for (size_t i = 0; i < result->size(); ++i) { - // Note: Not trying to be terribly efficient here since these tests are - // small, otherwise a batch copy to host before constructing the tuples - // would be important. 
- resulting_indices.push_back(result->element(i, cudf::get_default_stream())); - } - std::sort(resulting_indices.begin(), resulting_indices.end()); + EXPECT_EQ(result_size, expected_outputs.size()); + + auto result = this->join(left, right, predicate); + auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream()); + std::sort(result_indices.begin(), result_indices.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE( - std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(result_indices.begin(), + result_indices.end(), + expected_outputs.begin(), + expected_outputs.end())); } void _compare_to_hash_join(std::unique_ptr> const& result, @@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest { TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty) +{ + this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); +}; + +TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {}); +}; + TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1}); @@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest { TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty) +{ + this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); +}; + +TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2}); +}; + TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, 
left_zero_eq_right_zero, {2}); From e41242094092f9ed31fd4d04f8a30107c1ffb2ff Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Jul 2024 11:24:52 -0700 Subject: [PATCH 02/44] Backport #16038 to 24.06 (#16101) Backporting #16038 for a patch release. --------- Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> --- cpp/include/cudf/ast/detail/operators.hpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index b618f33a6e5..c483d459833 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -819,7 +820,17 @@ struct operator_functor { template struct cast { static constexpr auto arity{1}; - template + template ()>* = nullptr> + __device__ inline auto operator()(From f) -> To + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template ()>* = nullptr> __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); From dfab1b589e5907b324dc1688f6dab862d194012c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 1 Jul 2024 15:33:42 -0500 Subject: [PATCH 03/44] Backport: Use size_t to allow large conditional joins (#16127) (#16133) Backports #16127 to 24.06 for inclusion in a hotfix release. 
--------- Co-authored-by: Vyas Ramasubramani --- cpp/src/join/conditional_join.cu | 5 +- cpp/src/join/conditional_join_kernels.cuh | 124 ++++++++++++++++++++-- cpp/src/join/join_common_utils.cuh | 95 ----------------- 3 files changed, 117 insertions(+), 107 deletions(-) diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 97a06d5a923..d4ef2747c9d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -95,7 +95,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -232,13 +232,14 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); auto const& join_output_l = left_indices->data(); auto const& join_output_r = right_indices->data(); + if (has_nulls) { conditional_join <<>>( diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 1e16c451f5a..62769862f54 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -29,6 +29,110 @@ namespace cudf { namespace detail { +/** + * @brief Adds a pair of indices to the shared memory cache + * + * @param[in] first The first index in the pair + * @param[in] second The second index in the pair + * @param[in,out] current_idx_shared Pointer to shared index that determines + * where in the shared memory cache the pair will be written + * @param[in] warp_id The ID of the warp of the calling the thread + * @param[out] joined_shared_l Pointer to the shared memory cache for left indices + * @param[out] joined_shared_r Pointer to the shared memory cache for 
right indices + */ +__inline__ __device__ void add_pair_to_cache(size_type const first, + size_type const second, + std::size_t* current_idx_shared, + int const warp_id, + size_type* joined_shared_l, + size_type* joined_shared_r) +{ + cuda::atomic_ref ref{*(current_idx_shared + warp_id)}; + std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed); + // It's guaranteed to fit into the shared cache + joined_shared_l[my_current_idx] = first; + joined_shared_r[my_current_idx] = second; +} + +__inline__ __device__ void add_left_to_cache(size_type const first, + std::size_t* current_idx_shared, + int const warp_id, + size_type* joined_shared_l) +{ + cuda::atomic_ref ref{*(current_idx_shared + warp_id)}; + std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed); + joined_shared_l[my_current_idx] = first; +} + +template +__device__ void flush_output_cache(unsigned int const activemask, + std::size_t const max_size, + int const warp_id, + int const lane_id, + std::size_t* current_idx, + std::size_t current_idx_shared[num_warps], + size_type join_shared_l[num_warps][output_cache_size], + size_type join_shared_r[num_warps][output_cache_size], + size_type* join_output_l, + size_type* join_output_r) +{ + // count how many active threads participating here which could be less than warp_size + int const num_threads = __popc(activemask); + std::size_t output_offset = 0; + + if (0 == lane_id) { + cuda::atomic_ref ref{*current_idx}; + output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed); + } + + // No warp sync is necessary here because we are assuming that ShuffleIndex + // is internally using post-CUDA 9.0 synchronization-safe primitives + // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to + // be safe by the compiler because it is not required by the standard to + // converge divergent branches before executing. 
+ output_offset = cub::ShuffleIndex(output_offset, 0, activemask); + + for (std::size_t shared_out_idx = static_cast(lane_id); + shared_out_idx < current_idx_shared[warp_id]; + shared_out_idx += num_threads) { + std::size_t thread_offset = output_offset + shared_out_idx; + if (thread_offset < max_size) { + join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; + join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx]; + } + } +} + +template +__device__ void flush_output_cache(unsigned int const activemask, + std::size_t const max_size, + int const warp_id, + int const lane_id, + std::size_t* current_idx, + std::size_t current_idx_shared[num_warps], + size_type join_shared_l[num_warps][output_cache_size], + size_type* join_output_l) +{ + int const num_threads = __popc(activemask); + std::size_t output_offset = 0; + + if (0 == lane_id) { + cuda::atomic_ref ref{*current_idx}; + output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed); + } + + output_offset = cub::ShuffleIndex(output_offset, 0, activemask); + + for (std::size_t shared_out_idx = static_cast(lane_id); + shared_out_idx < current_idx_shared[warp_id]; + shared_out_idx += num_threads) { + std::size_t thread_offset = output_offset + shared_out_idx; + if (thread_offset < max_size) { + join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; + } + } +} + /** * @brief Computes the output size of joining the left table to the right table. 
* @@ -103,14 +207,14 @@ CUDF_KERNEL void compute_conditional_join_output_size( } } - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter if (threadIdx.x == 0) { cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + ref.fetch_add(block_counter, cuda::memory_order_relaxed); } } @@ -143,13 +247,13 @@ CUDF_KERNEL void conditional_join(table_device_view left_table, join_kind join_type, cudf::size_type* join_output_l, cudf::size_type* join_output_r, - cudf::size_type* current_idx, + std::size_t* current_idx, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size, + std::size_t const max_size, bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; - __shared__ cudf::size_type current_idx_shared[num_warps]; + __shared__ std::size_t current_idx_shared[num_warps]; __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size]; @@ -183,7 +287,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table, if (outer_row_index < outer_num_rows) { bool found_match = false; - for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; + for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { auto output_dest = cudf::ast::detail::value_expression_result(); auto const left_row_index = swap_tables ? 
inner_row_index : outer_row_index; @@ -277,12 +381,12 @@ CUDF_KERNEL void conditional_join_anti_semi( table_device_view right_table, join_kind join_type, cudf::size_type* join_output_l, - cudf::size_type* current_idx, + std::size_t* current_idx, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size) + std::size_t const max_size) { constexpr int num_warps = block_size / detail::warp_size; - __shared__ cudf::size_type current_idx_shared[num_warps]; + __shared__ std::size_t current_idx_shared[num_warps]; __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; extern __shared__ char raw_intermediate_storage[]; @@ -310,7 +414,7 @@ CUDF_KERNEL void conditional_join_anti_semi( for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows; outer_row_index += stride) { bool found_match = false; - for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; + for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { auto output_dest = cudf::ast::detail::value_expression_result(); diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 31f267d5cfb..3d0f3e4340d 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -262,101 +262,6 @@ struct valid_range { } }; -/** - * @brief Adds a pair of indices to the shared memory cache - * - * @param[in] first The first index in the pair - * @param[in] second The second index in the pair - * @param[in,out] current_idx_shared Pointer to shared index that determines - * where in the shared memory cache the pair will be written - * @param[in] warp_id The ID of the warp of the calling the thread - * @param[out] joined_shared_l Pointer to the shared memory cache for left indices - * @param[out] joined_shared_r Pointer to the shared memory cache for right indices - */ -__inline__ __device__ void add_pair_to_cache(size_type 
const first, - size_type const second, - size_type* current_idx_shared, - int const warp_id, - size_type* joined_shared_l, - size_type* joined_shared_r) -{ - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; - // its guaranteed to fit into the shared cache - joined_shared_l[my_current_idx] = first; - joined_shared_r[my_current_idx] = second; -} - -__inline__ __device__ void add_left_to_cache(size_type const first, - size_type* current_idx_shared, - int const warp_id, - size_type* joined_shared_l) -{ - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; - - joined_shared_l[my_current_idx] = first; -} - -template -__device__ void flush_output_cache(unsigned int const activemask, - cudf::size_type const max_size, - int const warp_id, - int const lane_id, - cudf::size_type* current_idx, - cudf::size_type current_idx_shared[num_warps], - size_type join_shared_l[num_warps][output_cache_size], - size_type join_shared_r[num_warps][output_cache_size], - size_type* join_output_l, - size_type* join_output_r) -{ - // count how many active threads participating here which could be less than warp_size - int const num_threads = __popc(activemask); - cudf::size_type output_offset = 0; - - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } - - // No warp sync is necessary here because we are assuming that ShuffleIndex - // is internally using post-CUDA 9.0 synchronization-safe primitives - // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to - // be safe by the compiler because it is not required by the standard to - // converge divergent branches before executing. 
- output_offset = cub::ShuffleIndex(output_offset, 0, activemask); - - for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id]; - shared_out_idx += num_threads) { - cudf::size_type thread_offset = output_offset + shared_out_idx; - if (thread_offset < max_size) { - join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; - join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx]; - } - } -} - -template -__device__ void flush_output_cache(unsigned int const activemask, - cudf::size_type const max_size, - int const warp_id, - int const lane_id, - cudf::size_type* current_idx, - cudf::size_type current_idx_shared[num_warps], - size_type join_shared_l[num_warps][output_cache_size], - size_type* join_output_l) -{ - int const num_threads = __popc(activemask); - cudf::size_type output_offset = 0; - - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } - - output_offset = cub::ShuffleIndex(output_offset, 0, activemask); - - for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id]; - shared_out_idx += num_threads) { - cudf::size_type thread_offset = output_offset + shared_out_idx; - if (thread_offset < max_size) { - join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; - } - } -} - } // namespace detail } // namespace cudf From 781794bb52448f617351ed96441a8e2fdb765dd7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Jul 2024 14:59:04 -0700 Subject: [PATCH 04/44] Backport #16045 to 24.06 (#16102) Backporting #16045 for a patch release. 
--------- Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> --- cpp/tests/ast/transform_tests.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index ef1d09e5652..6b350c137d0 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); } +TEST_F(TransformTest, BasicAdditionDoubleCast) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + std::vector<__int128_t> data1{10, 7, 20, 0}; + auto c_1 = cudf::test::fixed_point_column_wrapper<__int128_t>( + data1.begin(), data1.end(), numeric::scale_type{0}); + auto table = cudf::table_view{{c_0, c_1}}; + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast); + auto expected = column_wrapper{13, 27, 21, 50}; + auto result = cudf::compute_column(table, expression); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); +} + TEST_F(TransformTest, Literal) { auto c_0 = column_wrapper{3, 20, 1, 50}; From aeef0a1f4159d4c87f987d20225401040973d10f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:56:30 -0400 Subject: [PATCH 05/44] Remove hash_character_ngrams dependency from jaccard_index (#16241) Removes internal dependency of `nvtext::hash_character_ngrams` from `nvtext::jaccard_index`. Works around the size-type limit imposed by `hash_character_ngrams` which returns a `list` column. This also specializes the hashing logic for the jaccard calculation specifically. The overall algorithm has not changed. 
Code has moved around a bit and internal list-columns have been replaced with just offsets and values vectors. Closes #16157 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16241 --- cpp/benchmarks/text/jaccard.cpp | 4 +- cpp/src/text/jaccard.cu | 478 ++++++++++++++++++++++---------- 2 files changed, 339 insertions(+), 143 deletions(-) diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d05c195d077..d5b74da6773 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {32768, 131072, 262144}) + .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 9cf934165f6..e465fb79c89 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -19,16 +19,19 @@ #include #include #include +#include #include -#include +#include +#include +#include #include #include #include -#include #include #include +#include #include #include @@ -36,127 +39,375 @@ #include #include #include +#include +#include +#include #include namespace nvtext { namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + /** * @brief Retrieve the row data (span) for the given column/row-index * - * @param d_input Input lists column + * @param values Flat vector of all values + * @param offsets Offsets identifying rows within values * @param idx Row index to retrieve * @return A device-span of the row 
values */ -__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx) +__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx) { - auto const offsets = - d_input.child(cudf::lists_column_view::offsets_column_index).data(); - auto const offset = offsets[idx]; - auto const size = offsets[idx + 1] - offset; - auto const begin = - d_input.child(cudf::lists_column_view::child_column_index).data() + offset; + auto const offset = offsets[row_idx]; + auto const size = offsets[row_idx + 1] - offset; + auto const begin = values + offset; return cudf::device_span(begin, size); } /** - * @brief Count the unique values within each row of the input column + * @brief Kernel to count the unique values within each row of the input column + * + * This is called with a warp per row. * - * This is called with a warp per row + * @param d_values Sorted hash values to count uniqueness + * @param d_offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param d_results Number of unique values in each row */ -struct sorted_unique_fn { - cudf::column_device_view const d_input; - cudf::size_type* d_results; +CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values, + int64_t const* d_offsets, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - auto const row = get_row(d_input, row_idx); - auto const begin = row.begin(); + auto const row_idx = idx / 
cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + auto const row = get_row(d_values, d_offsets, row_idx); + auto const begin = row.begin(); - cudf::size_type count = 0; - for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { - count += (itr == begin || *itr != *(itr - 1)); - } - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + cudf::size_type count = 0; + for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { + count += (itr == begin || *itr != *(itr - 1)); } -}; + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} -rmm::device_uvector compute_unique_counts(cudf::column_view const& input, +/** + * @brief Count the unique values within each row of the input column + * + * @param values Sorted hash values to count uniqueness + * @param offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of unique values + */ +rmm::device_uvector compute_unique_counts(uint32_t const* values, + int64_t const* offsets, + cudf::size_type rows, rmm::cuda_stream_view stream) { - auto const d_input = cudf::column_device_view::create(input, stream); - auto d_results = rmm::device_uvector(input.size(), stream); - sorted_unique_fn fn{*d_input, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size() * cudf::detail::warp_size, - fn); + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_unique_fn<<>>( + values, offsets, rows, d_results.data()); return d_results; } +/** + * @brief Kernel to count the number of common values within each row of the 
2 input columns + * + * This is called with a warp per row. + * + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param d_results Number of common values in each row + */ +CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1, + int64_t const* d_offsets1, + uint32_t const* d_values2, + int64_t const* d_offsets2, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; + + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + auto const needles = get_row(d_values1, d_offsets1, row_idx); + auto const haystack = get_row(d_values2, d_offsets2, row_idx); + + auto begin = haystack.begin(); + auto const end = haystack.end(); + + cudf::size_type count = 0; + for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; + itr += cudf::detail::warp_size) { + if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates + // search haystack for this needle (*itr) + auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); + count += (found != end) && (*found == *itr); // increment if found; + begin = found; // shorten the next lower-bound range + } + // sum up the counts across this warp + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} + /** * @brief Count the number of common values within each row of the 2 input columns * - * This is called with a warp per row + * @param d_values1 Sorted 
hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of common values */ -struct sorted_intersect_fn { - cudf::column_device_view const d_input1; - cudf::column_device_view const d_input2; - cudf::size_type* d_results; +rmm::device_uvector compute_intersect_counts(uint32_t const* values1, + int64_t const* offsets1, + uint32_t const* values2, + int64_t const* offsets2, + cudf::size_type rows, + rmm::cuda_stream_view stream) +{ + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_intersect_fn<<>>( + values1, offsets1, values2, offsets2, rows, d_results.data()); + return d_results; +} - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; +/** + * @brief Counts the number of substrings in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(1, str.length() - width + 1)` + * If a string has less than width characters (but not empty), the count is 1 + * since the entire string is still hashed. 
+ * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_counts Output number of substring per row of input + */ +CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t* d_counts) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } - auto const needles = get_row(d_input1, row_idx); - auto const haystack = get_row(d_input2, row_idx); + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } - auto begin = haystack.begin(); - auto const end = haystack.end(); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - // TODO: investigate cuCollections device-side static-map to match row values + auto const end = d_str.data() + d_str.size_bytes(); + auto const lane_idx = idx % cudf::detail::warp_size; + cudf::size_type count = 0; + for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; + itr += cudf::detail::warp_size * bytes_per_thread) { + for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { + count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); + } + } + auto const char_count = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } +} + +/** + * @brief Kernel to hash the substrings for each input row + * + * Each warp processes a single string. 
+ * Substrings of string "hello world" with width=4 produce: + * "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld" + * Each of these substrings is hashed and the hash stored in d_results + * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_output_offsets Offsets into d_results + * @param d_results Hash values for each substring + */ +CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t const* d_output_offsets, + uint32_t* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - cudf::size_type count = 0; - for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; - itr += cudf::detail::warp_size) { - if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates - // search haystack for this needle (*itr) - auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); - count += (found != end) && (*found == *itr); // increment if found; - begin = found; // shorten the next lower-bound range + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + + __shared__ uint32_t hvs[block_size]; // temp store for hash values + + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const end = d_str.data() + d_str.size_bytes(); + auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; + + auto d_hashes = d_results + d_output_offsets[str_idx]; + auto itr = d_str.data() + lane_idx; + for (auto i = 0; i < warp_count; ++i) { + uint32_t hash = 0; + if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) { + // resolve substring + auto const sub_str = + 
cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width); + // hash only if we have the full width of characters or this is the beginning of the string + if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); } } - // sum up the counts across this warp - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + hvs[threadIdx.x] = hash; // store hash into shared memory + __syncwarp(); + if (lane_idx == 0) { + // copy valid hash values for this warp into d_hashes + auto const hashes = &hvs[threadIdx.x]; + auto const hashes_end = hashes + cudf::detail::warp_size; + d_hashes = + thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; }); + } + __syncwarp(); + itr += cudf::detail::warp_size; } -}; +} -rmm::device_uvector compute_intersect_counts(cudf::column_view const& input1, - cudf::column_view const& input2, - rmm::cuda_stream_view stream) +void segmented_sort(uint32_t const* input, + uint32_t* output, + int64_t items, + cudf::size_type segments, + int64_t const* offsets, + rmm::cuda_stream_view stream) { - auto const d_input1 = cudf::column_device_view::create(input1, stream); - auto const d_input2 = cudf::column_device_view::create(input2, stream); - auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input1.size() * cudf::detail::warp_size, - fn); - return d_results; + rmm::device_buffer temp; + std::size_t temp_bytes = 0; + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); + temp = rmm::device_buffer(temp_bytes, stream); + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, 
items, segments, offsets, offsets + 1, stream.value()); +} + +/** + * @brief Create hashes for each substring + * + * The hashes are sorted using a segmented-sort as setup to + * perform the unique and intersect operations. + * + * @param input Input strings column to hash + * @param width Substring width in characters + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The sorted hash values and offsets to each row + */ +std::pair, rmm::device_uvector> hash_substrings( + cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + // count substrings + auto offsets = rmm::device_uvector(input.size() + 1, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, block_size); + count_substrings_kernel<<>>( + *d_strings, width, offsets.data()); + auto const total_hashes = + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + + // hash substrings + rmm::device_uvector hashes(total_hashes, stream); + substring_hash_kernel<<>>( + *d_strings, width, offsets.data(), hashes.data()); + + // sort hashes + rmm::device_uvector sorted(total_hashes, stream); + if (total_hashes < static_cast(std::numeric_limits::max())) { + segmented_sort( + hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream); + } else { + // The CUB segmented sort can only handle max total values + // so this code calls it in sections. 
+ auto const section_size = std::numeric_limits::max() / 2L; + auto const sort_sections = cudf::util::div_rounding_up_safe(total_hashes, section_size); + auto const offset_indices = [&] { + // build a set of indices that point to offsets subsections + auto sub_offsets = rmm::device_uvector(sort_sections + 1, stream); + thrust::sequence( + rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size); + auto indices = rmm::device_uvector(sub_offsets.size(), stream); + thrust::lower_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + sub_offsets.begin(), + sub_offsets.end(), + indices.begin()); + return cudf::detail::make_std_vector_sync(indices, stream); + }(); + + // Call segmented sort with the sort sections + for (auto i = 0L; i < sort_sections; ++i) { + auto const index1 = offset_indices[i]; + auto const index2 = std::min(offset_indices[i + 1], static_cast(offsets.size() - 1)); + auto const offset1 = offsets.element(index1, stream); + auto const offset2 = offsets.element(index2, stream); + + auto const num_items = offset2 - offset1; + auto const num_segments = index2 - index1; + + // There is a bug in the CUB segmented sort and the workaround is to + // shift the offset values so the first offset is 0. + // This transform can be removed once the bug is fixed. + auto sort_offsets = rmm::device_uvector(num_segments + 1, stream); + thrust::transform(rmm::exec_policy(stream), + offsets.begin() + index1, + offsets.begin() + index2 + 1, + sort_offsets.begin(), + [offset1] __device__(auto const o) { return o - offset1; }); + + segmented_sort(hashes.begin() + offset1, + sorted.begin() + offset1, + num_items, + num_segments, + sort_offsets.begin(), + stream); + } + } + return std::make_pair(std::move(sorted), std::move(offsets)); } /** @@ -186,62 +437,6 @@ struct jaccard_fn { } }; -/** - * @brief Create hashes for each substring - * - * Uses the hash_character_ngrams to hash substrings of the input column. 
- * This returns a lists column where each row is the hashes for the substrings - * of the corresponding input string row. - * - * The hashes are then sorted using a segmented-sort as setup to - * perform the unique and intersect operations. - */ -std::unique_ptr hash_substrings(cudf::strings_column_view const& col, - cudf::size_type width, - rmm::cuda_stream_view stream) -{ - auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource()); - auto const input = cudf::lists_column_view(hashes->view()); - auto const offsets = input.offsets_begin(); - auto const data = input.child().data(); - - rmm::device_uvector sorted(input.child().size(), stream); - - // this is wicked fast and much faster than using cudf::lists::detail::sort_list - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - - auto contents = hashes->release(); - // the offsets are taken from the hashes column since they are the same - // before and after the segmented-sort - return cudf::make_lists_column( - col.size(), - std::move(contents.children.front()), - std::make_unique(std::move(sorted), rmm::device_buffer{}, 0), - 0, - rmm::device_buffer{}, - stream, - rmm::mr::get_current_device_resource()); -} } // namespace std::unique_ptr jaccard_index(cudf::strings_column_view const& input1, @@ -261,13 +456,14 @@ std::unique_ptr jaccard_index(cudf::strings_column_view const& inp auto const [d_uniques1, d_uniques2, d_intersects] = [&] { // build hashes of the substrings - auto const hash1 = hash_substrings(input1, width, stream); 
- auto const hash2 = hash_substrings(input2, width, stream); + auto const [hash1, offsets1] = hash_substrings(input1, width, stream); + auto const [hash2, offsets2] = hash_substrings(input2, width, stream); // compute the unique counts in each set and the intersection counts - auto d_uniques1 = compute_unique_counts(hash1->view(), stream); - auto d_uniques2 = compute_unique_counts(hash2->view(), stream); - auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream); + auto d_uniques1 = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream); + auto d_uniques2 = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream); + auto d_intersects = compute_intersect_counts( + hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream); return std::tuple{std::move(d_uniques1), std::move(d_uniques2), std::move(d_intersects)}; }(); From 4acca4d57303f52907aa158a2ef996c9d42a73d6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jul 2024 11:07:07 -1000 Subject: [PATCH 06/44] Use Column.can_cast_safely instead of some ad-hoc dtype functions in .where (#16303) There were a couple of dedicated functions in `python/cudf/cudf/utils/dtypes.py` specific to `.where` that could be subsumed by `Column.can_cast_safely`. 
The minor downside is that we need to cast where's argument to a Column first, but IMO it's probably OK given the deduplication Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16303 --- python/cudf/cudf/core/_internals/where.py | 78 ++++++++++++++---- python/cudf/cudf/utils/dtypes.py | 96 +---------------------- 2 files changed, 62 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 4a36be76b6d..6003a0f6aea 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -9,12 +9,7 @@ import cudf from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import ( - _can_cast, - _dtype_can_hold_element, - find_common_type, - is_mixed_with_object_dtype, -) +from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype if TYPE_CHECKING: from cudf._typing import ScalarLike @@ -44,6 +39,8 @@ def _check_and_cast_columns_with_other( inplace: bool, ) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. 
+ from cudf.core.column import as_column + source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): return _normalize_categorical(source_col, other) @@ -84,17 +81,9 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if ( - _is_non_decimal_numeric_dtype(source_dtype) - and not other_is_scalar # can-cast fails for Python scalars - and _can_cast(other, source_dtype) - ): - common_dtype = source_dtype - elif ( - isinstance(source_col, cudf.core.column.NumericalColumn) - and other_is_scalar - and _dtype_can_hold_element(source_dtype, other) - ): + if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + other + ).can_cast_safely(source_dtype): common_dtype = source_dtype else: common_dtype = find_common_type( @@ -130,3 +119,58 @@ def _make_categorical_like(result, column): ordered=column.ordered, ) return result + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. 
+ """ + if cudf.utils.utils.is_na_like(from_dtype): + return True + if isinstance(from_dtype, type): + from_dtype = cudf.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = cudf.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + + if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): + if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 59e5ec1df04..af912bee342 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -10,8 +10,6 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf -from cudf._typing import DtypeObj -from cudf.api.types import is_bool, is_float, is_integer """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). 
Special @@ -584,61 +582,6 @@ def _dtype_pandas_compatible(dtype): return dtype -def _can_cast(from_dtype, to_dtype): - """ - Utility function to determine if we can cast - from `from_dtype` to `to_dtype`. This function primarily calls - `np.can_cast` but with some special handling around - cudf specific dtypes. - """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) - - def _maybe_convert_to_default_type(dtype): """Convert `dtype` to default if specified by user. 
@@ -661,44 +604,7 @@ def _maybe_convert_to_default_type(dtype): return dtype -def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: - if not len(rng): - return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) - - -def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: - if dtype.kind in {"i", "u"}: - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return True - return False - - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return True - return False - - elif dtype.kind == "f": - if is_integer(element) or is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return True - # otherwise e.g. overflow see TestCoercionFloat32 - return False - - elif dtype.kind == "b": - if is_bool(element): - return True - return False - - raise NotImplementedError(f"Unsupported dtype: {dtype}") - - -def _get_base_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: # TODO: replace the use of this function with just `dtype.base` # when Pandas 2.1.0 is the minimum version we support: # https://github.com/pandas-dev/pandas/pull/52706 From debbef0bc12f523054740432983030dd0b24f9c4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 19 Jul 2024 15:12:56 +0100 Subject: [PATCH 07/44] Update vendored thread_pool implementation (#16210) Since we introduced the vendored thread_pool in #8752, upstream has introduced some new features, and particularly now uses condition variables/notification to handle when there are no tasks in the queue. This avoids the issue described in #16209 where the thread pool by default artificially introduces a delay of 1000microseconds to all tasks whenever the task queue is emptied. 
- Closes #16209 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16210 --- cpp/CMakeLists.txt | 4 +- .../groupby/group_max_multithreaded.cpp | 10 +- .../io/orc/orc_reader_multithreaded.cpp | 26 +- .../io/parquet/parquet_reader_multithread.cpp | 26 +- cpp/cmake/thirdparty/get_thread_pool.cmake | 31 ++ cpp/include/cudf/utilities/thread_pool.hpp | 381 ------------------ cpp/src/io/utilities/file_io_utilities.cpp | 6 +- cpp/src/io/utilities/file_io_utilities.hpp | 7 +- 8 files changed, 66 insertions(+), 425 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_thread_pool.cmake delete mode 100644 cpp/include/cudf/utilities/thread_pool.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 903cff27be4..65347bd6689 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) +# find thread_pool +include(cmake/thirdparty/get_thread_pool.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -804,7 +806,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp index 3b8faba618f..bf1a1a5fcf7 100644 --- a/cpp/benchmarks/groupby/group_max_multithreaded.cpp +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -20,8 +20,8 @@ #include #include #include -#include +#include #include template @@ 
-58,7 +58,7 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list> requests(num_threads); for (auto& thread_requests : requests) { @@ -75,10 +75,8 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list #include #include -#include +#include #include #include @@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - 
threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..9e76ebb71ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include +#include #include #include @@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + 
BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake new file mode 100644 index 00000000000..264257c7199 --- /dev/null +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -0,0 +1,31 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds rmm and sets any additional necessary environment variables. 
+function(find_and_configure_thread_pool) + rapids_cpm_find( + BS_thread_pool 4.1.0 + CPM_ARGS + GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git + GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 + GIT_SHALLOW TRUE + ) + if(NOT TARGET BS_thread_pool) + add_library(BS_thread_pool INTERFACE) + target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) + target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") + endif() +endfunction() + +find_and_configure_thread_pool() diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp deleted file mode 100644 index c8c3eb097c4..00000000000 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/** - * Modified from https://github.com/bshoshany/thread-pool - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. 
- * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move, std::swap - -namespace cudf { -namespace detail { - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a - * thread becomes available, it pops a task from the queue and executes it. Each task is - * automatically assigned a future, which can be used to wait for the task to finish executing - * and/or obtain its eventual return value. - */ -class thread_pool { - using ui32 = int; - - public: - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), - threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. - * Note that if the variable paused is set to true, then any tasks still in the queue will never - * be executed. 
- */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - [[nodiscard]] size_t get_tasks_queued() const - { - std::scoped_lock const lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. - */ - [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a - * thread. - * - * @return The total number of tasks. - */ - [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - [[nodiscard]] ui32 get_thread_count() const { return thread_count; } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the - * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to: - * for (T i = first_index; i <= last_index; i++) loop(i); - * - * @tparam T The type of the loop index. Should be a signed or unsigned integer. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop (inclusive). - * @param last_index The last index in the loop (inclusive). - * @param loop The function to loop through. Should take exactly one argument, the loop index. - * @param num_tasks The maximum number of tasks to split the loop into. The default is to use the - * number of threads in the pool. 
- */ - template - void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0) - { - if (num_tasks == 0) num_tasks = thread_count; - if (last_index < first_index) std::swap(last_index, first_index); - size_t total_size = last_index - first_index + 1; - size_t block_size = total_size / num_tasks; - if (block_size == 0) { - block_size = 1; - num_tasks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_tasks; t++) { - T start = (T)(t * block_size + first_index); - T end = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1); - blocks_running++; - push_task([start, end, &loop, &blocks_running] { - for (T i = start; i <= end; i++) - loop(i); - blocks_running--; - }); - } - while (blocks_running != 0) { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. - */ - template - void push_task(F const& task) - { - tasks_total++; - { - std::scoped_lock const lock(queue_mutex); - tasks.push(std::function(task)); - } - } - - /** - * @brief Push a function with arguments, but no return value, into the task queue. - * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks - * in the queue must be of type std::function, so they cannot have any arguments or return - * value. If no arguments are provided, the other overload will be used, in order to avoid the - * (slight) overhead of using a lambda. - * - * @tparam F The type of the function. - * @tparam A The types of the arguments. - * @param task The function to push. - * @param args The arguments to pass to the function. - */ - template - void push_task(F const& task, A const&... args) - { - push_task([task, args...] { task(args...); }); - } - - /** - * @brief Reset the number of threads in the pool. 
Waits for all currently running tasks to be - * completed, then destroys all threads in the pool and creates a new thread pool with the new - * number of threads. Any tasks that were waiting in the queue before the pool was reset will then - * be executed by the new threads. If the pool was paused before resetting it, the new pool will - * be paused as well. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - void reset(ui32 const& _thread_count = std::thread::hardware_concurrency()) - { - bool was_paused = paused; - paused = true; - wait_for_tasks(); - running = false; - destroy_threads(); - thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads = std::make_unique(thread_count); - paused = was_paused; - create_threads(); - running = true; - } - - /** - * @brief Submit a function with zero or more arguments and a return value into the task queue, - * and get a future for its eventual returned value. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @tparam R The return type of the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to obtain the function's returned value, waiting for it to - * finish its execution if needed. - */ - template , std::decay_t...>> - std::future submit(F const& task, A const&... 
args) - { - std::shared_ptr> promise(new std::promise); - std::future future = promise->get_future(); - push_task([task, args..., promise] { - try { - if constexpr (std::is_void_v) { - task(args...); - promise->set_value(); - } else { - promise->set_value(task(args...)); - } - } catch (...) { - promise->set_exception(std::current_exception()); - }; - }); - return future; - } - - /** - * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those - * that are currently running in the threads and those that are still waiting in the queue. - * However, if the variable paused is set to true, this function only waits for the currently - * running tasks (otherwise it would wait forever). To wait for a specific task, use submit() - * instead, and call the wait() member function of the generated future. - */ - void wait_for_tasks() - { - while (true) { - if (!paused) { - if (tasks_total == 0) break; - } else { - if (get_tasks_running() == 0) break; - } - sleep_or_yield(); - } - } - - /** - * @brief An atomic variable indicating to the workers to pause. When set to true, the workers - * temporarily stop popping new tasks out of the queue, although any tasks already executed will - * keep running until they are done. Set to false again to resume popping tasks. - */ - std::atomic paused = false; - - /** - * @brief The duration, in microseconds, that the worker function should sleep for when it cannot - * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will - * execute std::this_thread::yield() if there are no tasks in the queue. The default value is - * 1000. - */ - ui32 sleep_duration = 1000; - - private: - /** - * @brief Create the threads in the pool and assign a worker to each thread. - */ - void create_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i] = std::thread(&thread_pool::worker, this); - } - } - - /** - * @brief Destroy the threads in the pool by joining them. 
- */ - void destroy_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i].join(); - } - } - - /** - * @brief Try to pop a new task out of the queue. - * - * @param task A reference to the task. Will be populated with a function if the queue is not - * empty. - * @return true if a task was found, false if the queue is empty. - */ - bool pop_task(std::function& task) - { - std::scoped_lock const lock(queue_mutex); - if (tasks.empty()) - return false; - else { - task = std::move(tasks.front()); - tasks.pop(); - return true; - } - } - - /** - * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead. - * - */ - void sleep_or_yield() - { - if (sleep_duration) - std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration)); - else - std::this_thread::yield(); - } - - /** - * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out - * of the queue and executes them, as long as the atomic variable running is set to true. - */ - void worker() - { - while (running) { - std::function task; - if (!paused && pop_task(task)) { - task(); - tasks_total--; - } else { - sleep_or_yield(); - } - } - } - - /** - * @brief A mutex to synchronize access to the task queue by different threads. - */ - mutable std::mutex queue_mutex; - - /** - * @brief An atomic variable indicating to the workers to keep running. When set to false, the - * workers permanently stop working. - */ - std::atomic running = true; - - /** - * @brief A queue of tasks to be executed by the threads. - */ - std::queue> tasks; - - /** - * @brief The number of threads in the pool. - */ - ui32 thread_count; - - /** - * @brief A smart pointer to manage the memory allocated for the threads. - */ - std::unique_ptr threads; - - /** - * @brief An atomic variable to keep track of the total number of unfinished tasks - either still - * in the queue, or running in a thread. 
- */ - std::atomic tasks_total = 0; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9fe5959436d..d7b54399f8d 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -223,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) // The benefit from multithreaded read plateaus around 16 threads pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16)) { - pool.sleep_duration = 10; } namespace { @@ -232,14 +231,15 @@ template > std::vector> make_sliced_tasks( - F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) + F function, DataT* ptr, size_t offset, size_t size, BS::thread_pool& pool) { constexpr size_t default_max_slice_size = 4 * 1024 * 1024; static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { - return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + return pool.submit_task( + [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 91ef41fba6e..441bede200d 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -19,8 +19,7 @@ #ifdef CUFILE_FOUND #include -#include - +#include #include #endif @@ -150,7 +149,7 @@ class cufile_input_impl final : public cufile_input { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; /** @@ -167,7 +166,7 @@ class cufile_output_impl final : public cufile_output { private: 
cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; #else From 8ff27ed5bcaf8fc5fc8d1f546dee30c59861c320 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 19 Jul 2024 15:15:20 +0100 Subject: [PATCH 08/44] Support Literals in groupby-agg (#16218) To do this, we just need to collect the appropriate aggregation information, and broadcast literals to the correct size. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16218 --- python/cudf_polars/cudf_polars/dsl/expr.py | 15 +++++++++++++++ python/cudf_polars/cudf_polars/dsl/ir.py | 4 ++-- python/cudf_polars/tests/test_groupby.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f37cb3f475c..a034d55120a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -370,6 +370,10 @@ def do_evaluate( # datatype of pyarrow scalar is correct by construction. return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class LiteralColumn(Expr): __slots__ = ("value",) @@ -382,6 +386,13 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + def get_hash(self) -> int: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. 
+ return hash((type(self), self.dtype, id(self.value))) + def do_evaluate( self, df: DataFrame, @@ -393,6 +404,10 @@ def do_evaluate( # datatype of pyarrow array is correct by construction. return Column(plc.interop.from_arrow(self.value)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class Col(Expr): __slots__ = ("name",) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index cce0c4a3d94..01834ab75a5 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -514,7 +514,7 @@ def check_agg(agg: expr.Expr) -> int: return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) - elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)): return 0 else: raise NotImplementedError(f"No handler for {agg=}") @@ -574,7 +574,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results]).slice(self.options.slice) + return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) @dataclasses.dataclass diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b07d8e38217..b650fee5079 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -155,3 +155,20 @@ def test_groupby_nan_minmax_raises(op): q = df.group_by("key").agg(op(pl.col("value"))) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("key", [1, pl.col("key1")]) +@pytest.mark.parametrize( + "expr", + [ + pl.lit(1).alias("value"), + pl.lit([[4, 5, 6]]).alias("value"), + pl.col("float") * 
(1 - pl.col("int")), + [pl.lit(2).alias("value"), pl.col("float") * 2], + ], +) +def test_groupby_literal_in_agg(df, key, expr): + # check_row_order=False doesn't work for list aggregations + # so just sort by the group key + q = df.group_by(key).agg(expr).sort(key, maintain_order=True) + assert_gpu_result_equal(q) From 9a713e3adb8abb1f41de0445b8ea896fdb48c560 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:34:16 -0400 Subject: [PATCH 09/44] Migrate lists/count_elements to pylibcudf (#16072) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16072 --- python/cudf/cudf/_lib/lists.pyx | 18 +++---------- .../libcudf/lists/count_elements.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 27 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 10 +++++++ 5 files changed, 43 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index ceae1b148aa..76f37c3b845 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -8,9 +8,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( - count_elements as cpp_count_elements, -) from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) @@ -36,19 +33,10 @@ from cudf._lib.pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): - - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.count_elements( + 
col.to_pylibcudf(mode="read")) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_count_elements(list_view.get()[0])) - - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd index 38bdd4db0bb..ba57a839fbc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd @@ -9,4 +9,4 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: - cdef unique_ptr[column] count_elements(const lists_column_view) except + + cdef unique_ptr[column] count_elements(const lists_column_view&) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 38a479e4791..38eb575ee8d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -33,3 +33,5 @@ cpdef Column reverse(Column) cpdef Column segmented_gather(Column, Column) cpdef Column extract_list_element(Column, ColumnOrSizeType) + +cpdef Column count_elements(Column) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 19c961aa014..ea469642dd5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -17,6 +17,9 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( + count_elements as cpp_count_elements, +) from cudf._lib.pylibcudf.libcudf.lists.extract cimport ( extract_list_element as cpp_extract_list_element, ) @@ -293,3 +296,27 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): index.view() if 
ColumnOrSizeType is Column else index, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column count_elements(Column input): + """Count the number of rows in each + list element in the given lists column. + For details, see :cpp:func:`count_elements`. + + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of the lengths of each list element + """ + cdef ListColumnView list_view = input.list_view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_count_elements(list_view.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 07ecaed5012..7cfed884f90 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -181,3 +181,13 @@ def test_extract_list_element_column(test_data): expect = pa.array([0, None, None, 7]) assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.count_elements(plc_column) + + expect = pa.array([1, 1, 0, 3], type=pa.int32()) + + assert_column_eq(expect, res) From 2bbeee95ec338c30c0c876dc6a58376fbb0a5a06 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Fri, 19 Jul 2024 12:43:49 -0400 Subject: [PATCH 10/44] DOC: use intersphinx mapping in pandas-compat ext (#15846) ~~If https://github.com/rapidsai/cudf/pull/15704 is merged~~ This PR changes the header in the admonition (pandas compat box) to be hyperlinked to the pandas docs instead of just text. 
See https://raybellwaves.github.io/compatsphinxext/compat.html which is the docs of a minimal repo where I have been testing Authors: - Ray Bell (https://github.com/raybellwaves) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15846 --- .../source/developer_guide/documentation.md | 2 +- python/cudf/cudf/core/column/lists.py | 12 +++++- python/cudf/cudf/core/column/string.py | 16 ++++---- python/cudf/cudf/core/dataframe.py | 37 ++++++++++--------- python/cudf/cudf/core/frame.py | 10 ++--- python/cudf/cudf/core/groupby/groupby.py | 9 +++-- python/cudf/cudf/core/indexed_frame.py | 28 +++++++------- python/cudf/cudf/core/series.py | 14 +++---- python/cudf/cudf/core/tools/numeric.py | 2 +- python/cudf/cudf/core/window/ewm.py | 2 +- 10 files changed, 72 insertions(+), 60 deletions(-) diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index c8da689479c..4f5a57fec02 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -164,7 +164,7 @@ The directive should be used inside docstrings like so: Docstring body .. pandas-compat:: - **$API_NAME** + :meth:`pandas.DataFrame.METHOD` Explanation of differences ``` diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index cc15e78314e..46b844413f7 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -646,9 +646,17 @@ def sort_values( dtype: list .. pandas-compat:: - **ListMethods.sort_values** + `pandas.Series.list.sort_values` - The ``inplace`` and ``kind`` arguments are currently not supported. 
+ This method does not exist in pandas but it can be run + as: + + >>> import pandas as pd + >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) + >>> print(s.apply(sorted)) + 0 [1, 2, 3] + 1 [2, 3, 4] + dtype: object """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 96f9cdfd655..ec95c50f455 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -612,7 +612,7 @@ def extract( dtype: object .. pandas-compat:: - **StringMethods.extract** + :meth:`pandas.Series.str.extract` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -738,7 +738,7 @@ def contains( dtype: bool .. pandas-compat:: - **StringMethods.contains** + :meth:`pandas.Series.str.contains` The parameters `case` and `na` are not yet supported and will raise a NotImplementedError if anything other than the default @@ -974,7 +974,7 @@ def replace( dtype: object .. pandas-compat:: - **StringMethods.replace** + :meth:`pandas.Series.str.replace` The parameters `case` and `flags` are not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) .. pandas-compat:: - **StringMethods.partition** + :meth:`pandas.Series.str.partition` The parameter `expand` is not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Index([0, 0, 2, 1], dtype='int64') .. pandas-compat:: - **StringMethods.count** + :meth:`pandas.Series.str.count` - `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: dtype: list .. 
pandas-compat:: - **StringMethods.findall** + :meth:`pandas.Series.str.findall` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool .. pandas-compat:: - **StringMethods.endswith** + :meth:`pandas.Series.str.endswith` `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. @@ -4264,7 +4264,7 @@ def match( dtype: bool .. pandas-compat:: - **StringMethods.match** + :meth:`pandas.Series.str.match` Parameters `case` and `na` are currently not supported. The `flags` parameter currently only supports re.DOTALL and diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b3d938829c9..f06e45277e2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2750,7 +2750,7 @@ def reindex( Chrome 200 0.02 .. pandas-compat:: - **DataFrame.reindex** + :meth:`pandas.DataFrame.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -3350,7 +3350,7 @@ def diff(self, periods=1, axis=0): 5 2 5 20 .. pandas-compat:: - **DataFrame.diff** + :meth:`pandas.DataFrame.diff` Diff currently only supports numeric dtype columns. """ @@ -3555,7 +3555,7 @@ def rename( 30 3 6 .. pandas-compat:: - **DataFrame.rename** + :meth:`pandas.DataFrame.rename` * Not Supporting: level @@ -3670,7 +3670,7 @@ def agg(self, aggs, axis=None): ``DataFrame`` is returned. .. pandas-compat:: - **DataFrame.agg** + :meth:`pandas.DataFrame.agg` * Not supporting: ``axis``, ``*args``, ``**kwargs`` @@ -3843,7 +3843,7 @@ def nlargest(self, n, columns, keep="first"): Brunei 434000 12128 BN .. pandas-compat:: - **DataFrame.nlargest** + :meth:`pandas.DataFrame.nlargest` - Only a single column is supported in *columns* """ @@ -3915,7 +3915,7 @@ def nsmallest(self, n, columns, keep="first"): Nauru 337000 182 NR .. 
pandas-compat:: - **DataFrame.nsmallest** + :meth:`pandas.DataFrame.nsmallest` - Only a single column is supported in *columns* """ @@ -3997,7 +3997,7 @@ def transpose(self): a new (ncol x nrow) dataframe. self is (nrow x ncol) .. pandas-compat:: - **DataFrame.transpose, DataFrame.T** + :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T` Not supporting *copy* because default and only behavior is copy=True @@ -4188,7 +4188,7 @@ def merge( from both sides. .. pandas-compat:: - **DataFrame.merge** + :meth:`pandas.DataFrame.merge` DataFrames merges in cuDF result in non-deterministic row ordering. @@ -4263,7 +4263,7 @@ def join( joined : DataFrame .. pandas-compat:: - **DataFrame.join** + :meth:`pandas.DataFrame.join` - *other* must be a single DataFrame for now. - *on* is not supported yet due to lack of multi-index support. @@ -4385,7 +4385,7 @@ def query(self, expr, local_dict=None): 1 2018-10-08 .. pandas-compat:: - **DataFrame.query** + :meth:`pandas.DataFrame.query` One difference from pandas is that ``query`` currently only supports numeric, datetime, timedelta, or bool dtypes. @@ -5447,10 +5447,11 @@ def from_arrow(cls, table): 2 3 6 .. pandas-compat:: - **DataFrame.from_arrow** + `pandas.DataFrame.from_arrow` - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. + This method does not exist in pandas but it is similar to + how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e. + it does not support automatically setting index column(s). """ index_col = None col_index_names = None @@ -5884,7 +5885,7 @@ def quantile( 0.5 2.5 55.0 .. pandas-compat:: - **DataFrame.quantile** + :meth:`pandas.DataFrame.quantile` One notable difference from Pandas is when DataFrame is of non-numeric types and result is expected to be a Series in case of @@ -6174,7 +6175,7 @@ def count(self, axis=0, numeric_only=False): dtype: int64 .. 
pandas-compat:: - **DataFrame.count** + :meth:`pandas.DataFrame.count` Parameters currently not supported are `axis` and `numeric_only`. """ @@ -6412,7 +6413,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): 1 2.0 .. pandas-compat:: - **DataFrame.mode** + :meth:`pandas.DataFrame.transpose` ``axis`` parameter is currently not supported. """ @@ -7594,7 +7595,7 @@ def interleave_columns(self): The interleaved columns as a single column .. pandas-compat:: - **DataFrame.interleave_columns** + `pandas.DataFrame.interleave_columns` This method does not exist in pandas but it can be run as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``. @@ -7696,7 +7697,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 4 5 2 7 3 .. pandas-compat:: - **DataFrame.eval** + :meth:`pandas.DataFrame.eval` * Additional kwargs are not supported. * Bitwise and logical operators are not dtype-dependent. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 802751e47ad..111225a5fc2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where** + :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1641,7 +1641,7 @@ def min( 1 .. pandas-compat:: - **DataFrame.min, Series.min** + :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1689,7 +1689,7 @@ def max( dtype: int64 .. pandas-compat:: - **DataFrame.max, Series.max** + :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1742,7 +1742,7 @@ def all(self, axis=0, skipna=True, **kwargs): dtype: bool .. 
pandas-compat:: - **DataFrame.all, Series.all** + :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` Parameters currently not supported are `axis`, `bool_only`, `level`. @@ -1795,7 +1795,7 @@ def any(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any** + :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` Parameters currently not supported are `axis`, `bool_only`, `level`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d2c75715be2..3f91be71f29 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -744,7 +744,8 @@ def _reduce( Computed {op} of values within each group. .. pandas-compat:: - **{cls}.{op}** + :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, + :meth:`pandas.core.groupby.SeriesGroupBy.{op}` The numeric_only, min_count """ @@ -1482,7 +1483,8 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **GroupBy.apply** + :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, + :meth:`pandas.core.groupby.SeriesGroupBy.apply` cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -2358,7 +2360,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Object shifted within each group. .. pandas-compat:: - **GroupBy.shift** + :meth:`pandas.core.groupby.DataFrameGroupBy.shift`, + :meth:`pandas.core.groupby.SeriesGroupBy.shift` Parameter ``freq`` is unsupported. """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30b68574960..77675edc0f0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -497,7 +497,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty** + :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` If DataFrame/Series contains only `null` values, it is still not considered empty. 
See the example above. @@ -831,7 +831,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace** + :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1372,7 +1372,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum** + :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1433,7 +1433,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product** + :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` Parameters currently not supported are level`, `numeric_only`. """ @@ -1530,7 +1530,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median** + :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` Parameters currently not supported are `level` and `numeric_only`. """ @@ -1586,7 +1586,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std** + :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` Parameters currently not supported are `level` and `numeric_only` @@ -1645,7 +1645,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var** + :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` Parameters currently not supported are `level` and `numeric_only` @@ -1701,7 +1701,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis** + :meth:`pandas.DataFrame.kurtosis` Parameters currently not supported are `level` and `numeric_only` """ @@ -1763,7 +1763,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** + :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` The `axis` parameter is not currently supported. 
""" @@ -2229,7 +2229,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate** + :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -2665,7 +2665,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index** + :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` * Not supporting: kind, sort_remaining=False """ @@ -3497,7 +3497,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values** + :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` * Support axis='index' only. * Not supporting: inplace, kind @@ -4008,7 +4008,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample** + :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -4564,7 +4564,7 @@ def sample( 1 2 4 .. pandas-compat:: - **DataFrame.sample, Series.sample** + :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e12cc3d52fb..c9d24890d15 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -960,7 +960,7 @@ def reindex(self, *args, **kwargs): dtype: int64 .. pandas-compat:: - **Series.reindex** + :meth:`pandas.Series.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -1243,7 +1243,7 @@ def map(self, arg, na_action=None) -> "Series": dtype: int64 .. 
pandas-compat:: - **Series.map** + :meth:`pandas.Series.map` Please note map currently only supports fixed-width numeric type functions. @@ -2094,7 +2094,7 @@ def sort_values( dtype: int64 .. pandas-compat:: - **Series.sort_values** + :meth:`pandas.Series.sort_values` * Support axis='index' only. * The inplace and kind argument is currently unsupported @@ -2550,7 +2550,7 @@ def count(self): 5 .. pandas-compat:: - **Series.count** + :meth:`pandas.Series.count` Parameters currently not supported is `level`. """ @@ -2661,7 +2661,7 @@ def cov(self, other, min_periods=None): -0.015750000000000004 .. pandas-compat:: - **Series.cov** + :meth:`pandas.Series.cov` `min_periods` parameter is not yet supported. """ @@ -3422,7 +3422,7 @@ def rename(self, index=None, copy=True): 'numeric_series' .. pandas-compat:: - **Series.rename** + :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - The ``inplace`` and ``level`` is not supported @@ -4702,7 +4702,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: dtype: object .. pandas-compat:: - **series.DatetimeProperties.strftime** + :meth:`pandas.DatetimeIndex.strftime` The following date format identifiers are not yet supported: ``%c``, ``%x``,``%X`` diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 466d46f7dca..07158e4ee61 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -80,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype: float64 .. pandas-compat:: - **cudf.to_numeric** + :func:`pandas.to_numeric` An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. 
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 21693e106bd..bb153d4b549 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -56,7 +56,7 @@ class ExponentialMovingWindow(_RollingBase): the equivalent pandas method. .. pandas-compat:: - **cudf.core.window.ExponentialMovingWindow** + :meth:`pandas.DataFrame.ewm` The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` are not yet supported. Behavior is defined only for data that begins From d5ab48d4f2586d2e45234463c1bbe877ce76afe8 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 19 Jul 2024 14:32:54 -0400 Subject: [PATCH 11/44] Use workflow branch 24.08 again (#16314) After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again. Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16314 --- .github/workflows/build.yaml | 20 ++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 44 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 22 +++++----- 5 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 937080572ad..2e5959338b0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 1516cb09449..5a937b2f362 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly diff 
--git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1fe64e7f318..d5dfc9e1ff5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: 
"ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf 
secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,7 +174,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,7 +194,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests 
and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 2a8ebd30993..8ca971dc28d 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 73f8d726e77..36c9088d93c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" 
conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} From dc62177a64a5fb4d6521f346ff0f44c2ede740f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 19 Jul 2024 20:17:42 +0100 Subject: [PATCH 12/44] Preserve order in left join for cudf-polars (#16268) Unlike all other joins, polars provides an ordering guarantee for left joins. 
By default libcudf does not, so we need to order the gather maps in this case. While here, because it requires another hard-coding of `int32` for something that should be `size_type`, expose `type_to_id` in cython and plumb it through. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16268 --- python/cudf/cudf/_lib/pylibcudf/join.pyx | 15 +---- .../libcudf/utilities/type_dispatcher.pxd | 7 +++ python/cudf/cudf/_lib/pylibcudf/types.pyx | 7 ++- python/cudf/cudf/_lib/types.pyx | 4 +- .../cudf_polars/containers/column.py | 3 +- python/cudf_polars/cudf_polars/dsl/ir.py | 58 +++++++++++++++++++ python/cudf_polars/tests/test_join.py | 2 +- 7 files changed, 78 insertions(+), 18 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx index 308b1b39291..2ded84d84d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx @@ -10,12 +10,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.pylibcudf.libcudf cimport join as cpp_join from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport ( - data_type, - null_equality, - size_type, - type_id, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_equality from .column cimport Column from .table cimport Table @@ -23,15 +18,11 @@ from .table cimport Table cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column - cdef device_buffer c_empty - cdef size_type size = dereference(gather_map.get()).size() return Column.from_libcudf( move( make_unique[column]( - data_type(type_id.INT32), - size, - 
dereference(gather_map.get()).release(), - move(c_empty), + move(dereference(gather_map.get())), + device_buffer(), 0 ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd new file mode 100644 index 00000000000..890fca3a662 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.types cimport type_id + + +cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil: + cdef type_id type_to_id[T]() diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 6dbb287f3c4..c45c6071bb3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,8 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip @@ -67,3 +68,7 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret + + +SIZE_TYPE = DataType(type_to_id[size_type]()) +SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index fc672caa574..253fdf7b0d9 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -21,8 +21,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -size_type_dtype = np.dtype("int32") - class TypeId(IntEnum): EMPTY = libcudf_types.type_id.EMPTY @@ -150,6 +148,8 @@ datetime_unit_map = { 
TypeId.TIMESTAMP_NANOSECONDS: "ns", } +size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] + class Interpolation(IntEnum): LINEAR = ( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 42aba0fcdc0..02018548b2c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -185,8 +185,7 @@ def nan_count(self) -> int: plc.reduce.reduce( plc.unary.is_nan(self.obj), plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), + plc.types.SIZE_TYPE, ) ).as_py() return 0 diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 01834ab75a5..0b14530e0ed 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -653,6 +653,59 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> list[plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. + + Parameters + ---------- + left_rows + Number of rows in left table + lg + Left gather map + left_policy + Nullify policy for left map + right_rows + Number of rows in right table + rg + Right gather map + right_policy + Nullify policy for right map + + Returns + ------- + list of reordered left and right gather maps. + + Notes + ----- + For a left join, the polars result preserves the order of the + left keys, and is stable wrt the right keys. For all other + joins, there is no order obligation. 
+ """ + dt = plc.interop.to_arrow(plc.types.SIZE_TYPE) + init = plc.interop.from_arrow(pa.scalar(0, type=dt)) + step = plc.interop.from_arrow(pa.scalar(1, type=dt)) + left_order = plc.copying.gather( + plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy + ) + right_order = plc.copying.gather( + plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy + ) + return plc.sorting.stable_sort_by_key( + plc.Table([lg, rg]), + plc.Table([*left_order.columns(), *right_order.columns()]), + [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ).columns() + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -693,6 +746,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) + if how == "left": + # Order of left table is preserved + lg, rg = self._reorder_maps( + left.num_rows, lg, left_policy, right.num_rows, rg, right_policy + ) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 89f6fd3455b..1ffbf3c0ef4 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -53,7 +53,7 @@ def test_join(how, coalesce, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") def test_cross_join(): From cb570fe6d7dc7ebdd6c8c030916ba27bef277b5e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 
10:45:30 -1000 Subject: [PATCH 13/44] Deprecate dtype= parameter in reduction methods (#16313) In terms of pandas alignment, this argument doesn't exist in reduction ops. Additionally, the same result can be easily achieved by calling `astype` after the operation, and it appears libcudf does not support any arbitrary casting to an output type. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16313 --- python/cudf/cudf/_lib/reduce.pyx | 15 ++++++++++----- python/cudf/cudf/core/column/column.py | 11 ++++++++--- python/cudf/cudf/core/column/datetime.py | 9 +++------ python/cudf/cudf/core/column/numerical.py | 17 +++++++++-------- python/cudf/cudf/core/column/numerical_base.py | 11 +++-------- python/cudf/cudf/core/column/timedelta.py | 7 +++---- python/cudf/cudf/tests/test_reductions.py | 15 +++++++++------ 7 files changed, 45 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 56bfa0ba332..64634b7a6f9 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import warnings import cudf from cudf.core.buffer import acquire_spill_lock @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - - col_dtype = ( - dtype if dtype is not None - else incol._reduction_result_dtype(reduction_op) - ) + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. 
.astype) after the operation instead.", + FutureWarning + ) + col_dtype = dtype + else: + col_dtype = incol._reduction_result_dtype(reduction_op) # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9467bbeed15..5e77aa87e4e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: if self.has_nulls(): @@ -1305,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 004a059af95..a4538179415 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -485,13 +485,11 @@ def as_string_column(self) -> cudf.core.column.StringColumn: format = 
format.split(" ")[0] return self.strftime(format) - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -499,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cea68c88c90..ba080863722 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
@@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -684,15 +684,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares": - col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) + return np.result_dtype(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64") - return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -144,32 +144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, 
min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 36d7d9f9614..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -345,12 +345,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 @@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = 
cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8) From 3df4ac28423b99e4dd88570da8d55e2e5af2e1bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:46:18 -1000 Subject: [PATCH 14/44] Remove squeeze argument from groupby (#16312) In pandas, this argument was deprecated in pandas 1.x and removed in pandas 2.x. xref https://github.com/pandas-dev/pandas/pull/33218 Looks like in cudf this argument was never implemented, so to align with pandas, I think it should be OK to just remove this argument Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16312 --- python/cudf/cudf/core/dataframe.py | 2 -- python/cudf/cudf/core/indexed_frame.py | 6 ------ python/cudf/cudf/core/series.py | 2 -- 3 files changed, 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f06e45277e2..8f8baec0af4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4306,7 +4306,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -4317,7 +4316,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 77675edc0f0..576596f6f7d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5249,7 +5249,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -5259,11 +5258,6 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if squeeze is not False: - raise 
NotImplementedError( - "squeeze parameter is not yet implemented" - ) - if not observed: raise NotImplementedError( "observed parameter is not yet implemented" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c9d24890d15..baaa2eb46a1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3368,7 +3368,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -3379,7 +3378,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) From 18f5fe0010fd42f604a340cd025a9ca9e122c6f5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:41:39 -0700 Subject: [PATCH 15/44] Fix polars for 1.2.1 (#16316) I think Polars made a breaking change in a patch release. At least the error we're getting looks like the error from https://github.com/pola-rs/polars/pull/17606. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16316 --- python/cudf_polars/cudf_polars/utils/versions.py | 1 + python/cudf_polars/tests/test_groupby.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index a9ac14c25aa..9807cffb384 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -15,6 +15,7 @@ POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") 
POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b650fee5079..a75825ef3d3 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -157,7 +157,18 @@ def test_groupby_nan_minmax_raises(op): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.parametrize("key", [1, pl.col("key1")]) +@pytest.mark.parametrize( + "key", + [ + pytest.param( + 1, + marks=pytest.mark.xfail( + versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" + ), + ), + pl.col("key1"), + ], +) @pytest.mark.parametrize( "expr", [ From fa0d89d9b4b4152b919999b5f01b1e68407469c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:46:28 -1000 Subject: [PATCH 16/44] Clean unneeded/redudant dtype utils (#16309) * Replace `min_scalar_type` with `min_signed_type` (the former just called the latter) * Replace `numeric_normalize_types` with `find_common_dtype` followed by a column `astype` * Removed `_NUMPY_SCTYPES` with just hardcoding the integer/floating types or using `np.integer`/`np.floating` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16309 --- python/cudf/cudf/core/column/column.py | 6 +++--- python/cudf/cudf/core/column/numerical.py | 12 +++++++---- python/cudf/cudf/core/dataframe.py | 22 ++++--------------- python/cudf/cudf/core/index.py | 22 +++++++++---------- python/cudf/cudf/utils/dtypes.py | 26 ++++++----------------- 5 files changed, 32 insertions(+), 56 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5e77aa87e4e..89f0f79cb7c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -71,7 +71,7 @@ 
get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -1356,7 +1356,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1396,7 +1396,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ba080863722..b55284f1aff 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -29,10 +29,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -517,11 +517,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: diff --git a/python/cudf/cudf/core/dataframe.py 
b/python/cudf/cudf/core/dataframe.py index 8f8baec0af4..904bd4ccb2e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -83,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -103,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. @@ -8304,7 +8290,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. 
The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4164f981fca..cd52a34e35e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -357,12 +355,10 @@ def _data(self): @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -1601,9 +1597,13 @@ def append(self, other): f"either one of them to same dtypes." 
) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index af912bee342..69c268db149 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -89,10 +89,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -330,32 +320,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, 
np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype From 910989eb8fb87b2e896aa032260705c27cce71e0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jul 2024 15:48:37 -0600 Subject: [PATCH 17/44] Rename gather/scatter benchmarks to clarify coalesced behavior. (#16083) The benchmark names `coalesce_x` and `coalesce_o` are not very clear. This PR renames them to `coalesced` and `shuffled`. This was discussed with @GregoryKimball. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16083 --- cpp/benchmarks/copying/gather.cu | 6 +++--- cpp/benchmarks/copying/scatter.cu | 6 +++--- cpp/benchmarks/lists/copying/scatter_lists.cu | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index eeb0149fb3a..985166f7298 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state) ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ ->UseManualTime(); -GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +GBM_BENCHMARK_DEFINE(double_coalesced, double, true); +GBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a521dc82739..c27480b69f4 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {1, 8}}) \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index dbc3234dabf..570decf410f 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true); -SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); From 6e37afc7c9e177b307c41950e52453bd5906af44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:52:27 -1000 Subject: [PATCH 18/44] Make __bool__ raise for more cudf objects (#16311) To match pandas, this PR makes `DataFrame`, `MultiIndex` and `RangeIndex` raise on `__bool__`. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16311 --- python/cudf/cudf/core/_base_index.py | 6 ++++++ python/cudf/cudf/core/frame.py | 6 ++++++ python/cudf/cudf/core/single_column_frame.py | 6 ------ python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ python/cudf/cudf/tests/test_index.py | 9 +++++++++ python/cudf/cudf/tests/test_multiindex.py | 9 +++++++++ 7 files changed, 40 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 479f87bb78b..657acc41b18 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -62,6 +62,12 @@ def copy(self, deep: bool = True) -> Self: def __len__(self): raise NotImplementedError + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + @property def size(self): # The size of an index is always its length irrespective of dimension. 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 111225a5fc2..e3a2e840902 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1587,6 +1587,12 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + # Reductions @classmethod @_performance_tracking diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 04c7db7a53c..7efe13d9b45 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -91,12 +91,6 @@ def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - @property # type: ignore @_performance_tracking def _num_columns(self) -> int: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index a22a627523f..0525b02b698 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1617,7 +1617,7 @@ def test_csv_reader_partial_dtype(dtype): StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] ) - assert names_df == header_df + assert_eq(names_df, header_df) assert all(names_df.dtypes == ["int16", "int64"]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2009fc49ce5..53ed5d728cb 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11100,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy(): data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " Date: Fri, 19 Jul 2024 11:55:40 -1000 Subject: [PATCH 19/44] Align more DataFrame APIs with pandas (#16310) I have a script that did some signature comparisons between `pandas.DataFrame` and `cudf.DataFrame` API and it appears some signatures have changed between the pandas 1.x and 2.x release. The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). The APIs affected are: * `__init__` * `__array__` * `__arrow_c_stream__` * `to_dict` * `where` * `add_prefix` * `join` * `apply` * `to_records` * `from_records` * `unstack` * `pct_change` * `sort_values` Marking as breaking as I ensured some added keywords are in the same positions as pandas and therefore might break users who are using purely positional arguments. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16310 --- python/cudf/cudf/core/dataframe.py | 169 +++++++++++++++++++++++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 13 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 32 ++++- 5 files changed, 202 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 904bd4ccb2e..7e07078c95b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -680,8 +683,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -1524,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. 
+ + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2235,6 +2260,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2268,6 +2294,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). + Returns ------- dict, list or collections.abc.Mapping @@ -2349,7 +2382,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -3004,7 +3037,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3614,7 +3652,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change 
to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -4230,6 +4270,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4243,6 +4284,16 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. Returns ------- @@ -4256,6 +4307,10 @@ def join( """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4404,7 +4459,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4432,6 +4496,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). 
If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. Examples -------- @@ -4582,13 +4665,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs) @@ -5489,7 +5576,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table. @@ -5579,18 +5666,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. 
+ index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members) @@ -5603,7 +5708,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame. @@ -5613,13 +5727,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used. + exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """ + if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." 
+ ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" @@ -7344,9 +7477,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking @@ -7392,7 +7525,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -7417,6 +7555,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. 
Returns ------- @@ -7462,7 +7603,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e3a2e840902..c82e073d7b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 576596f6f7d..60cd142db4b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3464,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3479,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. 
+ Currently not supported. Returns ------- @@ -3518,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. + Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index baaa2eb46a1..b1e63806934 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2063,6 +2063,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2076,6 +2077,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. 
This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2107,6 +2116,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -3429,7 +3439,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3527,7 +3539,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3552,6 +3569,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. 
Returns ------- @@ -3596,11 +3616,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self( From 57ed7fce6742abc96a8fd65216f032bad5937a2f Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:24:55 -0500 Subject: [PATCH 20/44] Add tests for `pylibcudf` binaryops (#15470) This PR implements a more general approach to testing binaryops that originally came up in https://github.com/rapidsai/cudf/pull/15279. This PR can possibly supersede that one. 
Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15470 --- cpp/include/cudf/binaryop.hpp | 11 + cpp/src/binaryop/binaryop.cpp | 7 +- .../binaryop/binop-verify-input-test.cpp | 4 +- python/cudf/cudf/_lib/pylibcudf/binaryop.pxd | 9 + python/cudf/cudf/_lib/pylibcudf/binaryop.pyx | 35 + .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd | 39 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 10 + .../cudf/pylibcudf_tests/test_binaryops.py | 786 ++++++++++++++++++ 8 files changed, 889 insertions(+), 12 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_binaryops.py diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 22dad11e109..c74c91e39c2 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, namespace binops { +/** + * @brief Returns true if the binary operator is supported for the given input types. 
+ * + * @param out The output data type + * @param lhs The left-hand cudf::data_type + * @param rhs The right-hand cudf::data_type + * @param op The binary operator + * @return true if the binary operator is supported for the given input types + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + /** * @brief Computes output valid mask for op between a column and a scalar * diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 8ac1491547d..3ac8547baad 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -50,6 +50,11 @@ namespace cudf { namespace binops { +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op); +} + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -194,7 +199,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and output_type.id() == type_id::STRING and diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 1346dcd4666..def6e94452e 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
* Copyright 2018 Christian Noboa Mardini @@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) EXPECT_THROW(cudf::binary_operation( lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)), - cudf::logic_error); + std::invalid_argument); } diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd index 9a8c8e49dcf..2411e28ac66 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator from .column cimport Column @@ -22,3 +24,10 @@ cpdef Column binary_operation( binary_operator op, DataType output_type ) + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +) diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx index c1d669c3c1c..44d9f4ad04a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx @@ -2,6 +2,7 @@ from cython.operator import dereference +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -84,3 +85,37 @@ cpdef Column binary_operation( raise ValueError(f"Invalid arguments {lhs} and {rhs}") return Column.from_libcudf(move(result)) + + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +): + """Check if an operation is supported for the given data types. + + For details, see :cpp:func::is_supported_operation`. + + Parameters + ---------- + out : DataType + The output data type. + lhs : DataType + The left hand side data type. + rhs : DataType + The right hand side data type. + op : BinaryOperator + The operation to check. 
+ Returns + ------- + bool + True if the operation is supported, False otherwise + """ + + return cpp_binaryop.is_supported_operation( + out.c_obj, + lhs.c_obj, + rhs.c_obj, + op + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index 0eda7d34ff9..b34fea6a775 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.exception_handler cimport cudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -19,9 +21,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: TRUE_DIV FLOOR_DIV MOD + PMOD PYMOD POW INT_POW + LOG_BASE + ATAN2 + SHIFT_LEFT + SHIFT_RIGHT + SHIFT_RIGHT_UNSIGNED + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR EQUAL NOT_EQUAL LESS @@ -29,38 +42,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: LESS_EQUAL GREATER_EQUAL NULL_EQUALS + NULL_MAX + NULL_MIN NULL_NOT_EQUALS - BITWISE_AND - BITWISE_OR - BITWISE_XOR - LOGICAL_AND - LOGICAL_OR GENERIC_BINARY + NULL_LOGICAL_AND + NULL_LOGICAL_OR + INVALID_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) 
except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except + + ) except +cudf_exception_handler + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: + cdef bool is_supported_operation( + data_type output_type, + data_type lhs_type, + data_type rhs_type, + binary_operator op + ) except +cudf_exception_handler diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e029edfa2ed..ed2c5ca06c9 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -111,6 +111,16 @@ def _make_fields_nullable(typ): lhs = rhs.cast(lhs_type) if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): + lhs_nans = pa.compute.is_nan(lhs) + rhs_nans = pa.compute.is_nan(rhs) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lhs = lhs.filter(mask) + rhs = rhs.filter(mask) + np.testing.assert_array_almost_equal(lhs, rhs) else: assert lhs.equals(rhs) diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py new file mode 100644 index 00000000000..a83caf39ead --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py @@ -0,0 +1,786 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import math + +import numpy as np +import pyarrow as pa +import pytest +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def idfn(param): + ltype, rtype, outtype, plc_op, _ = param + params = (plc_op.name, ltype, rtype, outtype) + return "-".join(map(str, params)) + + +@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"]) +def nulls(request): + return request.param + + +def make_col(dtype, nulls): + if dtype == "int64": + data = [1, 2, 3, 4, 5] + pa_type = pa.int64() + elif dtype == "uint64": + data = [1, 2, 3, 4, 5] + pa_type = pa.uint64() + elif dtype == "float64": + data = [1.0, 2.0, 3.0, 4.0, 5.0] + pa_type = pa.float64() + elif dtype == "bool": + data = [True, False, True, False, True] + pa_type = pa.bool_() + elif dtype == "timestamp64[ns]": + data = [ + np.datetime64("2022-01-01"), + np.datetime64("2022-01-02"), + np.datetime64("2022-01-03"), + np.datetime64("2022-01-04"), + np.datetime64("2022-01-05"), + ] + pa_type = pa.timestamp("ns") + elif dtype == "timedelta64[ns]": + data = [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + pa_type = pa.duration("ns") + else: + raise ValueError("Unsupported dtype") + + if nulls: + data[3] = None + + return pa.array(data, type=pa_type) + + +@pytest.fixture +def pa_data(request, nulls): + ltype, rtype, outtype = request.param + values = make_col(ltype, nulls), make_col(rtype, nulls), outtype + return values + + +@pytest.fixture +def plc_data(pa_data): + lhs, rhs, outtype = pa_data + return ( + plc.interop.from_arrow(lhs), + plc.interop.from_arrow(rhs), + plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))), + ) + + +@pytest.fixture +def tests(request, nulls): + ltype, rtype, py_outtype, plc_op, py_op = request.param + pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls) + plc_lhs, plc_rhs = ( + plc.interop.from_arrow(pa_lhs), + plc.interop.from_arrow(pa_rhs), + ) 
+ plc_dtype = plc.interop.from_arrow( + pa.from_numpy_dtype(np.dtype(py_outtype)) + ) + return ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_dtype, + py_op, + plc_op, + ) + + +def custom_pyop(func): + def wrapper(x, y): + x = x.to_pylist() + y = y.to_pylist() + + def inner(x, y): + if x is None or y is None: + return None + return func(x, y) + + return pa.array([inner(x, y) for x, y in zip(x, y)]) + + return wrapper + + +@custom_pyop +def py_floordiv(x, y): + return x // y + + +@custom_pyop +def py_pmod(x, y): + return (x % y + y) % y + + +@custom_pyop +def py_mod(x, y): + return x % y + + +@custom_pyop +def py_atan2(x, y): + return math.atan2(x, y) + + +@custom_pyop +def py_shift_right_unsigned(x, y): + unsigned_x = np.uint32(x) + result = unsigned_x >> y + return result + + +@pytest.mark.parametrize( + "tests", + [ + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + 
"datetime64[ns]", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.INT_POW, + 
pa.compute.power, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "float64", + "float64", + 
plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "bool", + 
plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + 
"int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.GENERIC_BINARY, + None, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INVALID_BINARY, + None, + ), + ], + indirect=True, + ids=idfn, +) +def test_binaryops(tests): + ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_outtype, + py_op, + plc_op, + ) = tests + + def get_result(): + return plc.binaryop.binary_operation( + plc_lhs, + plc_rhs, + plc_op, + plc_outtype, + ) + + if not plc.binaryop.is_supported_operation( + plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op + ): + with pytest.raises(TypeError): + get_result() + else: + expect = py_op(pa_lhs, pa_rhs).cast(py_outtype) + got = get_result() + assert_column_eq(expect, got) From 7d3083254c0503b07f82af32188120f42acef860 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:48:39 -1000 Subject: [PATCH 21/44] Replace np.isscalar/issubdtype checks with is_scalar/.kind checks (#16275) * `is_scalar` also handles cudf.Scalars which should be handled internally * `issubdtype` can largely be replaced by checking the `.kind` attribute on the dtype Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16275 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/column.py | 10 +++---- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/lists.py | 9 ++++--- python/cudf/cudf/core/column/numerical.py | 28 +++++++------------- python/cudf/cudf/core/join/_join_helpers.py | 29 ++++++--------------- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/testing/testing.py | 10 +++---- 
python/cudf/cudf/utils/dtypes.py | 4 +-- 9 files changed, 37 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6003a0f6aea..18ab32d2c9e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -47,7 +47,7 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if isinstance(other, float) and not np.isnan(other): + if isinstance(other, (float, np.floating)) and not np.isnan(other): try: is_safe = source_dtype.type(other) == other except OverflowError: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 89f0f79cb7c..da735c22c52 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1458,9 +1458,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2312,9 +2313,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a4538179415..73902789c11 100644 --- 
a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -639,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 46b844413f7..1b7cd95b3d0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b55284f1aff..5e07bbab40c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -225,25 +225,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. 
if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) - and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) - ) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__", diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,25 @@ def _match_join_keys( ) if ( - np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( 
- np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b1e63806934..eb077179562 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -213,7 +213,7 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and 
np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 69c268db149..c0de5274742 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -359,10 +359,10 @@ def min_column_type(x, expected_type): if x.null_count == len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) From 4c46628eaf7ba16a2a181ceb3311f315cd4932dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:51:07 -1000 Subject: [PATCH 22/44] Mark cudf._typing as a typing module in ruff (#16318) Additionally breaks up the prior, single-line of `select` rules that are enabled. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16318 --- pyproject.toml | 64 ++++++++++++++++++++++++++++++- python/cudf/cudf/core/resample.py | 6 ++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f59864894b..e15cb7b3cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,69 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] +typing-modules = ["cudf._typing"] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # no-blank-line-before-function + "D201", + # one-blank-line-after-class + "D204", + # indent-with-spaces + "D206", + # under-indentation + "D207", + # over-indentation + "D208", + # new-line-after-last-paragraph + "D209", + # surrounding-whitespace + "D210", + # blank-line-before-class + "D211", + # section-not-over-indented + "D214", + # section-underline-not-over-indented + "D215", + # triple-single-quotes + "D300", + # escape-sequence-in-docstring + "D301", + # first-line-capitalized + "D403", + # capitalize-section-name + "D405", + # new-line-after-section-name + "D406", + # dashed-underline-after-section + "D407", + # section-underline-after-name + "D408", + # section-underline-matches-section-length + "D409", + # no-blank-line-after-section + "D410", + # no-blank-line-before-section + "D411", + # blank-lines-between-header-and-content + "D412", + # empty-docstring-section + "D414", + # overload-with-docstring + "D418", + # flake8-type-checking + "TCH", + # flake8-future-annotations + "FA", + # non-pep585-annotation + 
"UP006", + # non-pep604-annotation + "UP007" +] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index cdd4ec6f8e5..4e0c5bd86b9 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -13,9 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import pickle import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,7 +25,6 @@ import cudf import cudf._lib.labeling import cudf.core.index -from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -31,6 +32,9 @@ _Grouping, ) +if TYPE_CHECKING: + from cudf._typing import DataFrameOrSeries + class _Resampler(GroupBy): grouping: "_ResampleGrouping" From 5dde41d7f7533180ecd355bac248a7ed18adcc10 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:08:36 -1000 Subject: [PATCH 23/44] Replace is_float/integer_dtype checks with .kind checks (#16261) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16261 --- python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 19 +++---------- python/cudf/cudf/core/column/column.py | 29 ++++++++++---------- python/cudf/cudf/core/column/decimal.py | 4 +-- python/cudf/cudf/core/column/numerical.py | 13 +++------ python/cudf/cudf/core/index.py | 13 +++++---- python/cudf/cudf/core/indexing_utils.py | 8 ++---- python/cudf/cudf/core/series.py | 7 ++--- python/cudf/cudf/core/single_column_frame.py | 3 +- 
python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/utils/dtypes.py | 28 +++++++++---------- 11 files changed, 52 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index d97e9c815b6..294ae2fd985 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -90,7 +90,7 @@ def is_integer(obj): bool """ if isinstance(obj, cudf.Scalar): - return pd.api.types.is_integer_dtype(obj.dtype) + return obj.dtype.kind in "iu" return pd.api.types.is_integer(obj) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 657acc41b18..c38352009de 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,14 +19,7 @@ ) from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import ( - is_integer, - is_integer_dtype, - is_list_like, - is_scalar, - is_signed_integer_dtype, - is_unsigned_integer_dtype, -) +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.errors import MixedTypeError @@ -621,12 +614,8 @@ def union(self, other, sort=None): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. raise MixedTypeError("Cannot perform union with mixed types") - if ( - is_signed_integer_dtype(self.dtype) - and is_unsigned_integer_dtype(other.dtype) - ) or ( - is_unsigned_integer_dtype(self.dtype) - and is_signed_integer_dtype(other.dtype) + if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( + self.dtype.kind == "u" and other.dtype.kind == "i" ): # signed + unsigned types will result in # mixed type for union in pandas. @@ -2103,7 +2092,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. 
This check will be removed in future release. - if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index da735c22c52..32e6aade65b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2219,25 +2219,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + cudf.dtype(arbitrary.type.to_pandas_dtype()) + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index a63055ed527..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from 
cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -150,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5e07bbab40c..f9404eb3b40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,12 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import ( - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -249,7 +244,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " @@ -260,8 +255,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd52a34e35e..ae20fcd5d9c 100644 --- 
a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1456,18 +1456,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 9c81b0eb607..a0089242909 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -8,11 +8,7 @@ from typing_extensions import TypeAlias import cudf -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_integer, - is_integer_dtype, -) +from cudf.api.types import _is_scalar_or_zero_d_array, is_integer from cudf.core.copy_types import BooleanMask, GatherMap @@ -233,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index eb077179562..d8dbaa897e7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -24,7 +24,6 @@ _is_scalar_or_zero_d_array, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -356,12 +355,10 @@ def _loc_to_iloc(self, arg): ) if not 
_is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7efe13d9b45..b93528f9693 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,7 +12,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -352,7 +351,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 53ed5d728cb..e2ce5c03b70 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10833,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. 
expectation = contains is np.nan and (name is None or name is np.nan) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c0de5274742..b0788bcc0fc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -11,6 +13,9 @@ import cudf +if TYPE_CHECKING: + from cudf._typing import DtypeObj + """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special handling is required when converting a Boolean column into arrow. @@ -568,25 +573,18 @@ def _dtype_pandas_compatible(dtype): return dtype -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. 
""" - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype From e169e8e4273e4d317e3f27c810c5b137dd75adb3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:36:03 -0700 Subject: [PATCH 24/44] Implement read_csv in cudf-polars using pylibcudf (#16307) Replace cudf-classic with pylibcudf for CSV reading in cudf-polars Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16307 --- python/cudf_polars/cudf_polars/dsl/ir.py | 50 ++++++++++++------------ python/cudf_polars/tests/test_scan.py | 38 ++++++++++++++++++ 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 0b14530e0ed..a84fe73810e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -242,10 +242,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - dtype_map = { - name: 
cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] - for name, typ in self.schema.items() - } parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) quote = chr(parse_options["quote_char"]) @@ -280,31 +276,37 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pieces = [] for p in self.paths: skiprows = self.reader_options["skip_rows"] - # TODO: read_csv expands globs which we should not do, - # because polars will already have handled them. path = Path(p) with path.open() as f: while f.readline() == "\n": skiprows += 1 - pieces.append( - cudf.read_csv( - path, - sep=sep, - quotechar=quote, - lineterminator=eol, - names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtype=dtype_map, - ) + tbl_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([path]), + delimiter=sep, + quotechar=quote, + lineterminator=eol, + col_names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtypes=self.schema, + ) + pieces.append(tbl_w_meta) + tables, colnames = zip( + *( + (piece.tbl, piece.column_names(include_children=False)) + for piece in pieces ) - df = DataFrame.from_cudf(cudf.concat(pieces)) + ) + df = DataFrame.from_table( + plc.concatenate.concatenate(list(tables)), + colnames[0], + ) elif self.typ == "parquet": cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index d0c41090433..0981a96a34a 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os + import pytest import polars as 
pl @@ -129,6 +131,42 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): assert_gpu_result_equal(q) +@pytest.mark.parametrize( + "filename,glob", + [ + (["test1.csv", "test2.csv"], True), + ("test*.csv", True), + # Make sure we don't expand glob when + # trying to read a file like test*.csv + # when glob=False + ("test*.csv", False), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob): + with (tmp_path / "test1.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test2.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test*.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + os.chdir(tmp_path) + q = pl.scan_csv(filename, glob=glob) + + assert_gpu_result_equal(q) + + +def test_scan_csv_multi_differing_colnames(tmp_path): + with (tmp_path / "test1.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test2.csv").open("w") as f: + f.write("""abc,def,ghi\n1,2\n3,4,5""") + q = pl.scan_csv( + [tmp_path / "test1.csv", tmp_path / "test2.csv"], + ) + with pytest.raises(pl.exceptions.ComputeError): + q.explain() + + def test_scan_csv_skip_after_header_not_implemented(tmp_path): with (tmp_path / "test.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") From 535db9b26ed1a57e4275f4a6f11b04ebeee21248 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:28:14 -0700 Subject: [PATCH 25/44] Deprecate Arrow support in I/O (#16132) Contributes to https://github.com/rapidsai/cudf/issues/15193 Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16132 --- .../cudf/_lib/pylibcudf/io/datasource.pyx | 10 +- python/cudf/cudf/io/csv.py | 2 +- 
python/cudf/cudf/io/orc.py | 33 +++-- python/cudf/cudf/io/parquet.py | 40 ++++-- .../io/test_source_sink_info.py | 21 +-- python/cudf/cudf/tests/test_csv.py | 5 +- python/cudf/cudf/tests/test_gcs.py | 3 +- python/cudf/cudf/tests/test_parquet.py | 19 +-- python/cudf/cudf/tests/test_s3.py | 136 ++++++++++-------- python/cudf/cudf/utils/ioutils.py | 78 ++++++++-- python/cudf/cudf/utils/utils.py | 26 ++++ .../dask_cudf/dask_cudf/io/tests/test_s3.py | 6 +- 12 files changed, 247 insertions(+), 132 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx index aa7fa0efdaf..8f265f585de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx @@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +import warnings + cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: @@ -16,10 +18,16 @@ cdef class Datasource: cdef class NativeFileDatasource(Datasource): - def __cinit__(self, NativeFile native_file,): + def __cinit__(self, NativeFile native_file): cdef shared_ptr[CRandomAccessFile] ra_src + warnings.warn( + "Support for reading pyarrow's NativeFile is deprecated " + "and will be removed in a future release of cudf.", + FutureWarning, + ) + ra_src = native_file.get_random_access_file() self.c_datasource.reset(new arrow_io_source(ra_src)) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e909d96309e..0f2820a01e9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -50,7 +50,7 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 
7082a85237a..289292b5182 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,6 +10,7 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils +from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -280,7 +281,7 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -320,6 +321,9 @@ def read_orc( ) filepaths_or_buffers = [] + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -360,17 +364,24 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + return DataFrame._from_data( + *liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, + ) ) - ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 02b26ea1c01..0f0a240b5d0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -23,6 +24,7 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import 
_performance_tracking +from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=True, + use_python_file_object=None, open_file_options=None, storage_options=None, bytes_per_thread=None, @@ -532,7 +534,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=True, + use_python_file_object=None, categorical_partitions=True, open_file_options=None, bytes_per_thread=None, @@ -615,6 +617,9 @@ def read_parquet( row_groups=row_groups, fs=fs, ) + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, @@ -662,19 +667,26 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - **kwargs, - ) + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if projected_columns: diff --git 
a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 287dd8f21c8..438c482b77a 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ -2,11 +2,9 @@ import io -import pyarrow as pa import pytest import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) @@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink): """ Skip invalid sinks for SinkInfo """ - if io_class is plc.io.SinkInfo and isinstance( - sink, (bytes, NativeFileDatasource) - ): - pytest.skip(f"{sink} is not a valid input for SinkInfo") + if io_class is plc.io.SinkInfo and isinstance(sink, bytes): + pytest.skip("bytes is not a valid input for SinkInfo") @pytest.mark.parametrize( @@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink): "a.txt", b"hello world", io.BytesIO(b"hello world"), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) def test_source_info_ctor(io_class, source, tmp_path): @@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path): @pytest.mark.parametrize( "sources", [ + ["a.txt"], + [b"hello world"], + [io.BytesIO(b"hello world")], ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - ], ], ) def test_source_info_ctor_multiple(io_class, sources, tmp_path): @@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - "awef.txt", - b"hello world", - ], ], ) def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): diff --git 
a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0525b02b698..6a21cb1b9d7 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf): # Arrow FileSystem interface expect = cudf.read_csv(path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(path) as fil: + got = cudf.read_csv(fil) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index fc22d8bc0ea..28fdfb5c2f1 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -46,7 +46,8 @@ def mock_size(*args): # use_python_file_object=True, because the pyarrow # `open_input_file` command will fail (since it doesn't # use the monkey-patched `open` definition) - got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) + with pytest.warns(FutureWarning): + got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) assert_eq(pdf, got) # AbstractBufferedFile -> PythonFile conversion diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ecb7fd44422..f2820d9c112 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) with fs.open_input_file(path) as fil: - got = cudf.read_parquet(fil) + with pytest.warns(FutureWarning): + got = cudf.read_parquet(fil) assert_eq(expect, got) @@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object( fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) # Pass open fsspec file - with 
fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index a44bf791767..3ae318d3bf5 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - use_python_file_object=False, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + use_python_file_object=False, + ) assert_eq(pdf, got) # Use Arrow PythonFile object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + use_python_file_object=True, + ) assert_eq(pdf, got) @@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): fs = pa_fs.S3FileSystem( 
endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_csv(fil) assert_eq(pdf, got) @@ -184,17 +187,18 @@ def test_read_csv_byte_range( # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + use_python_file_object=use_python_file_object, + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -241,18 +245,19 @@ def test_read_parquet( # Check direct path handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - open_file_options=( - {"precache_options": {"method": precache}} - if use_python_file_object - else None - ), - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got1 = cudf.read_parquet( + f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, 
got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with 
s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 76c7f2bfdb8..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable import fsspec import fsspec.implementations.local @@ -15,6 +16,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -24,7 +26,6 @@ except ImportError: fsspec_parquet = None - _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -86,7 +87,7 @@ 1 20 rapids 2 30 ai """.format(remote_data_sources=_docstring_remote_sources) -doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) +doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ Read a Parquet file's metadata and schema @@ 
-174,15 +175,23 @@ columns are also loaded. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. Setting this argument to `False` - will require the entire file to be copied to host memory, and is highly - discouraged. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. open_file_options : dict, optional Dictionary of key-value pairs to pass to the function used to open remote files. By default, this will be `fsspec.parquet.open_parquet_file`. To deactivate optimized precaching, set the "method" to `None` under the "precache_options" key. Note that the `open_file_func` key can also be used to specify a custom file-open function. + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. bytes_per_thread : int, default None Determines the number of bytes to be allocated per thread to read the files in parallel. When there is a file of large size, we get slightly @@ -468,8 +477,12 @@ If True, use row index if available for faster seeking. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger ORC files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. 
storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -934,7 +947,7 @@ -------- cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame. """ -doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf) +doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf) _docstring_to_hdf = """ Write the contained data to an HDF5 file using HDFStore. @@ -1006,7 +1019,7 @@ cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format. cudf.DataFrame.to_feather : Write out feather-format for DataFrames. """ -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load an feather object from the file path, returning a DataFrame. @@ -1188,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. 
For HTTP(S) URLs the key-value @@ -1409,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1430,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. If False, @@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. 
read_csv) default of True + # if no use_python_file_object option is specified by the user + # for now (note: this is different from the default for this + # function of False) + # TODO: when non-pyarrow file reading perf is good enough + # we can default this to False + use_python_file_object = True + + if open_file_options is not None: + warnings.warn( + "The 'open_file_options' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + if isinstance(path_or_data, str): # Get a filesystem object if one isn't already available paths = [path_or_data] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7347ec7866a..c9b343e0f9f 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,6 +6,7 @@ import os import traceback import warnings +from contextlib import contextmanager import numpy as np import pandas as pd @@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col + + +@contextmanager +def maybe_filter_deprecation( + condition: bool, message: str, category: type[Warning] +): + """Conditionally filter a warning category. 
+ + Parameters + ---------- + condition + If true, filter the warning + message + Message to match, passed to :func:`warnings.filterwarnings` + category + Category of warning, passed to :func:`warnings.filterwarnings` + """ + with warnings.catch_warnings(): + if condition: + warnings.filterwarnings( + "ignore", + message, + category=category, + ) + yield diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index a67404da4fe..3947c69aaa5 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options): storage_options=s3so, open_file_options=open_file_options, ) - assert df.a.sum().compute() == 10 - assert df.b.sum().compute() == 9 + with pytest.warns(FutureWarning): + assert df.a.sum().compute() == 10 + with pytest.warns(FutureWarning): + assert df.b.sum().compute() == 9 From 75335f6af51bde6be68c1fb0a6caa8030b9eda3e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:21:27 -0700 Subject: [PATCH 26/44] Report number of rows per file read by PQ reader when no row selection and fix segfault in chunked PQ reader when skip_rows > 0 (#16195) Closes #15389 Closes #16186 This PR adds the capability to calculate and report the number of rows read from each data source into the table returned by the Parquet reader (both chunked and normal). The returned vector of counts is only valid (non-empty) when row selection (AST filter) is not being used. This PR also fixes a segfault in chunked parquet reader when skip_rows > 0 and the number of passes > 1. This segfault was being caused by a couple of arithmetic errors when computing the (start_row, num_row) for row_group_info, pass, column chunk descriptor structs. Both changes were added to this PR as changes and the gtests from the former work were needed to implement the segfault fix. 
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16195 --- cpp/include/cudf/io/types.hpp | 3 + cpp/src/io/parquet/reader_impl.cpp | 86 +++- cpp/src/io/parquet/reader_impl.hpp | 31 +- cpp/src/io/parquet/reader_impl_chunking.cu | 53 ++- cpp/src/io/parquet/reader_impl_chunking.hpp | 6 + cpp/src/io/parquet/reader_impl_helpers.cpp | 32 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 20 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 19 +- cpp/tests/io/parquet_chunked_reader_test.cu | 385 ++++++++++++++++++ cpp/tests/io/parquet_reader_test.cpp | 203 +++++++++ .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 1 + 11 files changed, 796 insertions(+), 43 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 0c96268f6c7..431a5e7be83 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -277,6 +277,9 @@ struct column_name_info { struct table_metadata { std::vector schema_info; //!< Detailed name information for the entire output hierarchy + std::vector num_rows_per_source; //!< Number of rows read from each data source. + //!< Currently only computed for Parquet readers if no + //!< AST filters being used. Empty vector otherwise. 
std::map user_data; //!< Format-dependent metadata of the first input //!< file as key-values pairs (deprecated) std::vector> diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f705f6626e7..68ec61ead0a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -26,6 +26,7 @@ #include +#include #include #include @@ -549,7 +550,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) out_columns.reserve(_output_buffers.size()); // no work to do (this can happen on the first pass if we have no rows to read) - if (!has_more_work()) { return finalize_output(out_metadata, out_columns); } + if (!has_more_work()) { + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // Empty dataframe case: Simply initialize to a list of zeros + out_metadata.num_rows_per_source = + std::vector(_file_itm_data.num_rows_per_source.size(), 0); + } + + // Finalize output + return finalize_output(mode, out_metadata, out_columns); + } auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -585,11 +596,80 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) } } + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // For chunked reading, compute the output number of rows per source + if (mode == read_mode::CHUNKED_READ) { + out_metadata.num_rows_per_source = + calculate_output_num_rows_per_source(read_info.skip_rows, read_info.num_rows); + } + // Simply move the number of rows per file if reading all at once + else { + // Move is okay here as we are reading in one go. + out_metadata.num_rows_per_source = std::move(_file_itm_data.num_rows_per_source); + } + } + // Add empty columns if needed. Filter output columns based on filter. 
- return finalize_output(out_metadata, out_columns); + return finalize_output(mode, out_metadata, out_columns); +} + +std::vector reader::impl::calculate_output_num_rows_per_source(size_t const chunk_start_row, + size_t const chunk_num_rows) +{ + // Handle base cases. + if (_file_itm_data.num_rows_per_source.size() == 0) { + return {}; + } else if (_file_itm_data.num_rows_per_source.size() == 1) { + return {chunk_num_rows}; + } + + std::vector num_rows_per_source(_file_itm_data.num_rows_per_source.size(), 0); + + // Subtract global skip rows from the start_row as we took care of that when computing + // _file_itm_data.num_rows_per_source + auto const start_row = chunk_start_row - _file_itm_data.global_skip_rows; + auto const end_row = start_row + chunk_num_rows; + CUDF_EXPECTS(start_row <= end_row and end_row <= _file_itm_data.global_num_rows, + "Encountered invalid output chunk row bounds."); + + // Copy reference to a const local variable for better readability + auto const& partial_sum_nrows_source = _file_itm_data.exclusive_sum_num_rows_per_source; + + // Binary search start_row and end_row in exclusive_sum_num_rows_per_source vector + auto const start_iter = + std::upper_bound(partial_sum_nrows_source.cbegin(), partial_sum_nrows_source.cend(), start_row); + auto const end_iter = + (end_row == _file_itm_data.global_skip_rows + _file_itm_data.global_num_rows) + ? 
partial_sum_nrows_source.cend() - 1 + : std::upper_bound(start_iter, partial_sum_nrows_source.cend(), end_row); + + // Compute the array offset index for both iterators + auto const start_idx = std::distance(partial_sum_nrows_source.cbegin(), start_iter); + auto const end_idx = std::distance(partial_sum_nrows_source.cbegin(), end_iter); + + CUDF_EXPECTS(start_idx <= end_idx, + "Encountered invalid source files indexes for output chunk row bounds"); + + // If the entire chunk is from the same source file, then the count is simply num_rows + if (start_idx == end_idx) { + num_rows_per_source[start_idx] = chunk_num_rows; + } else { + // Compute the number of rows from the first source file + num_rows_per_source[start_idx] = partial_sum_nrows_source[start_idx] - start_row; + // Compute the number of rows from the last source file + num_rows_per_source[end_idx] = end_row - partial_sum_nrows_source[end_idx - 1]; + // Simply copy the number of rows for each source in range: (start_idx, end_idx) + std::copy(_file_itm_data.num_rows_per_source.cbegin() + start_idx + 1, + _file_itm_data.num_rows_per_source.cbegin() + end_idx, + num_rows_per_source.begin() + start_idx + 1); + } + + return num_rows_per_source; } -table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, +table_with_metadata reader::impl::finalize_output(read_mode mode, + table_metadata& out_metadata, std::vector>& out_columns) { // Create empty columns as needed (this can happen if we've ended up with no actual data to read) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 3b8e80a29e6..5e3cc4301f9 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -262,11 +262,13 @@ class reader::impl { * @brief Finalize the output table by adding empty columns for the non-selected columns in * schema. 
* + * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk * @param out_metadata The output table metadata * @param out_columns The columns for building the output table * @return The output table along with columns' metadata */ - table_with_metadata finalize_output(table_metadata& out_metadata, + table_with_metadata finalize_output(read_mode mode, + table_metadata& out_metadata, std::vector>& out_columns); /** @@ -336,11 +338,36 @@ class reader::impl { : true; } + /** + * @brief Check if this is the first output chunk + * + * @return True if this is the first output chunk + */ [[nodiscard]] bool is_first_output_chunk() const { return _file_itm_data._output_chunk_count == 0; } + /** + * @brief Check if number of rows per source should be included in output metadata. + * + * @return True if AST filter is not present + */ + [[nodiscard]] bool include_output_num_rows_per_source() const + { + return not _expr_conv.get_converted_expr().has_value(); + } + + /** + * @brief Calculate the number of rows read from each source in the output chunk + * + * @param chunk_start_row The offset of the first row in the output chunk + * @param chunk_num_rows The number of rows in the the output chunk + * @return Vector of number of rows from each respective data source in the output chunk + */ + [[nodiscard]] std::vector calculate_output_num_rows_per_source(size_t chunk_start_row, + size_t chunk_num_rows); + rmm::cuda_stream_view _stream; rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; @@ -387,7 +414,7 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level, the entire file is divided up into "passes" omn which we try and limit the + // At the top level, the entire file is divided up into "passes" on which we try and limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. 
// diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 3da303e6928..05e0d8c0111 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1232,22 +1232,22 @@ void reader::impl::setup_next_pass(read_mode mode) pass.skip_rows = _file_itm_data.global_skip_rows; pass.num_rows = _file_itm_data.global_num_rows; } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = - std::max(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass], - global_start_row); - auto const end_row = - std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], - global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - pass.skip_rows = - global_start_row + + // pass_start_row and pass_end_row are computed from the selected row groups relative to the + // global_skip_rows. + auto const pass_start_row = _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass]; - pass.num_rows = end_row - start_row; + auto const pass_end_row = + std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], + _file_itm_data.global_num_rows); + + // pass.skip_rows is always global in the sense that it is relative to the first row of + // the data source (global row number 0), regardless of what pass we are on. Therefore, + // we must re-add global_skip_rows to the pass_start_row which is relative to the + // global_skip_rows. + pass.skip_rows = _file_itm_data.global_skip_rows + pass_start_row; + // num_rows is how many rows we are reading this pass. 
Since this is a difference, adding + // global_skip_rows to both variables is redundant. + pass.num_rows = pass_end_row - pass_start_row; } // load page information for the chunk. this retrieves the compressed bytes for all the @@ -1509,6 +1509,7 @@ void reader::impl::create_global_chunk_info() // Initialize column chunk information auto remaining_rows = num_rows; + auto skip_rows = _file_itm_data.global_skip_rows; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_start = rg.start_row; @@ -1561,7 +1562,12 @@ void reader::impl::create_global_chunk_info() schema.type == BYTE_ARRAY and _strings_to_categorical)); } - remaining_rows -= row_group_rows; + // Adjust for skip_rows when updating the remaining rows after the first group + remaining_rows -= + (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) + : row_group_rows; + // Set skip_rows = 0 as it is no longer needed for subsequent row_groups + skip_rows = 0; } } @@ -1598,6 +1604,9 @@ void reader::impl::compute_input_passes() _file_itm_data.input_pass_row_group_offsets.push_back(0); _file_itm_data.input_pass_start_row_count.push_back(0); + // To handle global_skip_rows when computing input passes + int skip_rows = _file_itm_data.global_skip_rows; + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { auto const& rgi = row_groups_info[cur_rg_index]; auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); @@ -1606,6 +1615,14 @@ void reader::impl::compute_input_passes() auto const [compressed_rg_size, _ /*compressed + uncompressed*/] = get_row_group_size(row_group); + // We must use the effective size of the first row group we are reading to accurately calculate + // the first non-zero input_pass_start_row_count. + auto const row_group_rows = + (skip_rows) ? 
rgi.start_row + row_group.num_rows - skip_rows : row_group.num_rows; + + // Set skip_rows = 0 as it is no longer needed for subsequent row_groups + skip_rows = 0; + // can we add this row group if (cur_pass_byte_size + compressed_rg_size >= comp_read_limit) { // A single row group (the current one) is larger than the read limit: @@ -1613,7 +1630,7 @@ void reader::impl::compute_input_passes() // row group if (cur_rg_start == cur_rg_index) { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group.num_rows); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group_rows); cur_rg_start = cur_rg_index + 1; cur_pass_byte_size = 0; } @@ -1627,7 +1644,7 @@ void reader::impl::compute_input_passes() } else { cur_pass_byte_size += compressed_rg_size; } - cur_row_count += row_group.num_rows; + cur_row_count += row_group_rows; } // add the last pass if necessary diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index b959c793011..3a3cdd34a58 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -41,6 +41,12 @@ struct file_intermediate_data { // is not capped by global_skip_rows and global_num_rows. 
std::vector input_pass_start_row_count{}; + // number of rows to be read from each data source + std::vector num_rows_per_source{}; + + // partial sum of the number of rows per data source + std::vector exclusive_sum_num_rows_per_source{}; + size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index d1e9a823d3b..581c44d024b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -945,7 +945,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple, std::vector> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -976,6 +976,9 @@ aggregate_reader_metadata::select_row_groups( static_cast(from_opts.second)}; }(); + // Get number of rows in each data source + std::vector num_rows_per_source(per_file_metadata.size(), 0); + if (!row_group_indices.empty()) { CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(), "Must specify row groups for each source"); @@ -989,28 +992,45 @@ aggregate_reader_metadata::select_row_groups( selection.emplace_back(rowgroup_idx, rows_to_read, src_idx); // if page-level indexes are present, then collect extra chunk and page info. 
column_info_for_row_group(selection.back(), 0); - rows_to_read += get_row_group(rowgroup_idx, src_idx).num_rows; + auto const rows_this_rg = get_row_group(rowgroup_idx, src_idx).num_rows; + rows_to_read += rows_this_rg; + num_rows_per_source[src_idx] += rows_this_rg; } } } else { size_type count = 0; for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { auto const& fmd = per_file_metadata[src_idx]; - for (size_t rg_idx = 0; rg_idx < fmd.row_groups.size(); ++rg_idx) { + for (size_t rg_idx = 0; + rg_idx < fmd.row_groups.size() and count < rows_to_skip + rows_to_read; + ++rg_idx) { auto const& rg = fmd.row_groups[rg_idx]; auto const chunk_start_row = count; count += rg.num_rows; if (count > rows_to_skip || count == 0) { + // start row of this row group adjusted with rows_to_skip + num_rows_per_source[src_idx] += count; + num_rows_per_source[src_idx] -= + (chunk_start_row <= rows_to_skip) ? rows_to_skip : chunk_start_row; + + // We need the unadjusted start index of this row group to correctly initialize + // ColumnChunkDesc for this row group in create_global_chunk_info() and calculate + // the row offset for the first pass in compute_input_passes(). selection.emplace_back(rg_idx, chunk_start_row, src_idx); - // if page-level indexes are present, then collect extra chunk and page info. + + // If page-level indexes are present, then collect extra chunk and page info. + // The page indexes rely on absolute row numbers, not adjusted for skip_rows. column_info_for_row_group(selection.back(), chunk_start_row); } - if (count >= rows_to_skip + rows_to_read) { break; } + // Adjust the number of rows for the last source file. 
+ if (count >= rows_to_skip + rows_to_read) { + num_rows_per_source[src_idx] -= count - rows_to_skip - rows_to_read; + } } } } - return {rows_to_skip, rows_to_read, std::move(selection)}; + return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)}; } std::tuple, diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6bfa8519c76..309132a5347 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -282,17 +282,17 @@ class aggregate_reader_metadata { * @param output_column_schemas schema indices of output columns * @param filter Optional AST expression to filter row groups based on Column chunk statistics * @param stream CUDA stream used for device memory operations and kernel launches - * @return A tuple of corrected row_start, row_count and list of row group indexes and its - * starting row + * @return A tuple of corrected row_start, row_count, list of row group indexes and its + * starting row, and list of number of rows per source. 
*/ - [[nodiscard]] std::tuple> select_row_groups( - host_span const> row_group_indices, - int64_t row_start, - std::optional const& row_count, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::tuple, std::vector> + select_row_groups(host_span const> row_group_indices, + int64_t row_start, + std::optional const& row_count, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters and reduces down to a selection of columns diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..ff47dfc4cf3 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1235,8 +1235,10 @@ void reader::impl::preprocess_file(read_mode mode) [](auto const& col) { return col.type; }); } - std::tie( - _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = + std::tie(_file_itm_data.global_skip_rows, + _file_itm_data.global_num_rows, + _file_itm_data.row_groups, + _file_itm_data.num_rows_per_source) = _metadata->select_row_groups(_options.row_group_indices, _options.skip_rows, _options.num_rows, @@ -1245,9 +1247,18 @@ void reader::impl::preprocess_file(read_mode mode) _expr_conv.get_converted_expr(), _stream); + // Inclusive scan the number of rows per source + if (not _expr_conv.get_converted_expr().has_value() and mode == read_mode::CHUNKED_READ) { + _file_itm_data.exclusive_sum_num_rows_per_source.resize( + _file_itm_data.num_rows_per_source.size()); + thrust::inclusive_scan(_file_itm_data.num_rows_per_source.cbegin(), + _file_itm_data.num_rows_per_source.cend(), + _file_itm_data.exclusive_sum_num_rows_per_source.begin()); + } + // check for page indexes - _has_page_index = std::all_of(_file_itm_data.row_groups.begin(), - 
_file_itm_data.row_groups.end(), + _has_page_index = std::all_of(_file_itm_data.row_groups.cbegin(), + _file_itm_data.row_groups.cend(), [](auto const& row_group) { return row_group.has_page_index(); }); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index cff85647725..2917852235c 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -149,6 +149,33 @@ auto chunked_read(std::string const& filepath, return chunked_read(vpath, output_limit, input_limit); } +auto const read_table_and_nrows_per_source(cudf::io::chunked_parquet_reader const& reader) +{ + auto out_tables = std::vector>{}; + int num_chunks = 0; + auto nrows_per_source = std::vector{}; + while (reader.has_next()) { + auto chunk = reader.read_chunk(); + out_tables.emplace_back(std::move(chunk.tbl)); + num_chunks++; + if (nrows_per_source.empty()) { + nrows_per_source = std::move(chunk.metadata.num_rows_per_source); + } else { + std::transform(chunk.metadata.num_rows_per_source.cbegin(), + chunk.metadata.num_rows_per_source.cend(), + nrows_per_source.begin(), + nrows_per_source.begin(), + std::plus()); + } + } + auto out_tviews = std::vector{}; + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); + } + + return std::tuple(cudf::concatenate(out_tviews), num_chunks, nrows_per_source); +} + } // namespace struct ParquetChunkedReaderTest : public cudf::test::BaseFixture {}; @@ -1477,3 +1504,361 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadOutOfBoundChunks) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } } + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource) +{ + constexpr int num_rows = 10'723; // A prime number + constexpr int rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + auto int64_col = 
int64s_col(int64_data.begin(), int64_data.end()).release(); + + std::vector> input_columns; + input_columns.emplace_back(std::move(int64_col)); + + // Write to Parquet + auto const [expected, filepath] = write_file(input_columns, + "num_rows_per_source", + false, + false, + cudf::io::default_max_page_size_bytes, + rows_in_row_group); + + // Chunked-read single data source entirely + { + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + EXPECT_EQ(num_rows_per_source[0], num_rows); + } + + // Chunked-read rows_to_read rows skipping rows_to_skip from single data source + { + auto const rows_to_skip = 1'237; + auto const rows_to_read = 7'232; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + auto int64_col_selected = int64s_col(int64_data.begin() + rows_to_skip, + int64_data.begin() + rows_to_skip + rows_to_read) + .release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + EXPECT_EQ(num_rows_per_source[0], 
rows_to_read); + } + + // Chunked-read two data sources skipping the first entire file completely + { + auto constexpr rows_to_skip = 15'723; + auto constexpr output_read_limit = 1'024'000; + auto constexpr pass_read_limit = 1'024'000; + + auto constexpr nsources = 2; + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + auto int64_col_selected = + int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 2); + EXPECT_EQ(num_rows_per_source[0], 0); + EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip); + } + + // Chunked-read from single data source skipping rows_to_skip + { + auto const rows_to_skip = 1'237; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 1'800; + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + auto int64_col_selected = + int64s_col(int64_data.begin() + rows_to_skip, int64_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + 
EXPECT_EQ(num_rows_per_source[0], num_rows - rows_to_skip); + } + + // Filtered chunked-read from single data source + { + int64_t const max_value = int64_data[int64_data.size() / 2]; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + auto literal_value = cudf::numeric_scalar{max_value}; + auto literal = cudf::ast::literal{literal_value}; + auto col_ref = cudf::ast::column_reference(0); + auto filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal); + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .filter(filter_expression) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + std::vector int64_data_filtered; + int64_data_filtered.reserve(num_rows); + std::copy_if( + int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) { + return val <= max_value; + }); + + auto int64_col_filtered = + int64s_col(int64_data_filtered.begin(), int64_data_filtered.end()).release(); + + cudf::table_view expected_filtered({int64_col_filtered->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result->view()); + EXPECT_TRUE(num_rows_per_source.empty()); + } +} + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources) +{ + constexpr int num_rows = 10'723; // A prime number + constexpr int rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + auto int64_col = int64s_col(int64_data.begin(), int64_data.end()).release(); + + std::vector> input_columns; + input_columns.emplace_back(std::move(int64_col)); + + // Write to Parquet + auto const [expected, filepath] = write_file(input_columns, + "num_rows_per_source", + false, + false, + 
cudf::io::default_max_page_size_bytes, + rows_in_row_group); + + // Function to initialize a vector of expected counts per source + auto initialize_expected_counts = + [](int const nsources, int const num_rows, int const rows_to_skip, int const rows_to_read) { + // Initialize expected_counts + std::vector expected_counts(nsources, num_rows); + + // Adjust expected_counts for rows_to_skip + int64_t counter = 0; + for (auto& nrows : expected_counts) { + if (counter < rows_to_skip) { + counter += nrows; + nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0; + } else { + break; + } + } + + // Reset the counter + counter = 0; + + // Adjust expected_counts for rows_to_read + for (auto& nrows : expected_counts) { + if (counter < rows_to_read) { + counter += nrows; + nrows = (counter >= rows_to_read) ? rows_to_read - counter + nrows : nrows; + } else if (counter > rows_to_read) { + nrows = 0; + } + } + + return expected_counts; + }; + + // Chunked-read six data sources entirely + { + auto const nsources = 6; + auto constexpr output_read_limit = 15'000; + auto constexpr pass_read_limit = 35'000; + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + std::vector const expected_counts(nsources, num_rows); + + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } + + // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources + { + auto const rows_to_skip = 25'571; + auto const rows_to_read = 41'232; + auto constexpr output_read_limit = 15'000; + auto constexpr 
pass_read_limit = 35'000; + auto const nsources = 8; + std::vector int64_selected_data{}; + int64_selected_data.reserve(nsources * num_rows); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(nsources), + [&](auto const i) { + std::copy(int64_data.begin(), int64_data.end(), std::back_inserter(int64_selected_data)); + }); + + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + auto const expected_counts = + initialize_expected_counts(nsources, num_rows, rows_to_skip, rows_to_read); + + // Initialize expected table + auto int64_col_selected = int64s_col(int64_selected_data.begin() + rows_to_skip, + int64_selected_data.begin() + +rows_to_skip + rows_to_read) + .release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } + + // Chunked-read four data sources skipping three files completely + { + auto const nsources = 4; + int constexpr rows_to_skip = num_rows * 3 + 1; + auto constexpr output_read_limit = 15'000; + auto constexpr pass_read_limit = 35'000; + std::vector int64_selected_data{}; + int64_selected_data.reserve(nsources * num_rows); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(nsources), + [&](auto const i) { + std::copy(int64_data.begin(), int64_data.end(), 
std::back_inserter(int64_selected_data)); + }); + + std::vector const datasources(nsources, filepath); + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + auto const expected_counts = + initialize_expected_counts(nsources, num_rows, rows_to_skip, num_rows * nsources); + + // Initialize expected table + auto int64_col_selected = + int64s_col(int64_selected_data.begin() + rows_to_skip, int64_selected_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } +} + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceEmptyTable) +{ + auto constexpr output_read_limit = 4'500; + auto constexpr pass_read_limit = 8'500; + auto const nsources = 10; + + // Table with single col of random int64 values + auto int64_empty_col = int64s_col{}.release(); + + std::vector> input_empty_columns; + input_empty_columns.emplace_back(std::move(int64_empty_col)); + + // Write to Parquet + auto const [expected_empty, filepath_empty] = write_file(input_empty_columns, + "num_rows_per_source_empty", + false, + false, + cudf::io::default_max_page_size_bytes, + 500); + + std::vector const datasources(nsources, filepath_empty); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + 
auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + std::vector const expected_counts(nsources, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_empty->view(), result->view()); + + EXPECT_EQ(num_chunks, 1); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); +} diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 2edf9e0aee6..6c61535359f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2243,6 +2243,209 @@ TEST_F(ParquetReaderTest, StringsWithPageStats) } } +TEST_F(ParquetReaderTest, NumRowsPerSource) +{ + int constexpr num_rows = 10'723; // A prime number + int constexpr rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + column_wrapper const int64_col{ + int64_data.begin(), int64_data.end(), cudf::test::iterators::no_nulls()}; + cudf::table_view const expected({int64_col}); + + // Write to Parquet + auto const filepath = temp_env->get_temp_filepath("NumRowsPerSource.parquet"); + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .row_group_size_rows(rows_in_row_group) + .build(); + cudf::io::write_parquet(out_opts); + + // Read single data source entirely + { + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1); + EXPECT_EQ(result.metadata.num_rows_per_source[0], num_rows); + } + + // Read rows_to_read rows skipping rows_to_skip from single data source + { + auto constexpr rows_to_skip = 557; // a prime number != 
rows_in_row_group + auto constexpr rows_to_read = 7'232; + auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const result = cudf::io::read_parquet(in_opts); + column_wrapper int64_col_selected{int64_data.begin() + rows_to_skip, + int64_data.begin() + rows_to_skip + rows_to_read, + cudf::test::iterators::no_nulls()}; + + cudf::table_view const expected_selected({int64_col_selected}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1); + EXPECT_EQ(result.metadata.num_rows_per_source[0], rows_to_read); + } + + // Filtered read from single data source + { + auto constexpr max_value = 100; + auto literal_value = cudf::numeric_scalar{max_value}; + auto literal = cudf::ast::literal{literal_value}; + auto col_ref = cudf::ast::column_reference(0); + auto filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal); + + auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression) + .build(); + + std::vector int64_data_filtered; + int64_data_filtered.reserve(num_rows); + std::copy_if( + int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) { + return val <= max_value; + }); + column_wrapper int64_col_filtered{ + int64_data_filtered.begin(), int64_data_filtered.end(), cudf::test::iterators::no_nulls()}; + + cudf::table_view expected_filtered({int64_col_filtered}); + + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 0); + } + + // Read two data sources skipping the first entire file completely + { + auto constexpr rows_to_skip = 15'723; + auto constexpr nsources = 2; + std::vector const datasources(nsources, 
filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + + auto const result = cudf::io::read_parquet(in_opts); + + column_wrapper int64_col_selected{int64_data.begin() + rows_to_skip - num_rows, + int64_data.end(), + cudf::test::iterators::no_nulls()}; + + cudf::table_view const expected_selected({int64_col_selected}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 2); + EXPECT_EQ(result.metadata.num_rows_per_source[0], 0); + EXPECT_EQ(result.metadata.num_rows_per_source[1], nsources * num_rows - rows_to_skip); + } + + // Read ten data sources entirely + { + auto constexpr nsources = 10; + std::vector const datasources(nsources, filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector const expected_counts(nsources, num_rows); + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); + } + + // Read rows_to_read rows skipping rows_to_skip (> two sources) from ten data sources + { + auto constexpr rows_to_skip = 25'999; + auto constexpr rows_to_read = 47'232; + + auto constexpr nsources = 10; + std::vector const datasources(nsources, filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector expected_counts(nsources, num_rows); + + // Adjust expected_counts for rows_to_skip + int64_t counter = 0; + for (auto& nrows : expected_counts) { + if (counter < 
rows_to_skip) { + counter += nrows; + nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0; + } else { + break; + } + } + + // Reset the counter + counter = 0; + + // Adjust expected_counts for rows_to_read + for (auto& nrows : expected_counts) { + if (counter < rows_to_read) { + counter += nrows; + nrows = (counter >= rows_to_read) ? rows_to_read - counter + nrows : nrows; + } else if (counter > rows_to_read) { + nrows = 0; + } + } + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); + } +} + +TEST_F(ParquetReaderTest, NumRowsPerSourceEmptyTable) +{ + auto const nsources = 10; + + column_wrapper const int64_empty_col{}; + cudf::table_view const expected_empty({int64_empty_col}); + + // Write to Parquet + auto const filepath_empty = temp_env->get_temp_filepath("NumRowsPerSourceEmpty.parquet"); + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath_empty}, expected_empty) + .build(); + cudf::io::write_parquet(out_opts); + + // Read from Parquet + std::vector const datasources(nsources, filepath_empty); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector const expected_counts(nsources, 0); + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); +} + /////////////////// // metadata tests diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 8d87deb1472..0a6bddcd907 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -81,6 +81,7 @@ 
cdef extern from "cudf/io/types.hpp" \ map[string, string] user_data vector[unordered_map[string, string]] per_file_user_data vector[column_name_info] schema_info + vector[size_t] num_rows_per_source cdef cppclass table_with_metadata: unique_ptr[table] tbl From 26a3799d2ff9ffb2aa72d63bb388b4bee70b3440 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:49:01 -1000 Subject: [PATCH 27/44] Make ColumnAccessor strictly require a mapping of columns (#16285) `ColumnAccessor` had a default `data=None` argument and initialized an empty dict in the `__init__` if `data` was not passed. This PR now makes `data` a required argument. Additionally if `verify=True`, the `__init__` would call `as_column` on each `data.values()` allowing non-`ColumnBase` inputs. This PR now avoids this call and makes the caller responsible for ensuring the inputs are `ColumnBase`s Also, adds a few `verify=False` internally where we know we are passing columns from a libcudf op or reconstructing from another `ColumnAccessor` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16285 --- python/cudf/cudf/core/_base_index.py | 4 +- python/cudf/cudf/core/column_accessor.py | 64 +++--- python/cudf/cudf/core/dataframe.py | 24 ++- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexed_frame.py | 1 + python/cudf/cudf/core/reshape.py | 12 +- python/cudf/cudf/core/series.py | 27 ++- .../cudf/cudf/tests/test_column_accessor.py | 190 ++++++++++++------ 10 files changed, 211 insertions(+), 121 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c38352009de..8fad82c5c46 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ 
-98,7 +98,7 @@ def astype(self, dtype, copy: bool = True): """ raise NotImplementedError - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> cupy.ndarray: """Return the integer indices that would sort the index. Parameters vary by subclass. @@ -1520,7 +1520,7 @@ def sort_values( ascending=True, na_position="last", key=None, - ): + ) -> Self | tuple[Self, cupy.ndarray]: """ Return a sorted copy of the index, and optionally return the indices that sorted the index itself. diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index f30a557efb0..819d351b2c4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -16,6 +16,8 @@ from cudf.core import column if TYPE_CHECKING: + from typing_extensions import Self + from cudf._typing import Dtype from cudf.core.column import ColumnBase @@ -86,58 +88,58 @@ class ColumnAccessor(abc.MutableMapping): (default=None). verify : bool, optional For non ColumnAccessor inputs, whether to verify - column length and type + column length and data.values() are all Columns """ - _data: "dict[Any, ColumnBase]" - multiindex: bool + _data: dict[Any, ColumnBase] _level_names: tuple[Any, ...] 
def __init__( self, - data: abc.MutableMapping | ColumnAccessor | None = None, + data: abc.MutableMapping[Any, ColumnBase] | Self, multiindex: bool = False, level_names=None, rangeindex: bool = False, label_dtype: Dtype | None = None, verify: bool = True, ): - self.rangeindex = rangeindex - self.label_dtype = label_dtype - if data is None: - data = {} - # TODO: we should validate the keys of `data` if isinstance(data, ColumnAccessor): - multiindex = multiindex or data.multiindex - level_names = level_names or data.level_names self._data = data._data - self.multiindex = multiindex - self._level_names = level_names - self.rangeindex = data.rangeindex - self.label_dtype = data.label_dtype - else: + self._level_names = data.level_names + self.multiindex: bool = data.multiindex + self.rangeindex: bool = data.rangeindex + self.label_dtype: Dtype | None = data.label_dtype + elif isinstance(data, abc.MutableMapping): # This code path is performance-critical for copies and should be # modified with care. - data = dict(data) if data and verify: - result = {} # Faster than next(iter(data.values())) column_length = len(data[next(iter(data))]) - for k, v in data.items(): - # Much faster to avoid the function call if possible; the - # extra isinstance is negligible if we do have to make a - # column from something else. - if not isinstance(v, column.ColumnBase): - v = column.as_column(v) - if len(v) != column_length: + # TODO: we should validate the keys of `data` + for col in data.values(): + if not isinstance(col, column.ColumnBase): + raise ValueError( + f"All data.values() must be Column, not {type(col).__name__}" + ) + if len(col) != column_length: raise ValueError("All columns must be of equal length") - result[k] = v - self._data = result - else: - self._data = data + if not isinstance(data, dict): + data = dict(data) + self._data = data + + if rangeindex and multiindex: + raise ValueError( + f"{rangeindex=} and {multiindex=} cannot both be True." 
+ ) + self.rangeindex = rangeindex self.multiindex = multiindex + self.label_dtype = label_dtype self._level_names = level_names + else: + raise ValueError( + f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}" + ) def __iter__(self): return iter(self._data) @@ -161,7 +163,9 @@ def __repr__(self) -> str: type_info = ( f"{self.__class__.__name__}(" f"multiindex={self.multiindex}, " - f"level_names={self.level_names})" + f"level_names={self.level_names}, " + f"rangeindex={self.rangeindex}, " + f"label_dtype={self.label_dtype})" ) column_info = "\n".join( [f"{name}: {col.dtype}" for name, col in self.items()] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7e07078c95b..dbc7f10b569 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -475,6 +475,7 @@ def __getitem__(self, arg): {key: ca._data[key] for key in column_names}, multiindex=ca.multiindex, level_names=ca.level_names, + verify=False, ), index=index, ) @@ -485,6 +486,7 @@ def __getitem__(self, arg): {key: ca._data[key] for key in column_names}, multiindex=ca.multiindex, level_names=ca.level_names, + verify=False, ), index=index, ) @@ -771,6 +773,7 @@ def __init__( else None, rangeindex=rangeindex, label_dtype=label_dtype, + verify=False, ) elif isinstance(data, ColumnAccessor): raise TypeError( @@ -931,7 +934,7 @@ def _init_from_series_list(self, data, columns, index): ) if not series.index.equals(final_columns): series = series.reindex(final_columns) - self._data[idx] = column.as_column(series._column) + self._data[idx] = series._column # Setting `final_columns` to self._index so # that the resulting `transpose` will be have @@ -2958,7 +2961,7 @@ def set_index( # label-like if is_scalar(col) or isinstance(col, tuple): if col in self._column_names: - data_to_add.append(self[col]) + data_to_add.append(self[col]._column) names.append(col) if drop: to_drop.append(col) @@ -2973,7 +2976,7 @@ def set_index( elif 
isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) ): - data_to_add.append(col) + data_to_add.append(as_column(col)) names.append(col.name) else: try: @@ -4769,7 +4772,7 @@ def _func(x): # pragma: no cover result = {} for name, col in self._data.items(): apply_sr = Series._from_data({None: col}) - result[name] = apply_sr.apply(_func) + result[name] = apply_sr.apply(_func)._column return DataFrame._from_data(result, index=self.index) @@ -5806,6 +5809,7 @@ def from_records( ), level_names=level_names, label_dtype=getattr(columns, "dtype", None), + verify=False, ), index=new_index, ) @@ -5892,6 +5896,7 @@ def _from_arrays( ), level_names=level_names, label_dtype=getattr(columns, "dtype", None), + verify=False, ), index=index, ) @@ -6302,10 +6307,9 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_data( { - None: [ - length - self._data[col].null_count - for col in self._data.names - ] + None: as_column( + [length - col.null_count for col in self._columns] + ) }, cudf.Index(self._data.names), ) @@ -7374,7 +7378,9 @@ def to_struct(self, name=None): offset=0, ) return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor({name: col}), + cudf.core.column_accessor.ColumnAccessor( + {name: col}, verify=False + ), index=self.index, name=name, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c82e073d7b7..04ecae4ba85 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1305,7 +1305,7 @@ def argsort( order=None, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: """Return the integer indices that would sort the Series values. 
Parameters diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3f91be71f29..1646c5042fd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1360,7 +1360,9 @@ def _post_process_chunk_results( if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( chunk_results[0] ): - data = {None: chunk_results} + data = ColumnAccessor( + {None: as_column(chunk_results)}, verify=False + ) ty = cudf.Series if self._as_index else cudf.DataFrame result = ty._from_data(data, index=group_names) result.index.names = self.grouping.names diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ae20fcd5d9c..73b7298410a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -349,7 +349,7 @@ def hasnans(self) -> bool: @_performance_tracking def _data(self): return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._values} + {self.name: self._values}, verify=False ) @_performance_tracking @@ -1492,7 +1492,7 @@ def argsort( order=None, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: """Return the integer indices that would sort the index. 
Parameters diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 60cd142db4b..e75b51e0d43 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6229,6 +6229,7 @@ def rank( multiindex=self._data.multiindex, level_names=self._data.level_names, label_dtype=self._data.label_dtype, + verify=False, ), ) else: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b538ae34b6f..a542c5f5969 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -932,14 +932,10 @@ def _pivot(df, index, columns): index_labels, index_idx = index._encode() column_labels = columns_labels.to_pandas().to_flat_index() - # the result of pivot always has a multicolumn - result = cudf.core.column_accessor.ColumnAccessor( - multiindex=True, level_names=(None,) + columns._data.names - ) - def as_tuple(x): return x if isinstance(x, tuple) else (x,) + result = {} for v in df: names = [as_tuple(v) + as_tuple(name) for name in column_labels] nrows = len(index_labels) @@ -964,8 +960,12 @@ def as_tuple(x): } ) + # the result of pivot always has a multicolumn + ca = cudf.core.column_accessor.ColumnAccessor( + result, multiindex=True, level_names=(None,) + columns._data.names + ) return cudf.DataFrame._from_data( - result, index=cudf.Index(index_labels, name=index.name) + ca, index=cudf.Index(index_labels, name=index.name) ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d8dbaa897e7..94c33eed37a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2263,20 +2263,19 @@ def argsort( order=None, ascending=True, na_position="last", - ): - obj = self.__class__._from_data( - { - None: super().argsort( - axis=axis, - kind=kind, - order=order, - ascending=ascending, - na_position=na_position, - ) - } + ) -> Self: + col = as_column( + super().argsort( + axis=axis, + kind=kind, + order=order, + 
ascending=ascending, + na_position=na_position, + ) + ) + return self._from_data_like_self( + self._data._from_columns_like_self([col]) ) - obj.name = self.name - return obj @_performance_tracking def replace(self, to_replace=None, value=no_default, *args, **kwargs): @@ -2631,7 +2630,7 @@ def mode(self, dropna=True): val_counts = val_counts[val_counts == val_counts.iloc[0]] return Series._from_data( - {self.name: val_counts.index.sort_values()}, name=self.name + {self.name: val_counts.index.sort_values()._column}, name=self.name ) @_performance_tracking diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index f3343c37d1d..e84e1433c10 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -5,28 +5,35 @@ import pytest import cudf +from cudf.core.column import as_column from cudf.core.column_accessor import ColumnAccessor from cudf.testing import assert_eq simple_test_data = [ {}, - {"a": []}, - {"a": [1]}, - {"a": ["a"]}, - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, + {"a": as_column([])}, + {"a": as_column([1])}, + {"a": as_column(["a"])}, + {"a": as_column([1, 2, 3]), "b": as_column(["a", "b", "c"])}, ] mi_test_data = [ - {("a", "b"): [1, 2, 4], ("a", "c"): [2, 3, 4]}, - {("a", "b"): [1, 2, 3], ("a", ""): [2, 3, 4]}, - {("a", "b"): [1, 2, 4], ("c", "d"): [2, 3, 4]}, - {("a", "b"): [1, 2, 3], ("a", "c"): [2, 3, 4], ("b", ""): [4, 5, 6]}, + {("a", "b"): as_column([1, 2, 4]), ("a", "c"): as_column([2, 3, 4])}, + {("a", "b"): as_column([1, 2, 3]), ("a", ""): as_column([2, 3, 4])}, + {("a", "b"): as_column([1, 2, 4]), ("c", "d"): as_column([2, 3, 4])}, + { + ("a", "b"): as_column([1, 2, 3]), + ("a", "c"): as_column([2, 3, 4]), + ("b", ""): as_column([4, 5, 6]), + }, ] def check_ca_equal(lhs, rhs): assert lhs.level_names == rhs.level_names assert lhs.multiindex == rhs.multiindex + assert lhs.rangeindex == rhs.rangeindex + assert lhs.label_dtype == 
rhs.label_dtype for l_key, r_key in zip(lhs, rhs): assert l_key == r_key assert_eq(lhs[l_key], rhs[r_key]) @@ -58,19 +65,26 @@ def test_to_pandas_simple(simple_data): # to ignore this `inferred_type` comparison, we pass exact=False. assert_eq( ca.to_pandas_index(), - pd.DataFrame(simple_data).columns, + pd.DataFrame( + {key: value.values_host for key, value in simple_data.items()} + ).columns, exact=False, ) def test_to_pandas_multiindex(mi_data): ca = ColumnAccessor(mi_data, multiindex=True) - assert_eq(ca.to_pandas_index(), pd.DataFrame(mi_data).columns) + assert_eq( + ca.to_pandas_index(), + pd.DataFrame( + {key: value.values_host for key, value in mi_data.items()} + ).columns, + ) def test_to_pandas_multiindex_names(): ca = ColumnAccessor( - {("a", "b"): [1, 2, 3], ("c", "d"): [3, 4, 5]}, + {("a", "b"): as_column([1, 2, 3]), ("c", "d"): as_column([3, 4, 5])}, multiindex=True, level_names=("foo", "bar"), ) @@ -108,16 +122,20 @@ def test_column_size_mismatch(): differing sizes throws an error. 
""" with pytest.raises(ValueError): - ColumnAccessor({"a": [1], "b": [1, 2]}) + ColumnAccessor({"a": as_column([1]), "b": as_column([1, 2])}) def test_select_by_label_simple(): """ Test getting a column by label """ - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]}) - check_ca_equal(ca.select_by_label("a"), ColumnAccessor({"a": [1, 2, 3]})) - check_ca_equal(ca.select_by_label("b"), ColumnAccessor({"b": [2, 3, 4]})) + ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) + check_ca_equal( + ca.select_by_label("a"), ColumnAccessor({"a": as_column([1, 2, 3])}) + ) + check_ca_equal( + ca.select_by_label("b"), ColumnAccessor({"b": as_column([2, 3, 4])}) + ) def test_select_by_label_multiindex(): @@ -126,40 +144,62 @@ def test_select_by_label_multiindex(): """ ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) expect = ColumnAccessor( - {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]}, + { + ("b", "c"): as_column([1, 2, 3]), + ("b", "e"): as_column([2, 3, 4]), + ("d", "e"): as_column([3, 4, 5]), + }, multiindex=True, ) got = ca.select_by_label("a") check_ca_equal(expect, got) - expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False) + expect = ColumnAccessor( + {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, + multiindex=False, + ) got = ca.select_by_label(("a", "b")) check_ca_equal(expect, got) expect = ColumnAccessor( - {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]}, + { + ("b", "c"): as_column([1, 2, 3]), + ("b", "e"): as_column([2, 3, 4]), + ("d", "e"): as_column([3, 4, 5]), + }, multiindex=True, ) got = ca.select_by_label("a") check_ca_equal(expect, got) - expect = ColumnAccessor({"c": [1, 2, 3], 
"e": [2, 3, 4]}, multiindex=False) + expect = ColumnAccessor( + {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, + multiindex=False, + ) got = ca.select_by_label(("a", "b")) check_ca_equal(expect, got) def test_select_by_label_simple_slice(): - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]}) - expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]}) + ca = ColumnAccessor( + { + "a": as_column([1, 2, 3]), + "b": as_column([2, 3, 4]), + "c": as_column([3, 4, 5]), + } + ) + expect = ColumnAccessor( + {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} + ) got = ca.select_by_label(slice("b", "c")) check_ca_equal(expect, got) @@ -167,10 +207,10 @@ def test_select_by_label_simple_slice(): def test_select_by_label_multiindex_slice(): ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) # pandas needs columns to be sorted to do slicing with multiindex @@ -180,9 +220,9 @@ def test_select_by_label_multiindex_slice(): expect = ColumnAccessor( { - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) @@ -191,8 +231,16 @@ def test_select_by_label_multiindex_slice(): def test_by_label_list(): - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]}) - expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]}) + ca = ColumnAccessor( + { + "a": as_column([1, 2, 3]), + "b": as_column([2, 3, 4]), + "c": as_column([3, 4, 5]), + } + ) + expect = ColumnAccessor( + {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} + ) got = ca.select_by_label(["b", "c"]) check_ca_equal(expect, got) @@ -201,9 
+249,13 @@ def test_select_by_index_simple(): """ Test getting a column by label """ - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]}) - check_ca_equal(ca.select_by_index(0), ColumnAccessor({"a": [1, 2, 3]})) - check_ca_equal(ca.select_by_index(1), ColumnAccessor({"b": [2, 3, 4]})) + ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) + check_ca_equal( + ca.select_by_index(0), ColumnAccessor({"a": as_column([1, 2, 3])}) + ) + check_ca_equal( + ca.select_by_index(1), ColumnAccessor({"b": as_column([2, 3, 4])}) + ) check_ca_equal(ca.select_by_index([0, 1]), ca) check_ca_equal(ca.select_by_index(slice(0, None)), ca) @@ -214,19 +266,19 @@ def test_select_by_index_multiindex(): """ ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) expect = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) @@ -235,9 +287,9 @@ def test_select_by_index_multiindex(): expect = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) @@ -248,10 +300,10 @@ def test_select_by_index_multiindex(): def test_select_by_index_empty(): ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", 
"e"): as_column([3, 4, 5]), }, multiindex=True, ) @@ -267,12 +319,20 @@ def test_select_by_index_empty(): def test_replace_level_values_RangeIndex(): ca = ColumnAccessor( - {("a"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]}, + { + ("a"): as_column([1, 2, 3]), + ("b"): as_column([2, 3, 4]), + ("c"): as_column([3, 4, 5]), + }, multiindex=False, ) expect = ColumnAccessor( - {("f"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]}, + { + ("f"): as_column([1, 2, 3]), + ("b"): as_column([2, 3, 4]), + ("c"): as_column([3, 4, 5]), + }, multiindex=False, ) @@ -282,12 +342,20 @@ def test_replace_level_values_RangeIndex(): def test_replace_level_values_MultiColumn(): ca = ColumnAccessor( - {("a", 1): [1, 2, 3], ("a", 2): [2, 3, 4], ("b", 1): [3, 4, 5]}, + { + ("a", 1): as_column([1, 2, 3]), + ("a", 2): as_column([2, 3, 4]), + ("b", 1): as_column([3, 4, 5]), + }, multiindex=True, ) expect = ColumnAccessor( - {("f", 1): [1, 2, 3], ("f", 2): [2, 3, 4], ("b", 1): [3, 4, 5]}, + { + ("f", 1): as_column([1, 2, 3]), + ("f", 2): as_column([2, 3, 4]), + ("b", 1): as_column([3, 4, 5]), + }, multiindex=True, ) @@ -303,7 +371,17 @@ def test_clear_nrows_empty_before(): def test_clear_nrows_empty_after(): - ca = ColumnAccessor({"new": [1]}) + ca = ColumnAccessor({"new": as_column([1])}) assert ca.nrows == 1 del ca["new"] assert ca.nrows == 0 + + +def test_not_rangeindex_and_multiindex(): + with pytest.raises(ValueError): + ColumnAccessor({}, multiindex=True, rangeindex=True) + + +def test_data_values_not_column_raises(): + with pytest.raises(ValueError): + ColumnAccessor({"a": [1]}) From c5b96003cef00b2635923d03edcd48a13821a61e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jul 2024 20:04:19 -0700 Subject: [PATCH 28/44] Migrate Parquet reader to pylibcudf (#16078) xref #15162 Migrates the parquet reader (and chunked parquet reader) to pylibcudf. (Does not migrate the writers or the metadata reader yet). 
Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16078 --- .../api_docs/pylibcudf/io/index.rst | 1 + .../api_docs/pylibcudf/io/parquet.rst | 6 + python/cudf/cudf/_lib/parquet.pyx | 312 ++++++------------ .../cudf/cudf/_lib/pylibcudf/expressions.pyx | 11 + .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 4 +- .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 2 +- .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 2 +- .../cudf/cudf/_lib/pylibcudf/io/parquet.pxd | 35 ++ .../cudf/cudf/_lib/pylibcudf/io/parquet.pyx | 204 ++++++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 8 + .../_lib/pylibcudf/libcudf/io/parquet.pxd | 8 +- python/cudf/cudf/io/parquet.py | 4 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 80 ++++- python/cudf/cudf/pylibcudf_tests/conftest.py | 15 + .../cudf/pylibcudf_tests/io/test_parquet.py | 109 ++++++ python/cudf/cudf/tests/test_parquet.py | 5 +- 16 files changed, 581 insertions(+), 225 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_parquet.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 697bce739de..e2d342ffe47 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -18,3 +18,4 @@ I/O Functions avro csv json + parquet diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst new file mode 100644 index 
00000000000..9dfbadfa216 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst @@ -0,0 +1,6 @@ +======= +Parquet +======= + +.. automodule:: cudf._lib.pylibcudf.io.parquet + :members: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index e7959d21e01..a2eed94bb3c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -18,16 +18,14 @@ from cython.operator cimport dereference from cudf.api.types import is_list_like -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io -from cudf._lib import pylibcudf from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr -from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -35,25 +33,20 @@ from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( + add_df_col_struct_names, make_sinks_info, make_source_info, - update_struct_field_names, ) from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.expressions cimport expression +from cudf._lib.pylibcudf.io.parquet cimport ChunkedParquetReader from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( - chunked_parquet_reader as cpp_chunked_parquet_reader, chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, - parquet_reader_options, - parquet_reader_options_builder, 
parquet_writer_options, - read_parquet as parquet_reader, write_parquet as parquet_writer, ) from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( @@ -63,19 +56,17 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_in_metadata, table_input_metadata, - table_metadata, ) from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport table_view_from_table from pyarrow.lib import NativeFile -from cudf._lib.concat import concat_columns +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from cudf._lib.utils cimport data_from_pylibcudf_table - cdef class BufferArrayFromVector: cdef Py_ssize_t length @@ -133,71 +124,37 @@ def _parse_metadata(meta): return file_is_range_index, file_index_cols, file_column_dtype -cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options( - cudf_io_types.source_info source, - vector[vector[size_type]] row_groups, - bool use_pandas_metadata, - Expression filters, - object columns): - - cdef parquet_reader_options args - cdef parquet_reader_options_builder builder - cdef data_type cpp_timestamp_type = cudf_types.data_type( - cudf_types.type_id.EMPTY - ) - builder = ( - parquet_reader_options.builder(source) - .row_groups(row_groups) - .use_pandas_metadata(use_pandas_metadata) - .use_arrow_schema(True) - .timestamp_type(cpp_timestamp_type) - ) - if filters is not None: - builder = builder.filter(dereference(filters.c_obj.get())) - - args = move(builder.build()) - cdef vector[string] cpp_columns - allow_range_index = True - if columns is not None: - cpp_columns.reserve(len(columns)) - allow_range_index = len(columns) > 0 - for col in columns: - cpp_columns.push_back(str(col).encode()) - 
args.set_columns(cpp_columns) - allow_range_index &= filters is None - - return pair[parquet_reader_options, bool](args, allow_range_index) - cdef object _process_metadata(object df, - table_metadata table_meta, list names, + dict child_names, + list per_file_user_data, object row_groups, object filepaths_or_buffers, list pa_buffers, bool allow_range_index, bool use_pandas_metadata): - update_struct_field_names(df, table_meta.schema_info) + + add_df_col_struct_names(df, child_names) index_col = None is_range_index = True column_index_type = None index_col_names = None meta = None - cdef vector[unordered_map[string, string]] per_file_user_data = \ - table_meta.per_file_user_data for single_file in per_file_user_data: + if b'pandas' not in single_file: + continue json_str = single_file[b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata(meta) + is_range_index &= file_is_range_index + + if not file_is_range_index and index_col is not None \ + and index_col_names is None: + index_col_names = {} + for idx_col in index_col: + for c in meta['columns']: + if c['field_name'] == idx_col: + index_col_names[idx_col] = c['name'] if meta is not None: # Book keep each column metadata as the order @@ -297,6 +254,76 @@ cdef object _process_metadata(object df, return df +def read_parquet_chunked( + filepaths_or_buffers, + columns=None, + row_groups=None, + use_pandas_metadata=True, + size_t chunk_read_limit=0, + size_t pass_read_limit=1024000000 +): + # Convert NativeFile buffers to NativeFileDatasource, + # 
but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + + pa_buffers = [] + + new_bufs = [] + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + new_bufs.append(NativeFileDatasource(datasource)) + else: + new_bufs.append(datasource) + + # Note: If this function ever takes accepts filters + # allow_range_index needs to be False when a filter is passed + # (see read_parquet) + allow_range_index = columns is not None and len(columns) != 0 + + reader = ChunkedParquetReader( + plc.io.SourceInfo(new_bufs), + columns, + row_groups, + use_pandas_metadata, + chunk_read_limit=chunk_read_limit, + pass_read_limit=pass_read_limit + ) + + tbl_w_meta = reader.read_chunk() + column_names = tbl_w_meta.column_names(include_children=False) + child_names = tbl_w_meta.child_names + per_file_user_data = tbl_w_meta.per_file_user_data + concatenated_columns = tbl_w_meta.tbl.columns() + + # save memory + del tbl_w_meta + + cdef Table tbl + while reader.has_next(): + tbl = reader.read_chunk().tbl + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], + column_names=column_names, + index_names=None + ) + ) + df = _process_metadata(df, column_names, child_names, + per_file_user_data, row_groups, + filepaths_or_buffers, pa_buffers, + allow_range_index, use_pandas_metadata) + return df + + cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, use_pandas_metadata=True, Expression filters=None): @@ -322,33 +349,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, pa_buffers.append(datasource) filepaths_or_buffers[i] = 
NativeFileDatasource(datasource) - cdef cudf_io_types.source_info source = make_source_info( - filepaths_or_buffers) - - cdef vector[vector[size_type]] cpp_row_groups - if row_groups is not None: - cpp_row_groups = row_groups - - # Setup parquet reader arguments - cdef parquet_reader_options args - cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( - source, cpp_row_groups, use_pandas_metadata, filters, columns) - args, allow_range_index = c_res.first, c_res.second + allow_range_index = True + if columns is not None and len(columns) == 0 or filters: + allow_range_index = False # Read Parquet - cdef cudf_io_types.table_with_metadata c_result - with nogil: - c_result = move(parquet_reader(args)) + tbl_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo(filepaths_or_buffers), + columns, + row_groups, + filters, + convert_strings_to_categories = False, + use_pandas_metadata = use_pandas_metadata, + ) - names = [info.name.decode() for info in c_result.metadata.schema_info] + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(tbl_w_meta) + ) - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=names - )) - df = _process_metadata(df, c_result.metadata, names, row_groups, - filepaths_or_buffers, pa_buffers, + df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), + tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, + row_groups, filepaths_or_buffers, pa_buffers, allow_range_index, use_pandas_metadata) return df @@ -804,120 +826,6 @@ cdef class ParquetWriter: self.initialized = True -cdef class ParquetReader: - cdef bool initialized - cdef unique_ptr[cpp_chunked_parquet_reader] reader - cdef size_t chunk_read_limit - cdef size_t pass_read_limit - cdef size_t row_group_size_bytes - cdef table_metadata result_meta - cdef vector[unordered_map[string, string]] per_file_user_data - cdef object pandas_meta - cdef list pa_buffers - cdef bool allow_range_index - cdef object 
row_groups - cdef object filepaths_or_buffers - cdef object names - cdef object column_index_type - cdef object index_col_names - cdef bool is_range_index - cdef object index_col - cdef bool cpp_use_pandas_metadata - - def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000): - - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - - pa_buffers = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - pa_buffers.append(datasource) - filepaths_or_buffers[i] = NativeFileDatasource(datasource) - self.pa_buffers = pa_buffers - cdef cudf_io_types.source_info source = make_source_info( - filepaths_or_buffers) - - self.cpp_use_pandas_metadata = use_pandas_metadata - - cdef vector[vector[size_type]] cpp_row_groups - if row_groups is not None: - cpp_row_groups = row_groups - cdef parquet_reader_options args - cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( - source, cpp_row_groups, use_pandas_metadata, None, columns) - args, self.allow_range_index = c_res.first, c_res.second - - with nogil: - self.reader.reset( - new cpp_chunked_parquet_reader( - chunk_read_limit, - pass_read_limit, - args - ) - ) - self.initialized = False - self.row_groups = row_groups - self.filepaths_or_buffers = filepaths_or_buffers - - def _has_next(self): - cdef bool res - with nogil: - res = self.reader.get()[0].has_next() - return res - - def _read_chunk(self): - # Read Parquet - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(self.reader.get()[0].read_chunk()) - - if not self.initialized: - self.names = [info.name.decode() for info in c_result.metadata.schema_info] - self.result_meta = c_result.metadata - - df = 
cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=self.names, - )) - - self.initialized = True - return df - - def read(self): - dfs = self._read_chunk() - column_names = dfs._column_names - concatenated_columns = list(dfs._columns) - del dfs - while self._has_next(): - new_chunk = list(self._read_chunk()._columns) - for i in range(len(column_names)): - concatenated_columns[i] = concat_columns( - [concatenated_columns[i], new_chunk[i]] - ) - # Must drop any residual GPU columns to save memory - new_chunk[i] = None - - dfs = cudf.DataFrame._from_data( - *data_from_pylibcudf_table( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in concatenated_columns] - ), - column_names=column_names, - index_names=None - ) - ) - - return _process_metadata(dfs, self.result_meta, self.names, self.row_groups, - self.filepaths_or_buffers, self.pa_buffers, - self.allow_range_index, self.cpp_use_pandas_metadata) - cpdef merge_filemetadata(object filemetadata_list): """ Cython function to call into libcudf API, see `merge_row_group_metadata`. diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx index 38de11406ad..b983a617533 100644 --- a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -38,6 +38,17 @@ from .types cimport DataType # Aliases for simplicity ctypedef unique_ptr[libcudf_exp.expression] expression_ptr +# Define this class just to have a docstring for it +cdef class Expression: + """ + The base class for all expression types. + This class cannot be instantiated directly, please + instantiate one of its child classes instead. + + For details, see :cpp:class:`cudf::ast::expression`. + """ + pass + cdef class Literal(Expression): """ A literal value used in an abstract syntax tree. 
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 8dd08d11dc8..55bea4fc262 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -22,6 +22,6 @@ rapids_cython_create_modules( ) set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource - pylibcudf_io_json pylibcudf_io_types + pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types ) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index 5b3272d60e0..62820048584 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, types +from . cimport avro, datasource, json, parquet, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index e17deaa4663..27640f7d955 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, types +from . 
import avro, csv, datasource, json, parquet, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd new file mode 100644 index 00000000000..027f215fb91 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport int64_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.expressions cimport Expression +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef class ChunkedParquetReader: + cdef unique_ptr[cpp_chunked_parquet_reader] reader + + cpdef bool has_next(self) + cpdef TableWithMetadata read_chunk(self) + + +cpdef read_parquet( + SourceInfo source_info, + list columns = *, + list row_groups = *, + Expression filters = *, + bool convert_strings_to_categories = *, + bool use_pandas_metadata = *, + int64_t skip_rows = *, + size_type num_rows = *, + # disabled see comment in parquet.pyx for more + # ReaderColumnSchema reader_column_schema = *, + # DataType timestamp_type = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx new file mode 100644 index 00000000000..96119e1b714 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from cython.operator cimport dereference +from libc.stdint cimport int64_t +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.expressions cimport Expression +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.expressions cimport expression +from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, + parquet_reader_options, + read_parquet as cpp_read_parquet, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cdef parquet_reader_options _setup_parquet_reader_options( + SourceInfo source_info, + list columns = None, + list row_groups = None, + Expression filters = None, + bool convert_strings_to_categories = False, + bool use_pandas_metadata = True, + int64_t skip_rows = 0, + size_type num_rows = -1, + # ReaderColumnSchema reader_column_schema = None, + # DataType timestamp_type = DataType(type_id.EMPTY) +): + cdef vector[string] col_vec + cdef parquet_reader_options opts = ( + parquet_reader_options.builder(source_info.c_obj) + .convert_strings_to_categories(convert_strings_to_categories) + .use_pandas_metadata(use_pandas_metadata) + .use_arrow_schema(True) + .build() + ) + if row_groups is not None: + opts.set_row_groups(row_groups) + if num_rows != -1: + opts.set_num_rows(num_rows) + if skip_rows != 0: + opts.set_skip_rows(skip_rows) + if columns is not None: + col_vec.reserve(len(columns)) + for col in columns: + col_vec.push_back(str(col).encode()) + opts.set_columns(col_vec) + if filters is not None: + opts.set_filter(dereference(filters.c_obj.get())) + return opts + + +cdef class ChunkedParquetReader: + """ + Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`. 
+ + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the Parquet file from. + columns : list, default None + The names of the columns to be read + row_groups : list[list[size_type]], default None + List of row groups to be read. + use_pandas_metadata : bool, default True + If True, return metadata about the index column in + the per-file user metadata of the ``TableWithMetadata`` + convert_strings_to_categories : bool, default False + Whether to convert string columns to the category type + skip_rows : int64_t, default 0 + The number of rows to skip from the start of the file. + num_rows : size_type, default -1 + The number of rows to read. By default, read the entire file. + chunk_read_limit : size_t, default 0 + Limit on total number of bytes to be returned per read, + or 0 if there is no limit. + pass_read_limit : size_t, default 1024000000 + Limit on the amount of memory used for reading and decompressing data + or 0 if there is no limit. + """ + def __init__( + self, + SourceInfo source_info, + list columns=None, + list row_groups=None, + bool use_pandas_metadata=True, + bool convert_strings_to_categories=False, + int64_t skip_rows = 0, + size_type num_rows = -1, + size_t chunk_read_limit=0, + size_t pass_read_limit=1024000000 + ): + + cdef parquet_reader_options opts = _setup_parquet_reader_options( + source_info, + columns, + row_groups, + filters=None, + convert_strings_to_categories=convert_strings_to_categories, + use_pandas_metadata=use_pandas_metadata, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + with nogil: + self.reader.reset( + new cpp_chunked_parquet_reader( + chunk_read_limit, + pass_read_limit, + opts + ) + ) + + cpdef bool has_next(self): + """ + Returns True if there is another chunk in the Parquet file + to be read. + + Returns + ------- + True if we have not finished reading the file. 
+ """ + with nogil: + return self.reader.get()[0].has_next() + + cpdef TableWithMetadata read_chunk(self): + """ + Read the next chunk into a :py:class:`~.types.TableWithMetadata` + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + # Read Parquet + cdef table_with_metadata c_result + + with nogil: + c_result = move(self.reader.get()[0].read_chunk()) + + return TableWithMetadata.from_libcudf(c_result) + +cpdef read_parquet( + SourceInfo source_info, + list columns = None, + list row_groups = None, + Expression filters = None, + bool convert_strings_to_categories = False, + bool use_pandas_metadata = True, + int64_t skip_rows = 0, + size_type num_rows = -1, + # Disabled, these aren't used by cudf-python + # we should only add them back in if there's user demand + # ReaderColumnSchema reader_column_schema = None, + # DataType timestamp_type = DataType(type_id.EMPTY) +): + """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the Parquet file from. + columns : list, default None + The string names of the columns to be read. + row_groups : list[list[size_type]], default None + List of row groups to be read. + filters : Expression, default None + An AST :py:class:`cudf._lib.pylibcudf.expressions.Expression` + to use for predicate pushdown. + convert_strings_to_categories : bool, default False + Whether to convert string columns to the category type + use_pandas_metadata : bool, default True + If True, return metadata about the index column in + the per-file user metadata of the ``TableWithMetadata`` + skip_rows : int64_t, default 0 + The number of rows to skip from the start of the file. + num_rows : size_type, default -1 + The number of rows to read. By default, read the entire file. 
+ + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef table_with_metadata c_result + cdef parquet_reader_options opts = _setup_parquet_reader_options( + source_info, + columns, + row_groups, + filters, + convert_strings_to_categories, + use_pandas_metadata, + skip_rows, + num_rows, + ) + + with nogil: + c_result = move(cpp_read_parquet(opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 68498ff88f4..95fa7d4c2ee 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -122,6 +122,14 @@ cdef class TableWithMetadata: out.metadata = tbl_with_meta.metadata return out + @property + def per_file_user_data(self): + """ + Returns a list containing a dict + containing file-format specific metadata, + for each file being read in. + """ + return self.metadata.per_file_user_data cdef class SourceInfo: """A class containing details on a source to read from. diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index c38f39f7749..d86915c7da9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport uint8_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.functional cimport reference_wrapper from libcpp.map cimport map @@ -27,8 +27,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: # setter + void set_filter(expression &filter) except + void set_columns(vector[string] col_names) except + + void set_num_rows(size_type val) except + void set_row_groups(vector[vector[size_type]] row_grp) except + + void set_skip_rows(int64_t val) except + void enable_use_arrow_schema(bool val) except + void enable_use_pandas_metadata(bool val) except + void set_timestamp_type(data_type type) except + @@ -49,6 +52,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_reader_options_builder& row_groups( vector[vector[size_type]] row_grp ) except + + parquet_reader_options_builder& convert_strings_to_categories( + bool val + ) except + parquet_reader_options_builder& use_pandas_metadata( bool val ) except + diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 0f0a240b5d0..7dab2f20100 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -929,12 +929,12 @@ def _read_parquet( f"following positional arguments: {list(args)}" ) if cudf.get_option("io.parquet.low_memory"): - return libparquet.ParquetReader( + return libparquet.read_parquet_chunked( filepaths_or_buffers, columns=columns, row_groups=row_groups, use_pandas_metadata=use_pandas_metadata, - ).read() + ) else: return libparquet.read_parquet( filepaths_or_buffers, diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index ed2c5ca06c9..e19ff58927f 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -7,6 +7,7 @@ import numpy as np import pyarrow as pa import pytest +from pyarrow.parquet import write_table as pq_write_table from cudf._lib import 
pylibcudf as plc from cudf._lib.pylibcudf.io.types import CompressionType @@ -103,25 +104,68 @@ def _make_fields_nullable(typ): return pa.list_(new_fields[0]) return typ + def _contains_type(parent_typ, typ_checker): + """ + Check whether the parent or one of the children + satisfies the typ_checker. + """ + if typ_checker(parent_typ): + return True + if pa.types.is_nested(parent_typ): + for i in range(parent_typ.num_fields): + if _contains_type(parent_typ.field(i).type, typ_checker): + return True + return False + if not check_field_nullability: rhs_type = _make_fields_nullable(rhs.type) rhs = rhs.cast(rhs_type) lhs_type = _make_fields_nullable(lhs.type) - lhs = rhs.cast(lhs_type) - - if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): - lhs_nans = pa.compute.is_nan(lhs) - rhs_nans = pa.compute.is_nan(rhs) - assert lhs_nans.equals(rhs_nans) - - if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): - # masks must be equal at this point - mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) - lhs = lhs.filter(mask) - rhs = rhs.filter(mask) + lhs = lhs.cast(lhs_type) - np.testing.assert_array_almost_equal(lhs, rhs) + assert lhs.type == rhs.type, f"{lhs.type} != {rhs.type}" + if _contains_type(lhs.type, pa.types.is_floating) and _contains_type( + rhs.type, pa.types.is_floating + ): + # Flatten nested arrays to liststo do comparisons if nested + # This is so we can do approximate comparisons + # for floats in numpy + def _flatten_arrays(arr): + if pa.types.is_nested(arr.type): + flattened = arr.flatten() + flat_arrs = [] + if isinstance(flattened, list): + for flat_arr in flattened: + flat_arrs += _flatten_arrays(flat_arr) + else: + flat_arrs = [flattened] + else: + flat_arrs = [arr] + return flat_arrs + + if isinstance(lhs, (pa.ListArray, pa.StructArray)): + lhs = _flatten_arrays(lhs) + rhs = _flatten_arrays(rhs) + else: + # Just a regular doublearray + lhs = [lhs] + rhs = [rhs] + + for lh_arr, rh_arr in zip(lhs, rhs): + # Check NaNs 
positions match + # and then filter out nans + lhs_nans = pa.compute.is_nan(lh_arr) + rhs_nans = pa.compute.is_nan(rh_arr) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lh_arr = lh_arr.filter(mask) + rh_arr = rh_arr.filter(mask) + + np.testing.assert_array_almost_equal(lh_arr, rh_arr) else: assert lhs.equals(rhs) @@ -276,6 +320,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs): df.to_json(path_or_buf, mode=mode, **kwargs) elif format == "csv": df.to_csv(path_or_buf, mode=mode, **kwargs) + elif format == "parquet": + # The conversion to pandas is lossy (doesn't preserve + # nested types) so we + # will just use pyarrow directly to write this + pq_write_table( + pa_table, + pa.PythonFile(path_or_buf) + if isinstance(path_or_buf, io.IOBase) + else path_or_buf, + ) if isinstance(path_or_buf, io.IOBase): path_or_buf.seek(0) return path_or_buf diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 4a7194a6d8d..945e1689229 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -170,6 +170,21 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +@pytest.fixture( + params=["a.txt", pathlib.Path("a.txt"), io.BytesIO], +) +def binary_source_or_sink(request, tmp_path): + fp_or_buf = request.param + if isinstance(fp_or_buf, str): + return f"{tmp_path}/{fp_or_buf}" + elif isinstance(fp_or_buf, os.PathLike): + return tmp_path.joinpath(fp_or_buf) + elif issubclass(fp_or_buf, io.IOBase): + # Must construct io.StringIO/io.BytesIO inside + # fixture, or we'll end up re-using it + return fp_or_buf() + + unsupported_types = { # Not supported by pandas # TODO: find a way to test these diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py 
new file mode 100644 index 00000000000..07d2ab3d69a --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( + ASTOperator, + ColumnNameReference, + ColumnReference, + Literal, + Operation, +) + +# Shared kwargs to pass to make_source +_COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} + + +@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]]) +def test_read_parquet_basic( + table_data, binary_source_or_sink, nrows_skiprows, columns +): + _, pa_table = table_data + nrows, skiprows = nrows_skiprows + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS + ) + + res = plc.io.parquet.read_parquet( + plc.io.SourceInfo([source]), + num_rows=nrows, + skip_rows=skiprows, + columns=columns, + ) + + if columns is not None: + pa_table = pa_table.select(columns) + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize( + "pa_filters,plc_filters", + [ + ( + pc.field("col_int64") >= 10, + Operation( + ASTOperator.GREATER_EQUAL, + ColumnNameReference("col_int64"), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + ), + ( + (pc.field("col_int64") >= 10) & (pc.field("col_double") < 0), + Operation( + ASTOperator.LOGICAL_AND, + Operation( + ASTOperator.GREATER_EQUAL, + ColumnNameReference("col_int64"), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + Operation( + ASTOperator.LESS, + ColumnNameReference("col_double"), + Literal(plc.interop.from_arrow(pa.scalar(0.0))), + ), + ), + ), + ( + (pc.field(0) == 10), + Operation( + 
ASTOperator.EQUAL, + ColumnReference(0), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + ), + ], +) +def test_read_parquet_filters( + table_data, binary_source_or_sink, pa_filters, plc_filters +): + _, pa_table = table_data + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS + ) + + plc_table_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo([source]), filters=plc_filters + ) + exp = read_table(source, filters=pa_filters) + assert_table_and_meta_eq( + exp, plc_table_w_meta, check_field_nullability=False + ) + + +# TODO: Test these options +# list row_groups = None, +# ^^^ This one is not tested since it's not in pyarrow/pandas, deprecate? +# bool convert_strings_to_categories = False, +# bool use_pandas_metadata = True diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index f2820d9c112..3806b901b10 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf._lib.parquet import ParquetReader +from cudf._lib.parquet import read_parquet_chunked from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3755,7 +3755,7 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer) - reader = ParquetReader( + actual = read_parquet_chunked( [buffer], chunk_read_limit=chunk_read_limit, pass_read_limit=pass_read_limit, @@ -3765,7 +3765,6 @@ def test_parquet_chunked_reader( expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) - actual = reader.read() assert_eq(expected, actual) From e6537de7474c91b4153542e6611c8a4e33a58caa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Jul 2024 20:10:40 -0700 Subject: [PATCH 29/44] Experimental support for configurable prefetching (#16020) This PR adds experimental support for prefetching managed memory at a select few points 
in libcudf. A new configuration object is introduced for handling whether prefetching is enabled or disabled, and whether to print debug information about pointers being prefetched. Prefetching control is managed on a per API basis to enable profiling of the effects of prefetching different classes of data in different contexts. Prefetching in this PR always occurs on the default stream, so it will trigger synchronization with any blocking streams that the user has created. Turning on prefetching and then passing non-blocking to any libcudf APIs will trigger undefined behavior. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Thomas Li (https://github.com/lithomas1) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16020 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/column/column_view.hpp | 54 ++++-- cpp/include/cudf/detail/join.hpp | 3 - cpp/include/cudf/strings/detail/gather.cuh | 7 +- .../cudf/strings/detail/strings_children.cuh | 2 + cpp/include/cudf/utilities/prefetch.hpp | 155 ++++++++++++++++++ cpp/src/column/column_view.cpp | 42 +++++ cpp/src/join/hash_join.cu | 2 + cpp/src/utilities/prefetch.cpp | 89 ++++++++++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 3 + .../cudf/cudf/_lib/pylibcudf/experimental.pxd | 10 ++ .../cudf/cudf/_lib/pylibcudf/experimental.pyx | 43 +++++ .../_lib/pylibcudf/libcudf/experimental.pxd | 16 ++ 15 files changed, 416 insertions(+), 15 deletions(-) create mode 100644 cpp/include/cudf/utilities/prefetch.hpp create mode 100644 cpp/src/utilities/prefetch.cpp create mode 100644 python/cudf/cudf/_lib/pylibcudf/experimental.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/experimental.pyx create mode 100644 
python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 65347bd6689..5e79204a558 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -674,6 +674,7 @@ add_library( src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp + src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 134e835911f..03352fdce13 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,9 @@ #pragma once #include +#include #include +#include #include #include #include @@ -72,7 +74,7 @@ class column_view_base { CUDF_ENABLE_IF(std::is_same_v or is_rep_layout_compatible())> T const* head() const noexcept { - return static_cast(_data); + return static_cast(get_data()); } /** @@ -225,6 +227,17 @@ class column_view_base { [[nodiscard]] size_type offset() const noexcept { return _offset; } protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Typed pointer to underlying data + */ + virtual void const* get_data() const noexcept { return _data; } + data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements void const* _data{}; ///< Pointer to device memory containing elements @@ -236,7 +249,7 @@ class column_view_base { ///< Enables zero-copy slicing column_view_base() = default; - ~column_view_base() = default; + virtual ~column_view_base() = default; column_view_base(column_view_base const&) = default; ///< Copy constructor column_view_base(column_view_base&&) = default; ///< Move constructor /** @@ -283,11 +296,6 @@ class column_view_base { size_type null_count, size_type offset = 0); }; - -class mutable_column_view_base : public column_view_base { - public: - protected: -}; } // namespace detail /** @@ -323,7 +331,7 @@ class column_view : public detail::column_view_base { #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif - ~column_view() = default; + ~column_view() override = default; #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif @@ -447,6 +455,18 @@ class column_view : public detail::column_view_base { return device_span(data(), size()); } + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Typed pointer to underlying data + */ + void const* get_data() const noexcept override; + private: friend column_view bit_cast(column_view const& input, data_type type); @@ -478,7 +498,7 @@ class mutable_column_view : public detail::column_view_base { public: mutable_column_view() = default; - ~mutable_column_view(){ + ~mutable_column_view() override{ // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed' // from a host+device function marking the implicit version also as host+device }; @@ -572,7 +592,7 @@ class mutable_column_view : public detail::column_view_base { } /** - * @brief Return first element (accounting for offset) when underlying data is + * @brief Return first element (accounting for offset) after underlying data is * casted to the specified type. * * This function does not participate in overload resolution if `is_rep_layout_compatible` is @@ -665,6 +685,18 @@ class mutable_column_view : public detail::column_view_base { */ operator column_view() const; + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Typed pointer to underlying data + */ + void const* get_data() const noexcept override; + private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index aabfff746ea..b4ec5f2cc69 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -40,9 +40,6 @@ class preprocessed_table; namespace cudf { namespace detail { -// Forward declaration -class cuco_allocator; - constexpr int DEFAULT_JOIN_CG_SIZE = 2; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index fcd74bebfe8..4369de317b3 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -230,7 +232,8 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, if (output_count == 0) return rmm::device_uvector(0, stream, mr); auto chars_data = rmm::device_uvector(chars_bytes, stream, mr); - auto d_chars = chars_data.data(); + cudf::experimental::prefetch::detail::prefetch("gather", chars_data, stream); + auto d_chars = chars_data.data(); constexpr int warps_per_threadblock = 4; // String parallel strategy will be used if average string length is above this threshold. 
@@ -312,6 +315,8 @@ std::unique_ptr gather(strings_column_view const& strings, // build chars column auto const offsets_view = cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view()); + cudf::experimental::prefetch::detail::prefetch( + "gather", strings.chars_begin(stream), strings.chars_size(stream), stream); auto out_chars_data = gather_chars( d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index f5f3982a5d6..55b59dd4ff2 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); + cudf::experimental::prefetch::detail::prefetch("gather", chars, stream); size_and_exec_fn.d_chars = chars.data(); // Execute the function fn again to fill in the chars data. diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp new file mode 100644 index 00000000000..5ca6fd6f4b0 --- /dev/null +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include +#include +#include + +namespace cudf::experimental::prefetch { + +namespace detail { + +/** + * @brief A singleton class that manages the prefetching configuration. + */ +class PrefetchConfig { + public: + PrefetchConfig& operator=(const PrefetchConfig&) = delete; + PrefetchConfig(const PrefetchConfig&) = delete; + + /** + * @brief Get the singleton instance of the prefetching configuration. + * + * @return The singleton instance of the prefetching configuration. + */ + static PrefetchConfig& instance(); + + /** + * @brief Get the value of a configuration key. + * + * @param key The configuration key. + * @return The value of the configuration key. + */ + bool get(std::string_view key); + /** + * @brief Set the value of a configuration key. + * + * @param key The configuration key. + * @param value The value to set. + */ + void set(std::string_view key, bool value); + /** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + */ + bool debug{false}; + + private: + PrefetchConfig() = default; //< Private constructor to enforce singleton pattern + std::map config_values; //< Map of configuration keys to values +}; + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @param key The key to enable prefetching for. + * @param ptr The pointer to prefetch. + * @param size The size of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()); + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @note This function will not throw exceptions, so it is safe to call in + * noexcept contexts. If an error occurs, the error code is returned. 
This + * function primarily exists for [mutable_]column_view::get_data and should be + * removed once a method for stream-ordered data pointer access is added to + * those data structures. + * + * @param key The key to enable prefetching for. + * @param ptr The pointer to prefetch. + * @param size The size of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +cudaError_t prefetch_noexcept( + std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) noexcept; + +/** + * @brief Prefetch the data in a device_uvector. + * + * @note At present this function does not support stream-ordered execution. Prefetching always + * occurs on the default stream. + * + * @param key The key to enable prefetching for. + * @param v The device_uvector to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +template +void prefetch(std::string_view key, + rmm::device_uvector const& v, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) +{ + if (v.is_empty()) { return; } + prefetch(key, v.data(), v.size(), stream, device_id); +} + +} // namespace detail + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @param key The key to enable prefetching for. + */ +void enable_prefetching(std::string_view key); + +/** + * @brief Disable prefetching for a particular structure or algorithm. + * + * @param key The key to disable prefetching for. + */ +void disable_prefetching(std::string_view key); + +/** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + * + * @param enable Whether to enable or disable debug mode. 
+ */ +void prefetch_debugging(bool enable); + +} // namespace cudf::experimental::prefetch diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 4d16298c605..a9605efb362 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,8 +15,10 @@ */ #include +#include #include #include +#include #include #include #include @@ -27,10 +29,37 @@ #include #include #include +#include #include namespace cudf { namespace detail { +namespace { + +template +void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept +{ + if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) { + if (cudf::is_fixed_width(col.type())) { + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); + } else if (col.type().id() == type_id::STRING) { + strings_column_view scv{col}; + + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, + data_ptr, + scv.chars_size(cudf::get_default_stream()) * sizeof(char), + cudf::get_default_stream()); + } else { + std::cout << key << ": Unsupported type: " << static_cast(col.type().id()) + << std::endl; + } + } +} + +} // namespace + column_view_base::column_view_base(data_type type, size_type size, void const* data, @@ -126,6 +155,7 @@ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { return shallow_equivalent_impl(lhs, rhs); } + } // namespace detail // Immutable view constructor @@ -175,6 +205,18 @@ mutable_column_view::operator column_view() const return column_view{_type, _size, _data, _null_mask, _null_count, _offset, std::move(child_views)}; } +void const* column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "column_view::get_data"); + return _data; +} + +void const* mutable_column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "mutable_column_view::get_data"); + return 
_data; +} + size_type count_descendants(column_view parent) { auto descendants = [](auto const& child) { return count_descendants(child); }; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b0184ff6a86..eb9b687630b 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -185,6 +185,8 @@ probe_join_hash_table( auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); + cudf::experimental::prefetch::detail::prefetch("hash_join", *left_indices, stream); + cudf::experimental::prefetch::detail::prefetch("hash_join", *right_indices, stream); auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp new file mode 100644 index 00000000000..21f2e40c82a --- /dev/null +++ b/cpp/src/utilities/prefetch.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +namespace cudf::experimental::prefetch { + +namespace detail { + +PrefetchConfig& PrefetchConfig::instance() +{ + static PrefetchConfig instance; + return instance; +} + +bool PrefetchConfig::get(std::string_view key) +{ + // Default to not prefetching + if (config_values.find(key.data()) == config_values.end()) { + return (config_values[key.data()] = false); + } + return config_values[key.data()]; +} +void PrefetchConfig::set(std::string_view key, bool value) { config_values[key.data()] = value; } + +cudaError_t prefetch_noexcept(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) noexcept +{ + if (PrefetchConfig::instance().get(key)) { + if (PrefetchConfig::instance().debug) { + std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr + << std::endl; + } + auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); + // Need to flush the CUDA error so that the context is not corrupted. + if (result == cudaErrorInvalidValue) { cudaGetLastError(); } + return result; + } + return cudaSuccess; +} + +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) +{ + auto result = prefetch_noexcept(key, ptr, size, stream, device_id); + // Ignore cudaErrorInvalidValue because that will be raised if prefetching is + // attempted on unmanaged memory. 
+ if ((result != cudaErrorInvalidValue) && (result != cudaSuccess)) { + std::cerr << "Prefetch failed" << std::endl; + CUDF_CUDA_TRY(result); + } +} + +} // namespace detail + +void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); } + +void disable_prefetching(std::string_view key) +{ + detail::PrefetchConfig::instance().set(key, false); +} + +void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; } +} // namespace cudf::experimental::prefetch diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0800fa18e94..df4591baa71 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + experimental.pyx expressions.pyx filling.pyx gpumemoryview.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 26e89b818d3..71f523fc3cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . 
cimport ( concatenate, copying, datetime, + experimental, expressions, filling, groupby, @@ -48,6 +49,8 @@ __all__ = [ "concatenate", "copying", "datetime", + "experimental", + "expressions", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index e89a5ed9f96..9705eba84b1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + experimental, expressions, filling, groupby, @@ -48,6 +49,8 @@ "concatenate", "copying", "datetime", + "experimental", + "expressions", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd new file mode 100644 index 00000000000..107c91c8365 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + + +cpdef enable_prefetching(str key) + +cpdef disable_prefetching(str key) + +cpdef prefetch_debugging(bool enable) diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx new file mode 100644 index 00000000000..1e2a682d879 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf cimport experimental as cpp_experimental + + +cpdef enable_prefetching(str key): + """Turn on prefetch instructions for the given key. + + Parameters + ---------- + key : str + The key to enable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.enable_prefetching(c_key) + + +cpdef disable_prefetching(str key): + """Turn off prefetch instructions for the given key. 
+ + Parameters + ---------- + key : str + The key to disable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.disable_prefetching(c_key) + + +cpdef prefetch_debugging(bool enable): + """Enable or disable prefetch debugging. + + When enabled, any prefetch instructions will be logged to the console. + + Parameters + ---------- + enable : bool + Whether to enable or disable prefetch debugging. + """ + cpp_experimental.prefetch_debugging(enable) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd new file mode 100644 index 00000000000..f280a382a04 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string + + +cdef extern from "cudf/utilities/prefetch.hpp" \ + namespace "cudf::experimental::prefetch" nogil: + # Not technically the right signature, but it's good enough to let Cython + # generate valid C++ code. It just means we'll be copying a host string + # extra, but that's OK. If we care we could generate string_view bindings, + # but there's no real rush so if we go that route we might as well + # contribute them upstream to Cython itself. + void enable_prefetching(string key) + void disable_prefetching(string key) + void prefetch_debugging(bool enable) From 852b151002dc76e9f09d3529c80e4b589f1df9fc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 22 Jul 2024 14:48:18 +0100 Subject: [PATCH 30/44] Fix issue in horizontal concat implementation in cudf-polars (#16271) Shorter tables must be extended to the same length as the longest table. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16271 --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 22 +++++ .../libcudf/scalar/scalar_factories.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 20 ++++ python/cudf_polars/cudf_polars/dsl/ir.py | 39 ++++++++ .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- python/cudf_polars/tests/test_hconcat.py | 9 ++ python/cudf_polars/tests/test_join.py | 93 ++++++++++--------- python/cudf_polars/tests/utils/test_dtypes.py | 1 + 9 files changed, 147 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index cb96c1d9fce..a61e0629292 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -255,6 +255,28 @@ cdef class Column: c_result = move(make_column_from_scalar(dereference(c_scalar), size)) return Column.from_libcudf(move(c_result)) + @staticmethod + def all_null_like(Column like, size_type size): + """Create an all null column from a template. + + Parameters + ---------- + like : Column + Column whose type we should mimic + size : int + Number of rows in the resulting column. + + Returns + ------- + Column + An all-null column of `size` rows and type matching `like`. + """ + cdef Scalar slr = Scalar.empty_like(like) + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + return Column.from_libcudf(move(c_result)) + @staticmethod def from_cuda_array_interface_obj(object obj): """Create a Column from an object with a CUDA array interface. 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd index c8220df8938..8092c3d637d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -3,9 +3,12 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except + + + cdef unique_ptr[scalar] make_empty_scalar_like(const column_view &) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 3de86d93519..e6c9db2f1ac 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -7,6 +7,7 @@ from rmm._lib.memory_resource cimport DeviceMemoryResource from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar +from .column cimport Column from .types cimport DataType @@ -24,5 +25,8 @@ cdef class Scalar: cpdef DataType type(self) cpdef bool is_valid(self) + @staticmethod + cdef Scalar empty_like(Column column) + @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 6799c37cea2..67730be07d8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -2,11 +2,16 @@ from cython cimport no_gc_clear from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource from cudf._lib.pylibcudf.libcudf.scalar.scalar 
cimport scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_empty_scalar_like, +) +from .column cimport Column from .types cimport DataType @@ -46,6 +51,21 @@ cdef class Scalar: """True if the scalar is valid, false if not""" return self.get().is_valid() + @staticmethod + cdef Scalar empty_like(Column column): + """Construct a null scalar with the same type as column. + + Parameters + ---------- + column + Column to take type from + + Returns + ------- + New empty (null) scalar of the given type. + """ + return Scalar.from_libcudf(move(make_empty_scalar_like(column.view()))) + @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): """Construct a Scalar object from a libcudf scalar. diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a84fe73810e..b934869ffef 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1101,9 +1101,48 @@ class HConcat(IR): dfs: list[IR] """List of inputs.""" + @staticmethod + def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: + """ + Extend a table with nulls. + + Parameters + ---------- + table + Table to extend + nrows + Number of additional rows + + Returns + ------- + New pylibcudf table. 
+ """ + return plc.concatenate.concatenate( + [ + table, + plc.Table( + [ + plc.Column.all_null_like(column, nrows) + for column in table.columns() + ] + ), + ] + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] + max_rows = max(df.num_rows for df in dfs) + # Horizontal concatenation extends shorter tables with nulls + dfs = [ + df + if df.num_rows == max_rows + else DataFrame.from_table( + self._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), + df.column_names, + ) + for df in dfs + ] return DataFrame( list(itertools.chain.from_iterable(df.columns for df in dfs)), ) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 918cd024fa2..1279fe91d48 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -153,7 +153,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): - # TODO: This doesn't consider the value type. 
+ # Recurse to catch unsupported inner types + _ = from_polars(dtype.inner) return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py index 46cbb21b25a..4737aa18028 100644 --- a/python/cudf_polars/tests/test_hconcat.py +++ b/python/cudf_polars/tests/test_hconcat.py @@ -17,3 +17,12 @@ def test_hconcat(): ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c")) query = pl.concat([ldf, ldf2], how="horizontal") assert_gpu_result_equal(query) + + +def test_hconcat_different_heights(): + left = pl.LazyFrame({"a": [1, 2, 3, 4]}) + + right = pl.LazyFrame({"b": [[1], [2]], "c": ["a", "bcde"]}) + + q = pl.concat([left, right], how="horizontal") + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1ffbf3c0ef4..1e880cdc6de 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -12,65 +12,68 @@ ) -@pytest.mark.parametrize( - "how", - [ - "inner", - "left", - "semi", - "anti", - "full", - ], -) -@pytest.mark.parametrize("coalesce", [False, True]) -@pytest.mark.parametrize( - "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] -) -@pytest.mark.parametrize( - "join_expr", - [ - pl.col("a"), - pl.col("a") * 2, - [pl.col("a"), pl.col("c") + 1], - ["c", "a"], - ], -) -def test_join(how, coalesce, join_nulls, join_expr): - left = pl.DataFrame( +@pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) +def join_nulls(request): + return request.param + + +@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +def how(request): + return request.param + + +@pytest.fixture +def left(): + return pl.LazyFrame( { "a": [1, 2, 3, 1, None], "b": [1, 2, 3, 4, 5], "c": [2, 3, 4, 5, 6], } - ).lazy() - right = pl.DataFrame( + ) + + +@pytest.fixture +def right(): + return pl.LazyFrame( { "a": 
[1, 4, 3, 7, None, None], "c": [2, 3, 4, 5, 6, 7], } - ).lazy() + ) + +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + pl.col("a") * 2, + [pl.col("a"), pl.col("c") + 1], + ["c", "a"], + ], +) +def test_non_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( - right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=False ) assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(): - left = pl.DataFrame( - { - "a": [1, 2, 3, 1, None], - "b": [1, 2, 3, 4, 5], - "c": [2, 3, 4, 5, 6], - } - ).lazy() - right = pl.DataFrame( - { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], - } - ).lazy() +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + ["c", "a"], + ], +) +def test_coalesce_join(left, right, how, join_nulls, join_expr): + query = left.join( + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True + ) + assert_gpu_result_equal(query, check_row_order=False) + +def test_cross_join(left, right): q = left.join(right, how="cross") assert_gpu_result_equal(q) @@ -79,9 +82,7 @@ def test_cross_join(): @pytest.mark.parametrize( "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] ) -def test_join_literal_key_unsupported(left_on, right_on): - left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]}) +def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py index 535fdd846a0..bbdb4faa256 100644 --- a/python/cudf_polars/tests/utils/test_dtypes.py +++ b/python/cudf_polars/tests/utils/test_dtypes.py @@ -16,6 +16,7 @@ pl.Time(), pl.Struct({"a": pl.Int8, "b": pl.Float32}), pl.Datetime("ms", 
time_zone="US/Pacific"), + pl.List(pl.Datetime("ms", time_zone="US/Pacific")), pl.Array(pl.Int8, 2), pl.Binary(), pl.Categorical(), From 135c99512e5f7a2d38f6a870ad6883ccb39a3cce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jul 2024 04:13:32 -1000 Subject: [PATCH 31/44] Align Series APIs with pandas 2.x (#16333) Similar to https://github.com/rapidsai/cudf/pull/16310, the follow APIs have been modified to adjust/add parameters * `reindex` * `reset_index` * `add_suffix` * `searchsorted` * `clip` * `mask` * `shift` * `dropna` * `rename` * `cov` * `apply` * `replace` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16333 --- python/cudf/cudf/core/dataframe.py | 19 ++- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/indexed_frame.py | 87 +++++++++++-- python/cudf/cudf/core/series.py | 164 +++++++++++++++++++++---- 4 files changed, 240 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dbc7f10b569..288bdfd39b3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2844,6 +2844,10 @@ def reindex( index=index, inplace=False, fill_value=fill_value, + level=level, + method=method, + limit=limit, + tolerance=tolerance, ) @_performance_tracking @@ -3187,7 +3191,14 @@ class speed type ) ) def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + self, + level=None, + drop=False, + inplace=False, + col_level=0, + col_fill="", + allow_duplicates: bool = False, + names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, ): return self._mimic_inplace( DataFrame._from_data( @@ -3196,6 +3207,8 @@ def reset_index( drop=drop, col_level=col_level, col_fill=col_fill, + allow_duplicates=allow_duplicates, + names=names, ) ), 
inplace=inplace, @@ -3666,7 +3679,9 @@ def add_prefix(self, prefix, axis=None): return out @_performance_tracking - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 04ecae4ba85..32c313e42d3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1187,6 +1187,7 @@ def searchsorted( self, values, side: Literal["left", "right"] = "left", + sorter=None, ascending: bool = True, na_position: Literal["first", "last"] = "last", ) -> ScalarLike | cupy.ndarray: @@ -1199,6 +1200,10 @@ def searchsorted( side : str {'left', 'right'} optional, default 'left' If 'left', the index of the first suitable location found is given If 'right', return the last such index + sorter : 1-D array-like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + Currently not supported. 
ascending : bool optional, default True Sorted Frame is in ascending order (otherwise descending) na_position : str {'last', 'first'} optional, default 'last' @@ -1245,10 +1250,12 @@ def searchsorted( >>> df.searchsorted(values_df, ascending=False) array([4, 4, 4, 0], dtype=int32) """ - # Call libcudf search_sorted primitive + # Note: pandas.DataFrame does not support searchsorted if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") + elif sorter is not None: + raise NotImplementedError("sorter is currently not supported.") scalar_flag = None if is_scalar(values): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e75b51e0d43..e14f8923c25 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -84,6 +84,9 @@ {argument} inplace : bool, default False Modify the DataFrame in place (do not create a new object). + allow_duplicates : bool, default False + Allow duplicate column labels to be created. + Currently not supported. Returns ------- @@ -902,7 +905,7 @@ def replace( return self._mimic_inplace(result, inplace=inplace) @_performance_tracking - def clip(self, lower=None, upper=None, inplace=False, axis=1): + def clip(self, lower=None, upper=None, axis=1, inplace=False): """ Trim values at input threshold(s). @@ -1779,7 +1782,14 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): ) @_performance_tracking - def mask(self, cond, other=None, inplace: bool = False) -> Self | None: + def mask( + self, + cond, + other=None, + inplace: bool = False, + axis=None, + level=None, + ) -> Self | None: """ Replace values where the condition is True. 
@@ -1831,6 +1841,10 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None: 4 0 dtype: int64 """ + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") if not hasattr(cond, "__invert__"): # We Invert `cond` below and call `where`, so @@ -2042,13 +2056,26 @@ def interpolate( ) @_performance_tracking - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self, + periods=1, + freq=None, + axis=0, + fill_value=None, + suffix: str | None = None, + ): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) if axis != 0: - raise ValueError("Only axis=0 is supported.") + raise NotImplementedError("Only axis=0 is supported.") if freq is not None: - raise ValueError("The freq argument is not yet supported.") + raise NotImplementedError( + "The freq argument is not yet supported." + ) + if suffix is not None: + raise NotImplementedError( + "The suffix argument is not yet supported." + ) data_columns = ( col.shift(periods, fill_value) for col in self._columns @@ -3225,7 +3252,9 @@ def _split(self, splits, keep_index=True): ] @_performance_tracking - def bfill(self, value=None, axis=None, inplace=None, limit=None): + def bfill( + self, value=None, axis=None, inplace=None, limit=None, limit_area=None + ): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3233,6 +3262,9 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. 
""" + if limit_area is not None: + raise NotImplementedError("limit_area is currently not supported.") + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) return self.fillna( @@ -3264,7 +3296,14 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None): return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit) @_performance_tracking - def ffill(self, value=None, axis=None, inplace=None, limit=None): + def ffill( + self, + value=None, + axis=None, + inplace=None, + limit=None, + limit_area: Literal["inside", "outside", None] = None, + ): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. @@ -3272,6 +3311,9 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. """ + if limit_area is not None: + raise NotImplementedError("limit_area is currently not supported.") + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) return self.fillna( @@ -3363,7 +3405,7 @@ def add_prefix(self, prefix, axis=None): Use `Series.add_prefix` or `DataFrame.add_prefix`" ) - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): """ Suffix labels with string `suffix`. 
@@ -3653,6 +3695,10 @@ def _reindex( index=None, inplace=False, fill_value=NA, + level=None, + method=None, + limit=None, + tolerance=None, ): """ Helper for `.reindex` @@ -3677,6 +3723,15 @@ def _reindex( ------- Series or DataFrame """ + if method is not None: + raise NotImplementedError("method is not currently supported.") + if level is not None: + raise NotImplementedError("level is not currently supported.") + if limit is not None: + raise NotImplementedError("limit is not currently supported.") + if tolerance is not None: + raise NotImplementedError("tolerance is not currently supported.") + if dtypes is None: dtypes = {} @@ -4303,8 +4358,22 @@ def take(self, indices, axis=0): return self._gather(GatherMap(indices, len(self), nullify=False)) - def _reset_index(self, level, drop, col_level=0, col_fill=""): + def _reset_index( + self, + level, + drop, + col_level=0, + col_fill="", + allow_duplicates: bool = False, + names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, + ): """Shared path for DataFrame.reset_index and Series.reset_index.""" + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is not currently supported." + ) + elif names is not None: + raise NotImplementedError("names is not currently supported.") if level is not None: if ( isinstance(level, int) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 94c33eed37a..8277ccf68fc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -918,7 +918,18 @@ def to_dict(self, into: type[dict] = dict) -> dict: return self.to_pandas().to_dict(into=into) @_performance_tracking - def reindex(self, *args, **kwargs): + def reindex( + self, + index=None, + *, + axis=None, + method: str | None = None, + copy: bool = True, + level=None, + fill_value: ScalarLike | None = None, + limit: int | None = None, + tolerance=None, + ): """ Conform Series to new index. 
@@ -927,6 +938,8 @@ def reindex(self, *args, **kwargs): index : Index, Series-convertible, default None New labels / index to conform to, should be specified using keywords. + axis: int, default None + Unused. method: Not Supported copy : boolean, default True level: Not Supported @@ -965,27 +978,23 @@ def reindex(self, *args, **kwargs): where it is cast to float in Pandas. """ - if len(args) > 1: - raise TypeError( - "Only one positional argument ('index') is allowed" - ) - if args: - (index,) = args - if "index" in kwargs: - raise TypeError( - "'index' passed as both positional and keyword argument" - ) - else: - index = kwargs.get("index", self.index) + if index is None: + index = self.index + if fill_value is None: + fill_value = cudf.NA name = self.name or 0 series = self._reindex( - deep=kwargs.get("copy", True), + deep=copy, dtypes={name: self.dtype}, index=index, column_names=[name], inplace=False, - fill_value=kwargs.get("fill_value", cudf.NA), + fill_value=fill_value, + level=level, + method=method, + limit=limit, + tolerance=tolerance, ) series.name = self.name return series @@ -1054,14 +1063,21 @@ def reindex(self, *args, **kwargs): ) ) def reset_index( - self, level=None, drop=False, name=no_default, inplace=False + self, + level=None, + drop=False, + name=no_default, + inplace=False, + allow_duplicates=False, ): if not drop and inplace: raise TypeError( "Cannot reset_index inplace on a Series " "to create a DataFrame" ) - data, index = self._reset_index(level=level, drop=drop) + data, index = self._reset_index( + level=level, drop=drop, allow_duplicates=allow_duplicates + ) if not drop: if name is no_default: name = 0 if self.name is None else self.name @@ -1632,7 +1648,9 @@ def has_nulls(self): return self._column.has_nulls() @_performance_tracking - def dropna(self, axis=0, inplace=False, how=None): + def dropna( + self, axis=0, inplace=False, how=None, ignore_index: bool = False + ): """ Return a Series with null values removed. 
@@ -1644,6 +1662,8 @@ def dropna(self, axis=0, inplace=False, how=None): If True, do operation inplace and return None. how : str, optional Not in use. Kept for compatibility. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- @@ -1709,6 +1729,9 @@ def dropna(self, axis=0, inplace=False, how=None): result = super().dropna(axis=axis) + if ignore_index: + result.index = RangeIndex(len(result)) + return self._mimic_inplace(result, inplace=inplace) @_performance_tracking @@ -2046,10 +2069,31 @@ def astype( return super().astype(dtype, copy, errors) @_performance_tracking - def sort_index(self, axis=0, *args, **kwargs): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + key=None, + ): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") - return super().sort_index(axis=axis, *args, **kwargs) + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) @_performance_tracking def sort_values( @@ -2278,14 +2322,29 @@ def argsort( ) @_performance_tracking - def replace(self, to_replace=None, value=no_default, *args, **kwargs): + def replace( + self, + to_replace=None, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, + ): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( "Series.replace cannot use dict-like to_replace and non-None " "value" ) - return super().replace(to_replace, value, *args, **kwargs) + return super().replace( + to_replace, + value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) @_performance_tracking def update(self, other): @@ -2394,7 +2453,14 @@ def update(self, other): # UDF related 
@_performance_tracking - def apply(self, func, convert_dtype=True, args=(), **kwargs): + def apply( + self, + func, + convert_dtype=True, + args=(), + by_row: Literal[False, "compat"] = "compat", + **kwargs, + ): """ Apply a scalar function to the values of a Series. Similar to ``pandas.Series.apply``. @@ -2421,6 +2487,18 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): See examples for details. args : tuple Positional arguments passed to func after the series value. + by_row : False or "compat", default "compat" + If ``"compat"`` and func is a callable, func will be passed each element of + the Series, like ``Series.map``. If func is a list or dict of + callables, will first try to translate each func into pandas methods. If + that doesn't work, will try call to apply again with ``by_row="compat"`` + and if that fails, will call apply again with ``by_row=False`` + (backward compatible). + If False, the func will be passed the whole Series at once. + + ``by_row`` has no effect when ``func`` is a string. + + Currently not implemented. **kwargs Not supported @@ -2530,6 +2608,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): """ if convert_dtype is not True: raise ValueError("Series.apply only supports convert_dtype=True") + elif by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") result = self._apply(func, _get_scalar_kernel, *args, **kwargs) result.name = self.name @@ -2643,7 +2723,7 @@ def round(self, decimals=0, how="half_even"): return super().round(decimals, how) @_performance_tracking - def cov(self, other, min_periods=None): + def cov(self, other, min_periods=None, ddof: int | None = None): """ Compute covariance with Series, excluding missing values. 
@@ -2676,6 +2756,8 @@ def cov(self, other, min_periods=None): raise NotImplementedError( "min_periods parameter is not implemented yet" ) + if ddof is not None: + raise NotImplementedError("ddof parameter is not implemented yet") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -3389,7 +3471,15 @@ def groupby( ) @_performance_tracking - def rename(self, index=None, copy=True): + def rename( + self, + index=None, + axis=None, + copy: bool = True, + inplace: bool = False, + level=None, + errors: Literal["ignore", "raise"] = "ignore", + ): """ Alter Series name @@ -3399,8 +3489,21 @@ def rename(self, index=None, copy=True): ---------- index : Scalar, optional Scalar to alter the Series.name attribute + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. copy : boolean, default True Also copy underlying data + inplace : bool, default False + Whether to return a new Series. If True the value of copy is ignored. + Currently not supported. + level : int or level name, default None + In case of MultiIndex, only rename labels in the specified level. + Currently not supported. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise `KeyError` when a `dict-like mapper` or + `index` contains labels that are not present in the index being transformed. + If 'ignore', existing keys will be renamed and extra keys will be ignored. + Currently not supported. 
Returns ------- @@ -3429,8 +3532,13 @@ def rename(self, index=None, copy=True): :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - - The ``inplace`` and ``level`` is not supported """ + if inplace is not False: + raise NotImplementedError("inplace is currently not supported.") + if level is not None: + raise NotImplementedError("level is currently not supported.") + if errors != "ignore": + raise NotImplementedError("errors is currently not supported.") out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @@ -3445,7 +3553,9 @@ def add_prefix(self, prefix, axis=None): ) @_performance_tracking - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), From 3053f42351b04e22d873f78f5bc49f8b20ff17ac Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Mon, 22 Jul 2024 10:56:39 -0700 Subject: [PATCH 32/44] Add missing `stream` param to dictionary factory APIs (#16319) Add `stream` param to dictionary column factory functions. 
Partially solves #13744 Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Mark Harris (https://github.com/harrism) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16319 --- .../cudf/dictionary/dictionary_factories.hpp | 13 ++++-- cpp/src/dictionary/dictionary_factories.cu | 13 ++++-- cpp/tests/streams/dictionary_test.cpp | 46 +++++++++++++++++++ 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index 7cdfa3bf9e5..21f593e1aec 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -87,12 +87,17 @@ std::unique_ptr make_dictionary_column( * @param indices_column Indices to use for the new dictionary column. * @param null_mask Null mask for the output column. * @param null_count Number of nulls for the output column. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ -std::unique_ptr make_dictionary_column(std::unique_ptr keys_column, - std::unique_ptr indices_column, - rmm::device_buffer&& null_mask, - size_type null_count); +std::unique_ptr make_dictionary_column( + std::unique_ptr keys_column, + std::unique_ptr indices_column, + rmm::device_buffer&& null_mask, + size_type null_count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column by taking ownership of the provided keys diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 37f8fa7a05b..0617d71fa51 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -77,7 +77,9 @@ std::unique_ptr make_dictionary_column(column_view const& keys_column, std::unique_ptr make_dictionary_column(std::unique_ptr keys_column, std::unique_ptr indices_column, rmm::device_buffer&& null_mask, - size_type null_count) + size_type null_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls"); CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls"); @@ -89,7 +91,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu children.emplace_back(std::move(keys_column)); return std::make_unique(data_type{type_id::DICTIONARY32}, count, - rmm::device_buffer{}, + rmm::device_buffer{0, stream, mr}, std::move(null_mask), null_count, std::move(children)); @@ -134,8 +136,11 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys, auto indices_column = [&] { // If the types match, then just commandeer the column's data buffer. 
if (new_type.id() == indices_type) { - return std::make_unique( - new_type, indices_size, std::move(*(contents.data.release())), rmm::device_buffer{}, 0); + return std::make_unique(new_type, + indices_size, + std::move(*(contents.data.release())), + rmm::device_buffer{0, stream, mr}, + 0); } // If the new type does not match, then convert the data. cudf::column_view cast_view{ diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp index 9e81c8574b8..03e4cf47470 100644 --- a/cpp/tests/streams/dictionary_test.cpp +++ b/cpp/tests/streams/dictionary_test.cpp @@ -26,6 +26,52 @@ class DictionaryTest : public cudf::test::BaseFixture {}; +TEST_F(DictionaryTest, FactoryColumnViews) +{ + cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + + auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values); +} + +TEST_F(DictionaryTest, FactoryColumns) +{ + std::vector h_keys{"aaa", "ccc", "ddd", "www"}; + cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + + auto dictionary = cudf::make_dictionary_column( + keys.release(), values.release(), cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); +} + +TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) +{ + 
std::vector h_keys{"aaa", "ccc", "ddd", "www"}; + cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + + auto dictionary = cudf::make_dictionary_column( + keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); +} + TEST_F(DictionaryTest, Encode) { cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); From e54b82c9f3499b35e7e789d41d2042a5d5a80810 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:03:04 +1000 Subject: [PATCH 33/44] Use resource_ref for upstream in stream_checking_resource_adaptor (#16187) As we move toward replacing all `device_memory_resource` pointers with `resource_ref`s, there are some places that changes can be made ahead of RMM to simplify required changes as RMM is refactored. In this PR I eliminate the unnecessary `Upstream` template parameter from `cudf_test::stream_checking_resource_adaptor`, and use a `device_async_resource` for the upstream resource. A similar change will be made to all RMM resource adaptors, but this one can be done without deprecations since it is just a test utility. 
Authors: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16187 --- .../stream_checking_resource_adaptor.hpp | 33 +++++++++---------- cpp/include/cudf_test/testing_main.hpp | 10 +++--- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index 5a077e86a0f..4f3c723d195 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -24,13 +24,11 @@ #include +namespace cudf::test { + /** * @brief Resource that verifies that the default stream is not used in any allocation. - * - * @tparam Upstream Type of the upstream resource used for - * allocation/deallocation. */ -template class stream_checking_resource_adaptor final : public rmm::mr::device_memory_resource { public: /** @@ -40,14 +38,13 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * * @param upstream The resource used for allocating/deallocating device memory */ - stream_checking_resource_adaptor(Upstream* upstream, + stream_checking_resource_adaptor(rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream) : upstream_{upstream}, error_on_invalid_stream_{error_on_invalid_stream}, check_default_stream_{check_default_stream} { - CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } stream_checking_resource_adaptor() = delete; @@ -86,7 +83,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { verify_stream(stream); - return upstream_->allocate(bytes, stream); + return upstream_.allocate_async(bytes, 
rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** @@ -101,7 +98,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { verify_stream(stream); - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** @@ -113,8 +110,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + auto cast = dynamic_cast(&other); + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -150,7 +147,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res } } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + rmm::device_async_resource_ref + upstream_; // the upstream resource used for satisfying allocation requests bool error_on_invalid_stream_; // If true, throw an exception when the wrong stream is detected. // If false, simply print to stdout. bool check_default_stream_; // If true, throw an exception when the default stream is observed. @@ -162,13 +160,12 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the * upstream resource `upstream`. * - * @tparam Upstream Type of the upstream `device_memory_resource`. 
- * @param upstream Pointer to the upstream resource + * @param upstream Reference to the upstream resource */ -template -stream_checking_resource_adaptor make_stream_checking_resource_adaptor( - Upstream* upstream, bool error_on_invalid_stream, bool check_default_stream) +inline stream_checking_resource_adaptor make_stream_checking_resource_adaptor( + rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream) { - return stream_checking_resource_adaptor{ - upstream, error_on_invalid_stream, check_default_stream}; + return stream_checking_resource_adaptor{upstream, error_on_invalid_stream, check_default_stream}; } + +} // namespace cudf::test diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 66b831b917f..3ad4b127f80 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -32,8 +32,7 @@ #include #include -namespace cudf { -namespace test { +namespace cudf::test { /// MR factory functions inline auto make_cuda() { return std::make_shared(); } @@ -91,8 +90,7 @@ inline std::shared_ptr create_memory_resource( CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode); } -} // namespace test -} // namespace cudf +} // namespace cudf::test /** * @brief Parses the cuDF test command line options. 
@@ -182,8 +180,8 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); auto const error_on_invalid_stream = (stream_error_mode == "error"); auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = - make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream); + auto adaptor = cudf::test::make_stream_checking_resource_adaptor( + resource, error_on_invalid_stream, check_default_stream); if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { rmm::mr::set_current_device_resource(&adaptor); } From e0a00c1fcb4b72b7abd29debe5b2f6b38081d39a Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Mon, 22 Jul 2024 12:03:24 -0700 Subject: [PATCH 34/44] Add `stream` param to list explode APIs (#16317) Add `stream` param to list `explode*` APIs. Partially fixes https://github.com/rapidsai/cudf/issues/13744 Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16317 --- cpp/include/cudf/lists/explode.hpp | 8 ++++ cpp/include/cudf/lists/set_operations.hpp | 2 +- cpp/src/lists/explode.cu | 29 +++++++----- cpp/tests/streams/lists_test.cpp | 57 ++++++++++++++++++++++- 4 files changed, 82 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp index 81d82dcfa09..303f182ce8c 100644 --- a/cpp/include/cudf/lists/explode.hpp +++ b/cpp/include/cudf/lists/explode.hpp @@ -66,6 +66,7 @@ namespace cudf { * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
* * @return A new table with explode_col exploded. @@ -73,6 +74,7 @@ namespace cudf { std::unique_ptr explode( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +111,7 @@ std::unique_ptr
explode( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with exploded value and position. The column order of return table is @@ -117,6 +120,7 @@ std::unique_ptr
explode( std::unique_ptr
explode_position( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -152,6 +156,7 @@ std::unique_ptr
explode_position( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with explode_col exploded. @@ -159,6 +164,7 @@ std::unique_ptr
explode_position( std::unique_ptr
explode_outer( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -196,6 +202,7 @@ std::unique_ptr
explode_outer( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with explode_col exploded. @@ -203,6 +210,7 @@ std::unique_ptr
explode_outer( std::unique_ptr
explode_outer_position( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index b8abfd62461..871e66b2d83 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -53,8 +53,8 @@ namespace cudf::lists { * @param nulls_equal Flag to specify whether null elements should be considered as equal, default * to be `UNEQUAL` which means only non-null elements are checked for overlapping * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal - * @param mr Device memory resource used to allocate the returned object * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object * @return A column of type BOOL containing the check results */ std::unique_ptr have_overlap( diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 370d7480578..46c4fc78a6f 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -229,8 +229,8 @@ std::unique_ptr
explode_outer(table_view const& input_table, if (null_or_empty_count == 0) { // performance penalty to run the below loop if there are no nulls or empty lists. // run simple explode instead - return include_position ? explode_position(input_table, explode_column_idx, stream, mr) - : explode(input_table, explode_column_idx, stream, mr); + return include_position ? detail::explode_position(input_table, explode_column_idx, stream, mr) + : detail::explode(input_table, explode_column_idx, stream, mr); } auto gather_map_size = sliced_child.size() + null_or_empty_count; @@ -300,58 +300,63 @@ std::unique_ptr
explode_outer(table_view const& input_table, } // namespace detail /** - * @copydoc cudf::explode(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode(input_table, explode_column_idx, cudf::get_default_stream(), mr); + return detail::explode(input_table, explode_column_idx, stream, mr); } /** - * @copydoc cudf::explode_position(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode_position(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode_position(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_position(input_table, explode_column_idx, cudf::get_default_stream(), mr); + return detail::explode_position(input_table, explode_column_idx, stream, mr); } /** - * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode_outer(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_outer( - input_table, explode_column_idx, false, cudf::get_default_stream(), mr); + return detail::explode_outer(input_table, explode_column_idx, false, stream, mr); } /** * @copydoc cudf::explode_outer_position(table_view const&, size_type, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr
explode_outer_position(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_outer( - input_table, explode_column_idx, true, cudf::get_default_stream(), mr); + return detail::explode_outer(input_table, explode_column_idx, true, stream, mr); } } // namespace cudf diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp index 711e20e4b17..7963dced292 100644 --- a/cpp/tests/streams/lists_test.cpp +++ b/cpp/tests/streams/lists_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -212,3 +213,57 @@ TEST_F(ListTest, HaveOverlap) cudf::nan_equality::ALL_EQUAL, cudf::test::get_default_stream()); } + +TEST_F(ListTest, Explode) +{ + cudf::test::fixed_width_column_wrapper list_col_a{100, 200, 300}; + cudf::test::lists_column_wrapper list_col_b{ + cudf::test::lists_column_wrapper{1, 2, 7}, + cudf::test::lists_column_wrapper{5, 6}, + cudf::test::lists_column_wrapper{0, 3}}; + cudf::test::strings_column_wrapper list_col_c{"string0", "string1", "string2"}; + cudf::table_view lists_table({list_col_a, list_col_b, list_col_c}); + cudf::explode(lists_table, 1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodePosition) +{ + cudf::test::fixed_width_column_wrapper list_col_a{100, 200, 300}; + cudf::test::lists_column_wrapper list_col_b{ + cudf::test::lists_column_wrapper{1, 2, 7}, + cudf::test::lists_column_wrapper{5, 6}, + cudf::test::lists_column_wrapper{0, 3}}; + cudf::test::strings_column_wrapper list_col_c{"string0", "string1", 
"string2"}; + cudf::table_view lists_table({list_col_a, list_col_b, list_col_c}); + cudf::explode_position(lists_table, 1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodeOuter) +{ + constexpr auto null = 0; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + cudf::test::lists_column_wrapper list_col_a{ + cudf::test::lists_column_wrapper({1, null, 7}, valids), + cudf::test::lists_column_wrapper({5, null, 0, null}, valids), + cudf::test::lists_column_wrapper{}, + cudf::test::lists_column_wrapper({0, null, 8}, valids)}; + cudf::test::fixed_width_column_wrapper list_col_b{100, 200, 300, 400}; + cudf::table_view lists_table({list_col_a, list_col_b}); + cudf::explode_outer(lists_table, 0, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodeOuterPosition) +{ + constexpr auto null = 0; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + cudf::test::lists_column_wrapper list_col_a{ + cudf::test::lists_column_wrapper({1, null, 7}, valids), + cudf::test::lists_column_wrapper({5, null, 0, null}, valids), + cudf::test::lists_column_wrapper{}, + cudf::test::lists_column_wrapper({0, null, 8}, valids)}; + cudf::test::fixed_width_column_wrapper list_col_b{100, 200, 300, 400}; + cudf::table_view lists_table({list_col_a, list_col_b}); + cudf::explode_outer_position(lists_table, 0, cudf::test::get_default_stream()); +} From c14c8bf59fd1e97fe94c8dfd2db6df7f9a6c65ad Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 22 Jul 2024 12:03:56 -0700 Subject: [PATCH 35/44] Implement parquet reading using pylibcudf in cudf-polars (#16346) Replace cudf-classic with pylibcudf for parquet reading in cudf-polars. 
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16346 --- .../cudf_polars/containers/dataframe.py | 12 --------- python/cudf_polars/cudf_polars/dsl/ir.py | 26 +++++++++---------- python/cudf_polars/tests/test_scan.py | 10 +------ 3 files changed, 14 insertions(+), 34 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index cbeadf1426a..dba76855329 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -23,8 +23,6 @@ from typing_extensions import Self - import cudf - from cudf_polars.containers import Column @@ -83,16 +81,6 @@ def num_rows(self) -> int: """Number of rows.""" return 0 if len(self.columns) == 0 else self.table.num_rows() - @classmethod - def from_cudf(cls, df: cudf.DataFrame) -> Self: - """Create from a cudf dataframe.""" - return cls( - [ - NamedColumn(c.to_pylibcudf(mode="read"), name) - for name, c in df._data.items() - ] - ) - @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: """ diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b934869ffef..e5691cba7dd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -25,7 +25,6 @@ import polars as pl -import cudf import cudf._lib.pylibcudf as plc import cudf_polars.dsl.expr as expr @@ -205,8 +204,6 @@ class Scan(IR): def __post_init__(self) -> None: """Validate preconditions.""" - if self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") if self.cloud_options is not None and any( @@ -241,6 +238,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: 
options = self.file_options with_columns = options.with_columns row_index = options.row_index + nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -295,6 +293,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, + nrows=nrows, ) pieces.append(tbl_w_meta) tables, colnames = zip( @@ -308,9 +307,16 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: colnames[0], ) elif self.typ == "parquet": - cdf = cudf.read_parquet(self.paths, columns=with_columns) - assert isinstance(cdf, cudf.DataFrame) - df = DataFrame.from_cudf(cdf) + tbl_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo(self.paths), + columns=with_columns, + num_rows=nrows, + ) + df = DataFrame.from_table( + tbl_w_meta.tbl, + # TODO: consider nested column names? + tbl_w_meta.column_names(include_children=False), + ) else: raise NotImplementedError( f"Unhandled scan type: {self.typ}" @@ -337,13 +343,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: null_order=plc.types.NullOrder.AFTER, ) df = DataFrame([index, *df.columns]) - # TODO: should be true, but not the case until we get - # cudf-classic out of the loop for IO since it converts date32 - # to datetime. 
- # assert all( - # c.obj.type() == dtype - # for c, dtype in zip(df.columns, self.schema.values()) - # ) + assert all(c.obj.type() == self.schema[c.name] for c in df.columns) if self.predicate is None: return df else: diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 0981a96a34a..642b6ae8a37 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -24,15 +24,7 @@ def row_index(request): @pytest.fixture( - params=[ - None, - pytest.param( - 2, marks=pytest.mark.xfail(reason="No handling of row limit in scan") - ), - pytest.param( - 3, marks=pytest.mark.xfail(reason="No handling of row limit in scan") - ), - ], + params=[None, 2, 3], ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], ) def n_rows(request): From 996cb8d870b7b6153802bde670435e8cd3b8775d Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:15:16 -0400 Subject: [PATCH 36/44] Migrate lists/sorting to pylibcudf (#16179) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16179 --- python/cudf/cudf/_lib/lists.pyx | 28 +++------ .../_lib/pylibcudf/libcudf/lists/sorting.pxd | 6 ++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/lists.pyx | 57 ++++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 46 +++++++++++++++ 5 files changed, 118 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 76f37c3b845..50061f6e468 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -11,9 +11,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) -from cudf._lib.pylibcudf.libcudf.lists.sorting 
cimport ( - sort_lists as cpp_sort_lists, -) from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) @@ -21,7 +18,6 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, null_order, - order, size_type, ) from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -80,24 +76,14 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - cdef null_order c_null_prec = ( - null_order.BEFORE if na_position == "first" else null_order.AFTER - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec) + return Column.from_pylibcudf( + pylibcudf.lists.sort_lists( + col.to_pylibcudf(mode="read"), + ascending, + null_order.BEFORE if na_position == "first" else null_order.AFTER, + False, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd index 145ab41302f..337ac73908b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd @@ -15,3 +15,9 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: order column_order, null_order null_precedence ) except + + + cdef unique_ptr[column] stable_sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 38eb575ee8d..cacecae6010 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ 
b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -2,7 +2,7 @@ from libcpp cimport bool -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type from .column cimport Column from .scalar cimport Scalar @@ -35,3 +35,5 @@ cpdef Column segmented_gather(Column, Column) cpdef Column extract_list_element(Column, ColumnOrSizeType) cpdef Column count_elements(Column) + +cpdef Column sort_lists(Column, bool, null_order, bool stable = *) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index ea469642dd5..b5661a3e634 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -23,8 +23,12 @@ from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( from cudf._lib.pylibcudf.libcudf.lists.extract cimport ( extract_list_element as cpp_extract_list_element, ) +from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( + sort_lists as cpp_sort_lists, + stable_sort_lists as cpp_stable_sort_lists, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView @@ -320,3 +324,54 @@ cpdef Column count_elements(Column input): c_result = move(cpp_count_elements(list_view.view())) return Column.from_libcudf(move(c_result)) + + +cpdef Column sort_lists( + Column input, + bool ascending, + null_order na_position, + bool stable = False +): + """Sort the elements within a list in each row of a list column. + + For details, see :cpp:func:`sort_lists`. + + Parameters + ---------- + input : Column + The input column. + ascending : bool + If true, the sort order is ascending. Otherwise, the sort order is descending. 
+ na_position : NullOrder + If na_position equals NullOrder.FIRST, then the null values in the output + column are placed first. Otherwise, they are be placed after. + stable: bool + If true :cpp:func:`stable_sort_lists` is used, Otherwise, + :cpp:func:`sort_lists` is used. + + Returns + ------- + Column + A new Column with elements in each list sorted. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) + + with nogil: + if stable: + c_result = move(cpp_stable_sort_lists( + list_view.view(), + c_sort_order, + na_position, + )) + else: + c_result = move(cpp_sort_lists( + list_view.view(), + c_sort_order, + na_position, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 7cfed884f90..87472f6d59b 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -22,6 +22,11 @@ def column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def lists_column(): + return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]] + + def test_concatenate_rows(test_data): arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) @@ -191,3 +196,44 @@ def test_count_elements(test_data): expect = pa.array([1, 1, 0, 3], type=pa.int32()) assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "ascending,na_position,expected", + [ + ( + True, + plc.types.NullOrder.BEFORE, + [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], + ), + ( + True, + plc.types.NullOrder.AFTER, + [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], + ), + ( + False, + plc.types.NullOrder.BEFORE, + [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], + ), + ( + False, + plc.types.NullOrder.AFTER, + [[4, 3, 2, 1], [None, 4, 2, 1], 
[10, 10, 0, -10]], + ), + ( + False, + plc.types.NullOrder.AFTER, + [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], + ), + ], +) +def test_sort_lists(lists_column, ascending, na_position, expected): + plc_column = plc.interop.from_arrow(pa.array(lists_column)) + res = plc.lists.sort_lists(plc_column, ascending, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + assert_column_eq(expect, res_stable) From 81e65ee312af5133ca2b98d52efaeb29c274a825 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 22 Jul 2024 15:18:40 -0500 Subject: [PATCH 37/44] Fix docstring of `DataFrame.apply` (#16351) This PR fixes docstring of `DataFrame.apply` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16351 --- python/cudf/cudf/core/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 288bdfd39b3..1d7136e61e3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4525,7 +4525,6 @@ def apply( If False, the funcs will be passed the whole Series at once. Currently not supported. - engine : {'python', 'numba'}, default 'python' Unused. Added for compatibility with pandas. engine_kwargs : dict From 0cac2a9d68341a38721be16132ead14cf4a0d70b Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 22 Jul 2024 14:18:21 -0700 Subject: [PATCH 38/44] Remove size constraints on source files in batched JSON reading (#16162) Addresses https://github.com/rapidsai/cudf/issues/16138 The batched multi-source JSON reader fails when the size of any of the input source buffers exceeds `INT_MAX` bytes. The goal of this PR is to remove this constraint by modifying the batching behavior of the reader. 
Instead of constructing batches that include entire source files, the batches are now constructed at the granularity of byte ranges of size at most `INT_MAX` bytes, Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16162 --- cpp/include/cudf/io/json.hpp | 4 +- cpp/src/io/json/read_json.cu | 139 +++++++++--------- cpp/src/io/json/read_json.hpp | 18 ++- cpp/tests/CMakeLists.txt | 14 +- .../json_chunked_reader.cu} | 81 ++-------- .../json_quote_normalization_test.cpp | 0 cpp/tests/io/{ => json}/json_test.cpp | 0 cpp/tests/io/{ => json}/json_tree.cpp | 0 .../io/{ => json}/json_type_cast_test.cu | 0 cpp/tests/io/json/json_utils.cuh | 105 +++++++++++++ .../json_whitespace_normalization_test.cu | 0 cpp/tests/io/{ => json}/json_writer.cpp | 0 cpp/tests/io/{ => json}/nested_json_test.cpp | 0 .../{json_tests.cpp => json_tests.cu} | 45 +++++- 14 files changed, 242 insertions(+), 164 deletions(-) rename cpp/tests/io/{json_chunked_reader.cpp => json/json_chunked_reader.cu} (64%) rename cpp/tests/io/{ => json}/json_quote_normalization_test.cpp (100%) rename cpp/tests/io/{ => json}/json_test.cpp (100%) rename cpp/tests/io/{ => json}/json_tree.cpp (100%) rename cpp/tests/io/{ => json}/json_type_cast_test.cu (100%) create mode 100644 cpp/tests/io/json/json_utils.cuh rename cpp/tests/io/{ => json}/json_whitespace_normalization_test.cu (100%) rename cpp/tests/io/{ => json}/json_writer.cpp (100%) rename cpp/tests/io/{ => json}/nested_json_test.cpp (100%) rename cpp/tests/large_strings/{json_tests.cpp => json_tests.cu} (50%) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7af90766ad0..d47266fdd12 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -333,14 +333,14 @@ class json_reader_options { * * @param offset Number of bytes of offset */ - 
void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; } + void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; } /** * @brief Set number of bytes to read. * * @param size Number of bytes to read */ - void set_byte_range_size(size_type size) { _byte_range_size = size; } + void set_byte_range_size(size_t size) { _byte_range_size = size; } /** * @brief Set delimiter separating records in JSON lines diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 9cd39038348..0ba4dedfc34 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -148,20 +148,12 @@ device_span ingest_raw_input(device_span buffer, return buffer.first(uncomp_data.size()); } -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream) +size_t estimate_size_per_subchunk(size_t chunk_size) { - auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1); - rmm::device_uvector buffer(total_source_size, stream); - auto readbufspan = ingest_raw_input(buffer, - sources, - reader_opts.get_compression(), - reader_opts.get_byte_range_offset(), - reader_opts.get_byte_range_size(), - stream); - return find_first_delimiter(readbufspan, '\n', stream); + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); } /** @@ -183,7 +175,6 @@ datasource::owning_buffer> get_record_range_raw_input( rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; @@ -198,17 +189,8 @@ datasource::owning_buffer> 
get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - // Some magic numbers - constexpr int num_subchunks = 10; // per chunk_size - constexpr size_t min_subchunk_size = 10000; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : 3; - constexpr int estimated_compression_ratio = 4; - - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - - size_t const size_per_subchunk = - geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea @@ -308,67 +290,78 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - std::for_each(sources.begin(), sources.end(), [](auto const& source) { - CUDF_EXPECTS(source->size() < std::numeric_limits::max(), - "The size of each source file must be less than INT_MAX bytes"); - }); - - constexpr size_t batch_size_ub = std::numeric_limits::max(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); + /* + * The batched JSON reader enforces that the size of each batch is at most INT_MAX + * bytes (~2.14GB). Batches are defined to be byte range chunks - characterized by + * chunk offset and chunk size - that may span across multiple source files. + * Note that the batched reader does not work for compressed inputs or for regular + * JSON inputs. 
+ */ + size_t const total_source_size = sources_size(sources, 0, 0); + size_t chunk_offset = reader_opts.get_byte_range_offset(); size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size; - - // Identify the position of starting source file from which to begin batching based on - // byte range offset. If the offset is larger than the sum of all source - // sizes, then start_source is total number of source files i.e. no file is read - size_t const start_source = [&]() { - size_t sum = 0; + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); + + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + size_t const batch_size_ub = + std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + + /* + * Identify the position (zero-indexed) of starting source file from which to begin + * batching based on byte range offset. If the offset is larger than the sum of all + * source sizes, then start_source is total number of source files i.e. no file is + * read + */ + + // Prefix sum of source file sizes + size_t pref_source_size = 0; + // Starting source file from which to being batching evaluated using byte range offset + size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { - if (sum + sources[src_idx]->size() > chunk_offset) return src_idx; - sum += sources[src_idx]->size(); + if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } + pref_source_size += sources[src_idx]->size(); } return sources.size(); }(); - - // Construct batches of source files, with starting position of batches indicated by - // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch - // is capped at INT_MAX bytes. 
- size_t cur_size = 0; - std::vector batch_positions; - std::vector batch_sizes; - batch_positions.push_back(0); - for (size_t i = start_source; i < sources.size(); i++) { - cur_size += sources[i]->size(); - if (cur_size >= batch_size_ub) { - batch_positions.push_back(i); - batch_sizes.push_back(cur_size - sources[i]->size()); - cur_size = sources[i]->size(); + /* + * Construct batches of byte ranges spanning source files, with the starting position of batches + * indicated by `batch_offsets`. `pref_bytes_size` gives the bytes position from which the current + * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading + * stops. + */ + size_t pref_bytes_size = chunk_offset; + size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + pref_source_size += sources[i]->size(); + // If the current source file can subsume multiple batches, we split the file until the + // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) + while (pref_bytes_size < end_bytes_size && + pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { + auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + batch_offsets.push_back(batch_offsets.back() + next_batch_size); + pref_bytes_size += next_batch_size; } + i++; } - batch_positions.push_back(sources.size()); - batch_sizes.push_back(cur_size); - - // If there is a single batch, then we can directly return the table without the - // unnecessary concatenate - if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr); + /* + * If there is a single batch, then we can directly return the table without the + * unnecessary concatenate. The size of batch_offsets is 1 if all sources are empty, + * or if end_bytes_size is larger than total_source_size. 
+ */ + if (batch_offsets.size() <= 2) return read_batch(sources, reader_opts, stream, mr); std::vector partial_tables; json_reader_options batched_reader_opts{reader_opts}; - // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. - for (size_t i = 0; i < batch_sizes.size(); i++) { - batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size)); - partial_tables.emplace_back(read_batch( - host_span>(sources.begin() + batch_positions[i], - batch_positions[i + 1] - batch_positions[i]), - batched_reader_opts, - stream, - rmm::mr::get_current_device_resource())); - if (chunk_size <= batch_sizes[i]) break; - chunk_size -= batch_sizes[i]; - batched_reader_opts.set_byte_range_offset(0); + for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + batched_reader_opts.set_byte_range_offset(batch_offsets[i]); + batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); + partial_tables.emplace_back( + read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource())); } auto expects_schema_equality = diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 0c30b4cad46..ff69f9b7627 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -29,6 +29,19 @@ namespace cudf::io::json::detail { +// Some magic numbers +constexpr int num_subchunks = 10; // per chunk_size +constexpr size_t min_subchunk_size = 10000; +constexpr int estimated_compression_ratio = 4; +constexpr int max_subchunks_prealloced = 3; + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + size_t range_offset, + size_t range_size, + rmm::cuda_stream_view stream); + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -38,9 
+51,4 @@ size_type find_first_delimiter(device_span d_data, char const delimiter, rmm::cuda_stream_view stream); -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream); - } // namespace cudf::io::json::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8e2017ccb97..05e9759632f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,17 +313,17 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp + JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 PERCENT 30 ) -ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) -ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) -ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) +ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) +ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) -ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) +ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -572,7 +572,7 @@ ConfigureTest( LARGE_STRINGS_TEST large_strings/concatenate_tests.cpp large_strings/case_tests.cpp - large_strings/json_tests.cpp + large_strings/json_tests.cu large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp diff --git a/cpp/tests/io/json_chunked_reader.cpp 
b/cpp/tests/io/json/json_chunked_reader.cu similarity index 64% rename from cpp/tests/io/json_chunked_reader.cpp rename to cpp/tests/io/json/json_chunked_reader.cu index 23d54f7263c..b9dee54752c 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json/json_chunked_reader.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "io/json/read_json.hpp" +#include "json_utils.cuh" #include #include @@ -37,65 +37,6 @@ cudf::test::TempDirTestEnvironment* const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); -// function to extract first delimiter in the string in each chunk, -// collate together and form byte_range for each chunk, -// parse separately. -std::vector skeleton_for_parellel_chunk_reader( - cudf::host_span> sources, - cudf::io::json_reader_options const& reader_opts, - int32_t chunk_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using namespace cudf::io::json::detail; - using cudf::size_type; - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; - constexpr size_type no_min_value = -1; - - // Get the first delimiter in each chunk. - std::vector first_delimiter_index(num_chunks); - auto reader_opts_chunk = reader_opts; - for (size_t i = 0; i < num_chunks; i++) { - auto const chunk_start = i * chunk_size; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_size); - first_delimiter_index[i] = - find_first_delimiter_in_chunk(sources, reader_opts_chunk, '\n', stream); - if (first_delimiter_index[i] != no_min_value) { first_delimiter_index[i] += chunk_start; } - } - - // Process and allocate record start, end for each worker. 
- using record_range = std::pair; - std::vector record_ranges; - record_ranges.reserve(num_chunks); - first_delimiter_index[0] = 0; - auto prev = first_delimiter_index[0]; - for (size_t i = 1; i < num_chunks; i++) { - if (first_delimiter_index[i] == no_min_value) continue; - record_ranges.emplace_back(prev, first_delimiter_index[i]); - prev = first_delimiter_index[i]; - } - record_ranges.emplace_back(prev, total_source_size); - - std::vector tables; - // Process each chunk in parallel. - for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or - static_cast(chunk_start) >= total_source_size) - continue; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); - tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); - } - // assume all records have same number of columns, and inferred same type. (or schema is passed) - // TODO a step before to merge all columns, types and infer final schema. 
- return tables; -} - TEST_F(JsonReaderTest, ByteRange_SingleSource) { std::string const json_string = R"( @@ -118,11 +59,11 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { @@ -213,11 +154,11 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp similarity index 100% rename from cpp/tests/io/json_quote_normalization_test.cpp rename to cpp/tests/io/json/json_quote_normalization_test.cpp diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json/json_test.cpp similarity index 100% rename from cpp/tests/io/json_test.cpp rename to cpp/tests/io/json/json_test.cpp diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp similarity index 100% rename from cpp/tests/io/json_tree.cpp rename to 
cpp/tests/io/json/json_tree.cpp diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu similarity index 100% rename from cpp/tests/io/json_type_cast_test.cu rename to cpp/tests/io/json/json_type_cast_test.cu diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh new file mode 100644 index 00000000000..9383797d91b --- /dev/null +++ b/cpp/tests/io/json/json_utils.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "io/json/read_json.hpp" + +#include +#include +#include +#include + +#include + +#include + +// Helper function to test correctness of JSON byte range reading. 
+// We split the input source files into a set of byte range chunks each of size +// `chunk_size` and return an array of partial tables constructed from each chunk +template +std::vector split_byte_range_reading( + cudf::host_span> sources, + cudf::io::json_reader_options const& reader_opts, + IndexType chunk_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto total_source_size = [&sources]() { + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + return sum + size; + }); + }(); + auto find_first_delimiter_in_chunk = + [total_source_size, &sources, &stream]( + cudf::io::json_reader_options const& reader_opts) -> IndexType { + rmm::device_uvector buffer(total_source_size, stream); + auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer, + sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size(), + stream); + // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the + // return type of that function is size_type. However, when the chunk_size is + // larger than INT_MAX, the position of the delimiter can also be larger than + // INT_MAX. We do not encounter this overflow error in the detail function + // since the batched JSON reader splits the byte_range_size into chunk_sizes + // smaller than INT_MAX bytes + auto const first_delimiter_position_it = + thrust::find(rmm::exec_policy(stream), readbufspan.begin(), readbufspan.end(), '\n'); + return first_delimiter_position_it != readbufspan.end() + ? thrust::distance(readbufspan.begin(), first_delimiter_position_it) + : -1; + }; + size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; + constexpr IndexType no_min_value = -1; + + // Get the first delimiter in each chunk. 
+ std::vector first_delimiter_index(num_chunks); + auto reader_opts_chunk = reader_opts; + for (size_t i = 0; i < num_chunks; i++) { + auto const chunk_start = i * chunk_size; + // We are updating reader_opt_chunks to store offset and size information for the current chunk + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_size); + first_delimiter_index[i] = find_first_delimiter_in_chunk(reader_opts_chunk); + } + + // Process and allocate record start, end for each worker. + using record_range = std::pair; + std::vector record_ranges; + record_ranges.reserve(num_chunks); + size_t prev = 0; + for (size_t i = 1; i < num_chunks; i++) { + // In the case where chunk_size is smaller than row size, the chunk needs to be skipped + if (first_delimiter_index[i] == no_min_value) continue; + size_t next = static_cast(first_delimiter_index[i]) + (i * chunk_size); + record_ranges.emplace_back(prev, next); + prev = next; + } + record_ranges.emplace_back(prev, total_source_size); + + std::vector tables; + for (auto const& [chunk_start, chunk_end] : record_ranges) { + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); + tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr)); + } + // assume all records have same number of columns, and inferred same type. (or schema is passed) + // TODO a step before to merge all columns, types and infer final schema. 
+ return tables; +} diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu similarity index 100% rename from cpp/tests/io/json_whitespace_normalization_test.cu rename to cpp/tests/io/json/json_whitespace_normalization_test.cu diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp similarity index 100% rename from cpp/tests/io/json_writer.cpp rename to cpp/tests/io/json/json_writer.cpp diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp similarity index 100% rename from cpp/tests/io/nested_json_test.cpp rename to cpp/tests/io/json/nested_json_test.cpp diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cu similarity index 50% rename from cpp/tests/large_strings/json_tests.cpp rename to cpp/tests/large_strings/json_tests.cu index bf16d131ba7..49abf7b484d 100644 --- a/cpp/tests/large_strings/json_tests.cpp +++ b/cpp/tests/large_strings/json_tests.cu @@ -14,8 +14,13 @@ * limitations under the License. 
*/ +#include "../io/json/json_utils.cuh" #include "large_strings_fixture.hpp" +#include + +#include +#include #include #include @@ -28,31 +33,57 @@ TEST_F(JsonLargeReaderTest, MultiBatch) { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t expected_file_size = std::numeric_limits::max() / 2; + constexpr size_t batch_size_ub = std::numeric_limits::max(); + constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); json_string.reserve(json_string.size() * (1UL << log_repetitions)); - std::size_t numrows = 4; for (std::size_t i = 0; i < log_repetitions; i++) { json_string += json_string; - numrows <<= 1; } constexpr int num_sources = 2; - std::vector> hostbufs( - num_sources, cudf::host_span(json_string.data(), json_string.size())); + std::vector> hostbufs( + num_sources, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( cudf::io::source_info{ - cudf::host_span>(hostbufs.data(), hostbufs.size())}) + cudf::host_span>(hostbufs.data(), hostbufs.size())}) .lines(true) .compression(cudf::io::compression_type::NONE) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources); + + std::vector> datasources; + for (auto& hb : hostbufs) { + datasources.emplace_back(cudf::io::datasource::create(hb)); + } + // Test for different chunk sizes + std::vector chunk_sizes{ + batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + for (auto chunk_size 
: chunk_sizes) { + auto const tables = + split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + auto table_views = std::vector(tables.size()); + std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify that the data read via chunked reader matches the data read via nested JSON reader + // cannot use EQUAL due to concatenate removing null mask + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); + } } From c7b28ceeb46d2b921e30f081a9ed97745c91ff9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:28:13 -0500 Subject: [PATCH 39/44] Add `drop_nulls` in `cudf-polars` (#16290) Closes https://github.com/rapidsai/cudf/issues/16219 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16290 --- python/cudf_polars/cudf_polars/dsl/expr.py | 30 +++++++++- python/cudf_polars/tests/test_drop_nulls.py | 65 +++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/test_drop_nulls.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a034d55120a..8322d6bd6fb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -882,7 +882,14 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("mask_nans", "round", "setsorted", "unique"): + if self.name not in ( + "mask_nans", + "round", + "setsorted", + "unique", + "dropnull", + "fill_null", + ): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -968,6 +975,27 
@@ def do_evaluate( order=order, null_order=null_order, ) + elif self.name == "dropnull": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py new file mode 100644 index 00000000000..5dfe9f66a97 --- /dev/null +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +@pytest.fixture( + params=[ + [1, 2, 1, 3, 5, None, None], + [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + [], + [None, None], + [1, 2, 3, 4, 5], + ] +) +def null_data(request): + is_empty = pl.Series(request.param).dtype == pl.Null + return pl.DataFrame( + { + "a": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + "b": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + } + ).lazy() + + +def test_drop_null(null_data): + q = null_data.select(pl.col("a").drop_nulls()) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "value", + [0, pl.col("a").mean(), pl.col("b")], + ids=["scalar", "aggregation", "column_expression"], +) +def test_fill_null(null_data, value): + q = null_data.select(pl.col("a").fill_null(value)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] +) +def test_fill_null_with_strategy(null_data, strategy): + q = null_data.select(pl.col("a").fill_null(strategy=strategy)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("strategy", ["forward", "backward"]) +@pytest.mark.parametrize("limit", [0, 1, 2]) +def test_fill_null_with_limit(null_data, strategy, limit): + q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) From e6d412cba7c23df7ee500c28257ed9281cea49b9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 06:03:28 -0500 Subject: [PATCH 40/44] Fall back when casting a timestamp to numeric in cudf-polars (#16232) This PR adds logic that 
falls back to CPU when a cudf-polars query would cast a timestamp column to a numeric type, an unsupported operation in libcudf, which should fix a few polars tests. It could be cleaned up a bit with some of the utilities that will be added in https://github.com/rapidsai/cudf/pull/16150. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16232 --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 ++ .../tests/expressions/test_casting.py | 52 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 python/cudf_polars/tests/expressions/test_casting.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 8322d6bd6fb..9835e6f8461 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1188,6 +1188,10 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) + if not plc.unary.is_supported_cast(self.dtype, value.dtype): + raise NotImplementedError( + f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" + ) def do_evaluate( self, diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py new file mode 100644 index 00000000000..3e003054338 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + +_supported_dtypes = [(pl.Int8(), pl.Int64())] + +_unsupported_dtypes = [ + (pl.String(), pl.Int64()), +] + + +@pytest.fixture +def dtypes(request): + return request.param + + +@pytest.fixture +def tests(dtypes): + fromtype, totype = dtypes + if fromtype == pl.String(): + data = ["a", "b", "c"] + else: + data = [1, 2, 3] + return pl.DataFrame( + { + "a": pl.Series(data, dtype=fromtype), + } + ).lazy(), totype + + +@pytest.mark.parametrize("dtypes", _supported_dtypes, indirect=True) +def test_cast_supported(tests): + df, totype = tests + q = df.select(pl.col("a").cast(totype)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("dtypes", _unsupported_dtypes, indirect=True) +def test_cast_unsupported(tests): + df, totype = tests + assert_ir_translation_raises( + df.select(pl.col("a").cast(totype)), NotImplementedError + ) From ff30c0211109e14b1f6918fcc6c2e2b98f863a1f Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:03:55 -0700 Subject: [PATCH 41/44] Fix compile warnings with `jni_utils.hpp` (#16336) This fixes the compiler warnings with `jni_utils.hpp`, removing some `const` qualifiers that are redundant. Closes https://github.com/rapidsai/cudf/issues/16335. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/16336 --- java/src/main/native/include/jni_utils.hpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ea04c1cda83..a3b4bfcb63e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -284,7 +284,7 @@ class native_jArray { return data()[index]; } - const N_TYPE* const data() const + N_TYPE const* data() const { init_data_ptr(); return data_ptr; @@ -296,17 +296,15 @@ class native_jArray { return data_ptr; } - const N_TYPE* const begin() const { return data(); } + N_TYPE const* begin() const { return data(); } N_TYPE* begin() { return data(); } - const N_TYPE* const end() const { return data() + size(); } + N_TYPE const* end() const { return data() + size(); } N_TYPE* end() { return data() + size(); } - const J_ARRAY_TYPE get_jArray() const { return orig; } - - J_ARRAY_TYPE get_jArray() { return orig; } + J_ARRAY_TYPE get_jArray() const { return orig; } /** * @brief Conversion to std::vector @@ -430,9 +428,7 @@ class native_jpointerArray { T* const* begin() const { return data(); } T* const* end() const { return data() + size(); } - const jlongArray get_jArray() const { return wrapped.get_jArray(); } - - jlongArray get_jArray() { return wrapped.get_jArray(); } + jlongArray get_jArray() const { return wrapped.get_jArray(); } void assert_no_nulls() const { @@ -624,7 +620,7 @@ class native_jstring { return true; } - const jstring get_jstring() const { return orig; } + jstring get_jstring() const { return orig; } ~native_jstring() { @@ -753,13 +749,13 @@ class native_jstringArray { return cache[index]; } - char const** const as_c_array() const + char const** as_c_array() const { init_c_cache(); return c_cache.data(); } - 
const std::vector as_cpp_vector() const + std::vector as_cpp_vector() const { init_cpp_cache(); return cpp_cache; From cd711913d2312ba158e34f5c03784a7b07f1583a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Jul 2024 00:24:19 +0200 Subject: [PATCH 42/44] Adds write-coalescing code path optimization to FST (#16143) This PR adds an optimized code path to the finite-state transducer (FST) that will use a shared memory-backed write buffer for the translated output and translated output indexes, if the the write buffer does not require allocating excessive amounts of shared memory (i.e., current heuristic is 24 KB/CTA). Writes are first buffered in shared memory and then collaboratively written out using coalesced writes to global memory. ## Benchmark results Numbers are for libcudf's FST_NVBENCH for a 1.073 GB input. FST outputs one token per input symbol. Benchmarks run on V100 with 900 GB/s theoretical peak BW. We compare the current FST implementation (old) to an FST implementaation that uses write-coalescing to gmem (new). 
| | OLD throughput (GB/s) | NEW throughput (GB/s) | relative performance | | 1st kernel, per byte: bytes read/written | 2nd kernel, per byte: bytes read/written | expected SOL (GB/s) | achieved SOL (old) | achieved SOL (new) | |------------------|------------------------|------------------------|----------------------|---|------------------------------------------|------------------------------------------|---------------------|--------------------|--------------------| | full | 15.7 | 74.74 | 476% | | 1 | 6 | 102.86 | 15.26% | 72.66% | | no out-indexes | 39.123 | 105.8 | 270% | | 1 | 2 | 240.00 | 16.30% | 44.08% | | no-output | 229.27 | 178.92 | 78% | | 1 | 1 | 360.00 | 63.69% | 49.70% | | out-indexes-only | 24.95 | 85.2 | 341% | | 1 | 5 | 120.00 | 20.79% | 71.00% | Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16143 --- cpp/benchmarks/io/fst.cu | 16 +- cpp/src/io/fst/agent_dfa.cuh | 371 ++++++++++++++++++++++---- cpp/src/io/fst/dispatch_dfa.cuh | 7 +- cpp/src/io/fst/lookup_tables.cuh | 70 +++-- cpp/src/io/json/json_normalization.cu | 26 +- cpp/src/io/json/nested_json_gpu.cu | 25 +- cpp/tests/io/fst/common.hpp | 4 +- cpp/tests/io/fst/fst_test.cu | 4 +- 8 files changed, 425 insertions(+), 98 deletions(-) diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index ad19bdfdfcb..31f1bf8e70f 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -95,7 +95,9 @@ void BM_FST_JSON(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); 
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 2171764decd..bc5b94e2718 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -18,7 +18,9 @@ #include "in_reg_array.cuh" #include +#include #include +#include #include namespace cudf::io::fst::detail { @@ -44,9 +46,10 @@ using StateIndexT = uint32_t; template struct VectorCompositeOp { template - __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res{}; +#pragma unroll for (int32_t 
i = 0; i < NUM_ITEMS; ++i) { res.Set(i, rhs.Get(lhs.Get(i))); } @@ -57,61 +60,275 @@ struct VectorCompositeOp { /** * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed - * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols - * it intends to write out and writing out such symbols to the given output iterators. + * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols + * it intends to write out. + */ +template +class DFACountCallbackWrapper { + public: + __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table) + : transducer_table(transducer_table) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const&) + { + out_count = 0; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + out_count += count; + } + + __device__ __forceinline__ void TearDown() {} + TransducerTableT const transducer_table; + uint32_t out_count{}; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. * + * @tparam MaxTranslatedOutChars The maximum number of symbols that are written on a any given state + * transition * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up * the symbols that are supposed to be emitted on a given state transition. 
- * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the * transducer table are assignable. - * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. */ -template -class DFASimulationCallbackWrapper { +template +class DFAWriteCallbackWrapper { public: - __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( - TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) - : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) + : transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + out_offset(out_offset) { } template - __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + __device__ __forceinline__ void Init(OffsetT const& in_offset) + { + this->in_offset = in_offset; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type /*MaxTranslatedOutChars*/) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + +#pragma unroll + for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + 
out_idx_it[out_offset + out_char] = in_offset + character_index; + } + } + out_offset += count; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) { - this->offset = offset; - if (!write) out_count = 0; + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + out_offset += count; } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + ReadSymbol(character_index, + old_state, + new_state, + symbol_id, + read_symbol, + cub::Int2Type{}); + } + + __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_offset; + uint32_t in_offset; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to + * global memory. 
+ * + * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given + * output iterator + * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of + * writing them to the given output iterator + * @tparam NumWriteBufferItems The number of items to allocate in shared memory for the write + * buffer. + * @tparam OutputT The type of the translated items + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. + */ +template +class WriteCoalescingCallbackWrapper { + struct TempStorage_Offsets { + uint16_t compacted_offset[NumWriteBufferItems]; + }; + struct TempStorage_Symbols { + OutputT compacted_symbols[NumWriteBufferItems]; + }; + using offset_cache_t = + ::cuda::std::conditional_t; + using symbol_cache_t = ::cuda::std:: + conditional_t, TempStorage_Symbols>; + struct TempStorage_ : offset_cache_t, symbol_cache_t {}; + + __device__ __forceinline__ TempStorage_& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage.Alias(); + } + TempStorage_& temp_storage; + + public: + struct TempStorage : cub::Uninitialized {}; + + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t thread_out_offset, + uint32_t tile_out_offset, + uint32_t tile_in_offset, + uint32_t tile_out_count) + : temp_storage(PrivateStorage()), + transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + thread_out_offset(thread_out_offset), + tile_out_offset(tile_out_offset), + tile_in_offset(tile_in_offset), + 
tile_out_count(tile_out_count) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->in_offset = offset; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - if (write) { -#if defined(__CUDA_ARCH__) -#pragma unroll 1 -#endif - for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = + for (uint32_t out_char = 0; out_char < count; out_char++) { + if constexpr (!DiscardIndexOutput) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + } + if constexpr (!DiscardTranslatedOutput) { + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_count + out_char] = offset + character_index; } } - out_count += count; + thread_out_offset += count; } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() + { + __syncthreads(); + if constexpr (!DiscardTranslatedOutput) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } + } + if constexpr (!DiscardIndexOutput) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } + } + __syncthreads(); + } public: TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; - uint32_t out_count; - uint32_t offset; - bool write; + uint32_t thread_out_offset; + uint32_t tile_out_offset; + 
uint32_t tile_in_offset; + uint32_t in_offset; + uint32_t tile_out_count; }; /** @@ -125,17 +342,18 @@ class DFASimulationCallbackWrapper { template class StateVectorTransitionOp { public: - __host__ __device__ __forceinline__ StateVectorTransitionOp( + __device__ __forceinline__ StateVectorTransitionOp( TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) const + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { +#pragma unroll for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); } @@ -152,17 +370,17 @@ struct StateTransitionOp { TransitionTableT const& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - StateIndexT state, - CallbackOpT& callback_op) + __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; @@ -420,7 +638,7 @@ struct AgentDFA { __syncthreads(); // Thread's symbols - CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + CharT const* t_chars = 
&temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; // Parse thread's symbols and transition the state-vector if (is_full_block) { @@ -538,6 +756,43 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; + using OutSymbolT = typename DfaT::OutSymbolT; + // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; + static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT; + static constexpr bool discard_out_index = + ::cuda::std::is_same>::value; + static constexpr bool discard_out_it = + ::cuda::std::is_same>::value; + using NonWriteCoalescingT = + DFAWriteCallbackWrapper; + + using WriteCoalescingT = + WriteCoalescingCallbackWrapper; + + static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; + + // Use write-coalescing only if the worst-case output size per tile fits into shared memory + static constexpr bool can_use_smem_cache = + (sizeof(typename WriteCoalescingT::TempStorage) + sizeof(typename AgentDfaSimT::TempStorage) + + sizeof(typename DfaT::SymbolGroupStorageT) + sizeof(typename DfaT::TransitionTableStorageT) + + sizeof(typename DfaT::TranslationTableStorageT)) < (48 * 1024); + static constexpr bool use_smem_cache = + is_translation_pass and + (sizeof(typename WriteCoalescingT::TempStorage) <= AgentDFAPolicy::SMEM_THRESHOLD) and + can_use_smem_cache; + + using DFASimulationCallbackWrapperT = + typename cub::If::Type; + // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines @@ -576,7 +831,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that // incorporates all preceding blocks/tiles results) 
//------------------------------------------------------------------------------ - if (IS_SINGLE_PASS) { + if constexpr (IS_SINGLE_PASS) { uint32_t tile_idx = blockIdx.x; using StateVectorCompositeOpT = VectorCompositeOp; @@ -623,10 +878,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper - callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + DFACountCallbackWrapper count_chars_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -635,7 +887,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, state, - callback_wrapper, + count_chars_callback_op, cub::Int2Type()); __syncthreads(); @@ -650,15 +902,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; uint32_t tile_idx = blockIdx.x; + uint32_t tile_out_offset{}; + uint32_t tile_out_count{}; + uint32_t thread_out_offset{}; if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(callback_wrapper.out_count, - callback_wrapper.out_count, + .ExclusiveScan(count_chars_callback_op.out_count, + thread_out_offset, static_cast(0), cub::Sum{}, block_aggregate); - + tile_out_count = block_aggregate; if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { offset_tile_state.SetInclusive(0, block_aggregate); } @@ -671,22 +926,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan( - callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); - + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, 
cub::Sum{}, prefix_op); + tile_out_offset = prefix_op.GetExclusivePrefix(); + tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); } } - callback_wrapper.write = true; + DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table, + transduced_out_it, + transduced_out_idx_it, + thread_out_offset, + tile_out_offset, + blockIdx.x * SYMBOLS_PER_BLOCK, + tile_out_count}; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, t_start_state, - callback_wrapper, + write_translated_callback_op, cub::Int2Type()); } } diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index be63ec6539f..ef5e9c8a78f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -37,6 +37,11 @@ struct AgentDFAPolicy { // The number of symbols processed by each thread static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + + // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering + // the output in a write buffer and instead immediately write out to global memory, potentially + // resulting in non-coalesced writes + static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024; }; /** @@ -49,7 +54,7 @@ struct DeviceFSMPolicy { struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { enum { BLOCK_THREADS = 128, - ITEMS_PER_THREAD = 32, + ITEMS_PER_THREAD = 16, }; using AgentDFAPolicy = AgentDFAPolicy; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5532a7f994b..ae1f81fd541 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -367,18 +367,18 @@ class TransitionTable { template static KernelParameter InitDeviceTransitionTable( - std::array, MAX_NUM_STATES> const& translation_table) + std::array, MAX_NUM_STATES> const& transition_table) 
{ KernelParameter init_data{}; - // translation_table[state][symbol] -> new state - for (std::size_t state = 0; state < translation_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + // transition_table[state][symbol] -> new state + for (std::size_t state = 0; state < transition_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) { CUDF_EXPECTS( - static_cast(translation_table[state][symbol]) <= + static_cast(transition_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); init_data.transitions[symbol * MAX_NUM_STATES + state] = - static_cast(translation_table[state][symbol]); + static_cast(transition_table[state][symbol]); } } @@ -494,6 +494,10 @@ class dfa_device_view { // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + using OutSymbolT = typename TranslationTableT::OutSymbolT; + static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT; + using SymbolGroupStorageT = std::conditional_t::value, typename SymbolGroupIdLookupT::TempStorage, typename cub::NullType>; @@ -542,24 +546,33 @@ class dfa_device_view { * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output * symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT_ The maximum 
number of symbols being output by a single state + * transition * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used. */ -template class TransducerLookupTable { private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; + OutSymbolT_ out_symbols[MAX_TABLE_SIZE]; }; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { @@ -567,6 +580,8 @@ class TransducerLookupTable { OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; @@ -686,14 +701,19 @@ class TransducerLookupTable { * sequence of symbols that the finite-state transducer is supposed to output for each transition. * * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used + * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state + * transition * @tparam OutSymbolT The symbol type being output - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @param translation_table The translation table * @return A translation table of type `TransducerLookupTable`. 
*/ template @@ -705,20 +725,30 @@ auto make_translation_table(std::array, MAX_N OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; return translation_table_t::InitDeviceTranslationTable(translation_table); } -template +template class TranslationOp { private: struct _TempStorage {}; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - using LookupTableT = TranslationOp; + using LookupTableT = + TranslationOp; TranslationOpT translation_op; }; @@ -772,6 +802,10 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given + * input symbol + * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given + * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. 
* Invocations of the first signature, (1), must return the number of symbols that are emitted for @@ -779,10 +813,14 @@ class TranslationOp { * that transition, where `i` corresponds to `relative_offse` * @return A translation table of type `TranslationO` */ -template +template auto make_translation_functor(FunctorT map_op) { - return TranslationOp::InitDeviceTranslationTable(map_op); + return TranslationOp:: + InitDeviceTranslationTable(map_op); } /** diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ca56a12eb36..760b2214365 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_uvector outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); @@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto parser = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), - fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), - stream); + static constexpr std::int32_t min_out = 0; + static constexpr std::int32_t max_out = 2; + auto parser = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor( + normalize_whitespace::TransduceToNormalizedWS{}), + stream); rmm::device_uvector outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a007754ef4f..8decaf034f3 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ 
b/cpp/src/io/json/nested_json_gpu.cu @@ -1455,11 +1455,14 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - auto json_to_stack_ops_fst = fst::detail::make_fst( + static constexpr auto min_translated_out = 0; + static constexpr auto max_translated_out = 1; + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)), fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)), - fst::detail::make_translation_table( - to_stack_op::get_translation_table(stack_behavior)), + fst::detail:: + make_translation_table( + to_stack_op::get_translation_table(stack_behavior)), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end @@ -1507,11 +1510,12 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceToken{}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1598,7 +1602,8 @@ std::pair, rmm::device_uvector> ge fst::detail::make_symbol_group_lookup_op( fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}), 
fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), - fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + fst::detail::make_translation_functor( + fix_stack_of_excess_chars::TransduceInputOp{}), stream); fix_stack_of_excess_chars.Transduce(zip_in, static_cast(json_in.size()), @@ -1619,7 +1624,7 @@ std::pair, rmm::device_uvector> ge auto json_to_tokens_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), - fst::detail::make_translation_table( + fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), stream); diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 382d21fabb8..0177300eda9 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,6 +69,8 @@ std::array, TT_NUM_STATES> const pda_s /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) +static constexpr auto min_translated_out = 1; +static constexpr auto max_translated_out = 1; std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ {/* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 4df0d3ae04d..8a8d3d39e0f 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); // Allocate device-side temporary storage & run algorithm From 39f256c3397afc9c495cb819636abddb23f81dc0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:03:16 -0500 Subject: [PATCH 43/44] Fall back to CPU for unsupported libcudf binaryops in cudf-polars (#16188) This PR adds logic that should trigger CPU fallback for unsupported binary ops.
Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16188 --- python/cudf_polars/cudf_polars/dsl/expr.py | 13 ++++--- .../cudf_polars/cudf_polars/utils/dtypes.py | 38 +------------------ .../tests/expressions/test_literal.py | 18 ++++++--- 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 9835e6f8461..6325feced94 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1424,13 +1424,14 @@ def __init__( super().__init__(dtype) self.op = op self.children = (left, right) - if ( - op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and plc.traits.is_chrono(left.dtype) - and plc.traits.is_chrono(right.dtype) - and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op ): - raise NotImplementedError("Casting rules for timelike types") + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 1279fe91d48..cd68d021286 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -14,43 +14,7 @@ import cudf._lib.pylibcudf as plc -__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] - - -def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): - """ - Do two 
datetime typeids have matching resolution for a binop. - - Parameters - ---------- - lid - Left type id - rid - Right type id - - Returns - ------- - True if resolutions are compatible, False otherwise. - - Notes - ----- - Polars has different casting rules for combining - datetimes/durations than libcudf, and while we don't encode the - casting rules fully, just reject things we can't handle. - - Precondition for correctness: both lid and rid are timelike. - """ - if lid == rid: - return True - # Timestamps are smaller than durations in the libcudf enum. - lid, rid = sorted([lid, rid]) - if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: - return rid == plc.TypeId.DURATION_MILLISECONDS - elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: - return rid == plc.TypeId.DURATION_MICROSECONDS - elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: - return rid == plc.TypeId.DURATION_NANOSECONDS - return False +__all__ = ["from_polars", "downcast_arrow_lists"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 55e688428bd..5bd3131d1d7 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -6,6 +6,8 @@ import polars as pl +import cudf._lib.pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -64,11 +66,17 @@ def test_timelike_literal(timestamp, timedelta): adjusted=timestamp + timedelta, two_delta=timedelta + timedelta, ) - schema = q.collect_schema() - time_type = schema["time"] - delta_type = schema["delta"] - if dtypes.have_compatible_resolution( - dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + schema = {k: dtypes.from_polars(v) for k, v in q.collect_schema().items()} + if plc.binaryop.is_supported_operation( + schema["adjusted"], + schema["time"], + schema["delta"], + 
plc.binaryop.BinaryOperator.ADD, + ) and plc.binaryop.is_supported_operation( + schema["two_delta"], + schema["delta"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, ): assert_gpu_result_equal(q) else: From f0efc8b36a8f43cfa027966265dcea052bb5c45d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 23 Jul 2024 17:17:05 -0700 Subject: [PATCH 44/44] Modify `make_host_vector` and `make_device_uvector` factories to optionally use pinned memory and kernel copy (#16206) Issue #15616 Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel. Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU. Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy. Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`. Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU. Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because of an optimization that allows it to fall back to `cudaMemcpyAsync`. We now call a simple local kernel. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/16206 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/null_mask.cuh | 4 +- .../cudf/detail/utilities/host_memory.hpp | 51 +++++++++ .../cudf/detail/utilities/host_vector.hpp | 24 +++- .../detail/utilities/vector_factories.hpp | 106 ++++++++++++------ cpp/include/cudf/io/text/detail/trie.hpp | 4 +- cpp/include/cudf/lists/detail/dremel.hpp | 10 +- cpp/include/cudf/utilities/pinned_memory.hpp | 16 +++ cpp/include/cudf/utilities/span.hpp | 32 ++++++ cpp/src/copying/concatenate.cu | 6 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/datetime/timezone.cpp | 6 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/io/avro/reader_impl.cu | 8 +- cpp/src/io/csv/reader_impl.cu | 44 +++++--- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/nested_json_gpu.cu | 6 +- cpp/src/io/json/read_json.cu | 3 +- cpp/src/io/orc/reader_impl_decode.cu | 10 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/orc/writer_impl.cu | 50 +++++---- cpp/src/io/orc/writer_impl.hpp | 9 +- cpp/src/io/parquet/predicate_pushdown.cpp | 20 ++-- cpp/src/io/parquet/reader_impl_chunking.cu | 78 +++++++------ cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +- cpp/src/io/parquet/writer_impl.cu | 7 +- cpp/src/lists/dremel.cu | 6 +- cpp/src/strings/combine/join.cu | 6 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/translate.cu | 2 +- cpp/src/table/row_operators.cu | 5 +- cpp/src/utilities/cuda_memcpy.cu | 20 +++- .../{pinned_memory.cpp => host_memory.cpp} | 
86 +++++++++++++- cpp/tests/io/json/json_tree.cpp | 6 +- cpp/tests/strings/integers_tests.cpp | 4 +- .../utilities_tests/pinned_memory_tests.cpp | 67 ++++++++++- 40 files changed, 539 insertions(+), 192 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (73%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e79204a558..a2c2dd3af4c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -671,9 +671,9 @@ add_library( src/unary/null_ops.cu src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp + src/utilities/host_memory.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..d3e9fc4974d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. 
- auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..c6775a950c9 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); + +/** + * @brief Get the allocator to be used for the host memory allocation. 
+ * + * @param size The number of elements of type T to allocate + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..f4e5f718da4 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -61,6 +61,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. 
* @@ -100,8 +104,12 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{contains_property} { } @@ -173,15 +181,25 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..45dc839c9bd 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -32,8 +34,6 @@ #include #include -#include - #include namespace cudf { @@ -100,11 +100,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + 
source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -271,21 +272,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -324,7 +318,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. 
* * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -361,11 +355,46 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `cudf::detail::host_vector` of the given size. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +/** + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. 
* * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -373,16 +402,24 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = make_host_vector(v.size(), stream); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -394,8 +431,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -404,7 +441,8 @@ thrust::host_vector make_host_vector_async( * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. 
* * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -412,7 +450,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -423,7 +461,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -435,8 +473,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } @@ -444,7 +482,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -460,7 +498,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..28862d97ede 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -223,11 +223,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..53448424827 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,8 +31,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; }; /** @@ -45,8 +45,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..fa7e1b35327 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. 
+ */ +size_t get_allocate_host_as_pinned_threshold(); + } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..c5054c733a7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -257,6 +259,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +84,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1539,7 +1539,8 @@ std::unique_ptr 
chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes_typed; - ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git 
a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -567,7 +567,7 @@ void infer_column_types(parse_options const& parse_opts, } std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, + host_span column_flags, std::vector const& column_names, device_span data, device_span row_offsets, @@ -592,8 +592,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +622,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + host_span column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +655,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +700,10 @@ 
table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and duplicated strings @@ -842,8 +847,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span 
input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device. - std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8decaf034f3..1e484d74679 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1703,10 +1703,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0ba4dedfc34..590f70864b1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 8e20505d3ff..e3b9a048be8 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ 
b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 805959327ac..80f32512b98 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4cb20bb7518..f3b8cfbc836 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -444,14 +444,17 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + 
rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. + auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -470,7 +473,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -479,7 +484,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -1336,7 +1343,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge 
groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1677,39 +1684,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. - std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1724,7 +1731,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1815,8 +1822,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); @@ -2299,7 +2305,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - 
calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. */ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. 
- thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 05e0d8c0111..794750ab6d2 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -804,16 +804,16 @@ std::vector compute_page_splits_by_row(device_span> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); + auto comp_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto comp_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); + auto copy_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto copy_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); rmm::device_uvector comp_res(num_comp_pages, stream); thrust::fill(rmm::exec_policy_nosync(stream), @@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(offset)}); + copy_out.push_back({dst_base, static_cast(offset)}); } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); + comp_in.push_back( + {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); + comp_out.push_back( + 
{dst_base + offset, static_cast(page.uncompressed_page_size - offset)}); page.page_data = dst_base; decomp_offset += page.uncompressed_page_size; }); + } + auto d_comp_in = cudf::detail::make_device_uvector_async( + comp_in, stream, rmm::mr::get_current_device_resource()); + auto d_comp_out = cudf::detail::make_device_uvector_async( + comp_out, stream, rmm::mr::get_current_device_resource()); + + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + device_span const> d_comp_in_view{d_comp_in.data() + start_pos, + codec.num_pages}; + + device_span const> d_comp_out_view(d_comp_out.data() + start_pos, + codec.num_pages); - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + gpuinflate( + d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream); break; case SNAPPY: if (cudf::io::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream); } break; case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, 
codec.max_decompressed_size, codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index ff47dfc4cf3..e006cc7d714 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1523,8 +1523,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git 
a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8413e716224..2df71b77301 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1824,7 +1824,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1880,7 +1881,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 
0; diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, 
views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- 
a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "cudf/detail/utilities/integer_utils.hpp" + +#include #include #include #include @@ -26,15 +29,24 @@ namespace cudf::detail { namespace { +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } diff 
--git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 73% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index 3ea4293fc60..7c3cea42023 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -186,6 +186,61 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); + } catch (std::bad_alloc const& e) { + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, 
alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pageable pool mr must be accessible from the host"); + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -225,4 +280,29 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +} // namespace detail + } // namespace cudf diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 7a72b77e1fb..8bcd5790e99 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ 
b/cpp/tests/io/json/json_tree.cpp @@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu( { constexpr bool include_quote_char = true; // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(tokens_gpu, stream); - thrust::host_vector token_indices = - cudf::detail::make_host_vector_async(token_indices_gpu1, stream); + auto tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream); + auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- 
a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + 
EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +}