From 836f800e61acafa0fa6b3c7d9826904f0ba2ad06 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:14 -0500 Subject: [PATCH 01/25] Use CTAD with Thrust function objects (#9768) While reviewing another PR, I noticed unnecessary usage of explicit template parameters with Thrust function objects and decided to open a small PR to clean this up (CTAD showed up in C++17). CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9768 --- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf_test/column_wrapper.hpp | 7 ++----- cpp/src/copying/concatenate.cu | 2 +- cpp/src/groupby/sort/group_merge_m2.cu | 4 ++-- cpp/src/groupby/sort/group_rank_scan.cu | 2 +- cpp/src/groupby/sort/group_scan_util.cuh | 6 +++--- .../sort/group_single_pass_reduction_util.cuh | 16 ++++++++-------- cpp/src/groupby/sort/group_tdigest.cu | 10 +++++----- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_utils.cu | 2 +- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/contains.cu | 7 ++----- cpp/src/lists/interleave_columns.cu | 8 ++++---- cpp/src/quantiles/tdigest/tdigest.cu | 7 ++----- cpp/src/reductions/scan/scan_inclusive.cu | 9 ++++----- cpp/src/rolling/grouped_rolling.cu | 6 +++--- cpp/src/rolling/rolling_collect_list.cu | 2 +- cpp/src/sort/rank.cu | 10 +++++----- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/findall.cu | 7 ++----- cpp/src/strings/repeat_strings.cu | 2 +- cpp/src/strings/split/split.cu | 14 ++++---------- cpp/tests/iterator/iterator_tests.cuh | 11 +++-------- .../apply_boolean_mask_tests.cpp | 4 ++-- cpp/tests/strings/fixed_point_tests.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 6 ++---- 26 files changed, 63 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ec4a88a0e46..eb7258830ce 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -315,7 +315,7 @@ std::unique_ptr gather( d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, size_t{0}, - thrust::plus{}); + thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), "total size of output strings is too large for a cudf column"); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cd2ac9f3ec1..ccfdde2270c 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1502,11 +1502,8 @@ class lists_column_wrapper : public detail::column_wrapper { // concatenate them together, skipping children that are null. std::vector children; - thrust::copy_if(std::cbegin(cols), - std::cend(cols), - valids, // stencil - std::back_inserter(children), - thrust::identity{}); + thrust::copy_if( + std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{}); auto data = children.empty() ? 
cudf::empty_like(expected_hierarchy) : concatenate(children); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f4b6a8bf5fd..34c0cea683e 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi device_views.cend(), std::next(offsets.begin()), [](auto const& col) { return col.size(); }, - thrust::plus{}); + thrust::plus{}); auto d_offsets = make_device_uvector_async(offsets, stream); auto const output_size = offsets.back(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 4e2a5b68abc..bde7c985df1 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -173,8 +173,8 @@ std::unique_ptr group_merge_m2(column_view const& values, // Generate bitmask for the output. // Only mean and M2 values can be nullable. Count column must be non-nullable. - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { result_means->set_null_mask(null_mask, null_count); // copy null_mask result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 935ef9554a9..f36bdc0a660 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -79,7 +79,7 @@ std::unique_ptr rank_generator(column_view const& order_by, group_labels.end(), mutable_ranks.begin(), mutable_ranks.begin(), - thrust::equal_to{}, + thrust::equal_to{}, scan_op); return ranks; diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index ae3e3232e06..e25fdd6fc27 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -115,7 +115,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -160,7 +160,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; @@ -214,7 +214,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index decb127b264..95a36f40e57 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -191,7 +191,7 @@ struct group_reduction_functor{}, + thrust::equal_to{}, binop); }; @@ -215,10 +215,10 @@ struct group_reduction_functor validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } return result; @@ -264,7 +264,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -283,10 +283,10 @@ struct group_reduction_functor< auto validity = 
rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { auto const binop = diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 146a6a8c31c..551eb128231 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -625,7 +625,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + thrust::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -850,8 +850,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + thrust::equal_to{}, // key equality check + thrust::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -864,8 +864,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + thrust::equal_to{}, // key equality check + thrust::maximum{}); // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. 
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e4bd1938ecc..c5b680f129e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -266,7 +266,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, left_join_complement_size = thrust::count_if(rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), - thrust::identity()); + thrust::identity()); } return join_size + left_join_complement_size; } diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 4aca4b4a9cf..9e98f87e7f0 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -136,7 +136,7 @@ get_left_join_indices_complement(std::unique_ptr> thrust::make_counting_iterator(end_counter), invalid_index_map->begin(), right_indices_complement->begin(), - thrust::identity()) - + thrust::identity{}) - right_indices_complement->begin(); right_indices_complement->resize(indices_count, stream); } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4bef312b396..2ddede97ce4 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -225,7 +225,7 @@ std::unique_ptr concatenate_lists_nullifying_rows(column_view const& inp auto list_entries = gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_rows, std::move(list_offsets), diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index bdbc9ae013c..b48982d205a 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -74,11 +74,8 @@ struct lookup_functor { if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) { return {rmm::device_buffer{0, stream, mr}, size_type{0}}; } else { - return cudf::detail::valid_if(result_validity.begin(), - result_validity.end(), - thrust::identity{}, - stream, - mr); + return cudf::detail::valid_if( + result_validity.begin(), result_validity.end(), thrust::identity{}, stream, mr); } } diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index b9b73d98ed2..220cb25a942 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -228,8 +228,8 @@ struct interleave_list_entries_impl{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); return make_strings_column(num_output_entries, std::move(offsets_column), @@ -306,7 +306,7 @@ struct interleave_list_entries_impl( if (data_has_null_mask) { auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { output->set_null_mask(null_mask, null_count); } } @@ -405,7 +405,7 @@ std::unique_ptr interleave_columns(table_view const& input, } auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_output_lists, std::move(list_offsets), 
std::move(list_entries), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 57c221b15ed..18e7d02d086 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -348,11 +348,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, if (null_count == 0) { return std::pair{rmm::device_buffer{}, null_count}; } - return cudf::detail::valid_if(tdigest_is_empty, - tdigest_is_empty + tdv.size(), - thrust::logical_not{}, - stream, - mr); + return cudf::detail::valid_if( + tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); return cudf::make_lists_column( diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 70f5ca90539..b0e761c4c3b 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -50,11 +50,10 @@ rmm::device_buffer mask_scan(column_view const& input_view, auto valid_itr = detail::make_validity_iterator(*d_input); auto first_null_position = [&] { - size_type const first_null = thrust::find_if_not(rmm::exec_policy(stream), - valid_itr, - valid_itr + input_view.size(), - thrust::identity{}) - - valid_itr; + size_type const first_null = + thrust::find_if_not( + rmm::exec_policy(stream), valid_itr, valid_itr + input_view.size(), thrust::identity{}) - + valid_itr; size_type const exclusive_offset = (inclusive == scan_type::EXCLUSIVE) ? 1 : 0; return std::min(input_view.size(), first_null + exclusive_offset); }(); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 509f67bb5c6..5a7f15148d8 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -142,8 +142,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, preceding_window] __device__(size_type idx) { auto group_label = d_group_labels[idx]; auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. + return thrust::minimum{}(preceding_window, + idx - group_start + 1); // Preceding includes current row. }; auto following_calculator = [d_group_offsets = group_offsets.data(), @@ -152,7 +152,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto group_label = d_group_labels[idx]; auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets // is capped with `input.size()`. 
- return thrust::minimum{}(following_window, (group_end - 1) - idx); + return thrust::minimum{}(following_window, (group_end - 1) - idx); }; if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu index ecef90dc8e1..30c39bde7d2 100644 --- a/cpp/src/rolling/rolling_collect_list.cu +++ b/cpp/src/rolling/rolling_collect_list.cu @@ -75,7 +75,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + thrust::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index c8a908e44cd..e9589e6c4b3 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -117,7 +117,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + thrust::equal_to{}, tie_breaker); auto sorted_tied_rank = thrust::make_transform_iterator( dense_rank_sorted.begin(), @@ -171,8 +171,8 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, - thrust::identity{}, + thrust::minimum{}, + thrust::identity{}, stream); } @@ -189,8 +189,8 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, - thrust::identity{}, + thrust::maximum{}, + thrust::identity{}, stream); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index db8b37a9592..3822fa8bf5a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -96,7 +96,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s device_views_ptr + views.size(), std::next(d_partition_offsets.begin()), chars_size_transform{}, - thrust::plus{}); + thrust::plus{}); auto const output_chars_size = d_partition_offsets.back_element(stream); stream.synchronize(); // ensure copy of output_chars_size is complete before returning diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 3ab5b55020c..8d96f0de415 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -153,11 +153,8 @@ std::unique_ptr findall_re( std::vector> results; - size_type const columns = thrust::reduce(rmm::exec_policy(stream), - find_counts.begin(), - find_counts.end(), - 0, - thrust::maximum{}); + size_type const columns = thrust::reduce( + rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 458f3ed885c..7820e0064a6 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -369,7 +369,7 @@ std::pair, int64_t> repeat_strings_output_sizes( thrust::make_counting_iterator(strings_count), fn, int64_t{0}, - thrust::plus{}); + thrust::plus{}); return std::make_pair(std::move(output_sizes), total_bytes); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 5113b418501..c6e52a79059 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -490,11 +490,8 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + auto const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -748,11 +745,8 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string - size_type const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + size_type const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 07eb595449c..d93c1275122 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -51,13 +51,8 @@ struct IteratorTest : public cudf::test::BaseFixture { // Get temporary storage size size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - d_in, - dev_result.begin(), - num_items, - thrust::minimum{}, - init); + cub::DeviceReduce::Reduce( + nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init); // Allocate temporary storage rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default); @@ -68,7 +63,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + thrust::minimum{}, init); evaluate(expected, dev_result, "cub test"); diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 813cceb0861..c80a8fba55c 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -204,13 +204,13 @@ TEST_F(ApplyBooleanMask, FixedPointLargeColumnTest) dec32_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec32_data), - thrust::identity()); + thrust::identity{}); thrust::copy_if(thrust::seq, dec64_data.cbegin(), dec64_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec64_data), - thrust::identity()); + thrust::identity{}); decimal32_wrapper expect_col32( expect_dec32_data.begin(), expect_dec32_data.end(), numeric::scale_type{-3}); diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index ce4280e0733..5872a9e5bb7 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -329,4 +329,4 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}}; EXPECT_EQ(static_cast(c), "1.70141183460469231731687303715884105727"); -} \ No newline at end of file +} diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 7fb7326f221..43d63c9fd22 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -239,10 +239,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); auto ints_view = ints->mutable_view(); - thrust::tabulate(thrust::device, - ints_view.begin(), - ints_view.end(), - thrust::identity()); + thrust::tabulate( + thrust::device, ints_view.begin(), ints_view.end(), thrust::identity{}); // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; auto list_offsets = 
make_numeric_column(data_type{type_id::INT32}, num_rows + 1); From 677e63236a81ea3c402df993845a1fdc98072c9e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:25 -0500 Subject: [PATCH 02/25] Avoid overflow for `fixed_point` `cudf::cast` and performance optimization (#9772) This resolves https://github.com/rapidsai/cudf/issues/9000. When using `cudf::cast` for a wider decimal type to a narrower decimal type, you can overflow. This PR modifies the code path for this specific use case so that the "rescale" happens for the type cast. A small perf improvement was added when you have identical scales to avoid rescaling. CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9772 --- cpp/src/unary/cast_ops.cu | 49 +++++++++++++++++++++------------- cpp/tests/unary/cast_tests.cpp | 13 +++++++++ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e852b00796a..131fde11cf8 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -305,28 +305,39 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr) { using namespace numeric; - - auto const size = input.size(); - auto temporary = - std::make_unique(cudf::data_type{type.id(), input.type().scale()}, - size, - rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream), - input.null_count()); - using SourceDeviceT = device_storage_type_t; using TargetDeviceT = device_storage_type_t; - mutable_column_view output_mutable = *temporary; - - thrust::transform(rmm::exec_policy(stream), - input.begin(), - input.end(), - output_mutable.begin(), - device_cast{}); - - // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); + auto casted = [&]() { + auto const size = input.size(); + auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, + size, + rmm::device_buffer{size * cudf::size_of(type), stream}, + copy_bitmask(input, stream), + input.null_count()); + + mutable_column_view output_mutable = *output; + + thrust::transform(rmm::exec_policy(stream), + input.begin(), + input.end(), + output_mutable.begin(), + device_cast{}); + + return output; + }; + + if (input.type().scale() == type.scale()) return casted(); + + if constexpr (sizeof(SourceDeviceT) < sizeof(TargetDeviceT)) { + // device_cast BEFORE rescale when SourceDeviceT is < TargetDeviceT + auto temporary = casted(); + return detail::rescale(*temporary, scale_type{type.scale()}, stream, mr); + } else { + // device_cast AFTER rescale when SourceDeviceT is > TargetDeviceT to avoid overflow + auto temporary = detail::rescale(input, scale_type{type.scale()}, stream, mr); + return detail::cast(*temporary, type, stream, mr); + } } template view()); } + +TEST_F(FixedPointTestSingleType, Int32ToInt64Convert) +{ + using namespace numeric; + using fp_wrapperA = cudf::test::fixed_point_column_wrapper; + using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperB{{141230900000L}, scale_type{-10}}; + auto const expected = fp_wrapperA{{14123}, 
scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} From 7d8a8e53f495279ae129fa46948c07230d6e77b4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 1 Dec 2021 13:53:05 -0800 Subject: [PATCH 03/25] Allow cast decimal128 to string and add tests (#9756) Small PR that enables Decimal128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9756 --- java/src/main/native/src/ColumnViewJni.cpp | 3 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4efac307627..02d5dc4569c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -916,7 +916,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::INT64: case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; case cudf::type_id::DECIMAL32: - case cudf::type_id::DECIMAL64: result = cudf::strings::from_fixed_point(*column); break; + case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fa9052029cc..31a52eb2ec0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3372,6 +3372,22 @@ void testFixedWidthCast() { } } + @Test + void testCastBigDecimalToString() { + BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002") + }; + + try (ColumnVector cv = ColumnVector.fromDecimals(bigValues); + ColumnVector values = cv.castTo(DType.STRING); + ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002")) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastStringToBigDecimal() { String[] bigValues = {"923121331938210123.321", From 5491cc789bbfbaad7099124dcfe004719e7f013c Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 2 Dec 2021 03:30:50 +0530 Subject: [PATCH 04/25] Fix memory error due to lambda return type deduction limitation (#9778) Fixes #9703 replace device lambda with device functor with return type. (due to [14. 
extended-lambda-restrictions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions) ) ~add `__host__` to lambda for nvcc return type deduction to work properly.~ ~replaced `auto` (generic lambda) with `size_type`.~ fixes shared memory write error caused in #9703 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9778 --- cpp/src/sort/rank.cu | 13 +++++++++---- cpp/tests/sort/rank_test.cpp | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index e9589e6c4b3..de0a44e3234 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -194,6 +194,12 @@ void rank_max(cudf::device_span group_keys, stream); } +// Returns index, count +template +struct index_counter { + __device__ T operator()(size_type i) { return T{i, 1}; } +}; + void rank_average(cudf::device_span group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, @@ -208,10 +214,9 @@ void rank_average(cudf::device_span group_keys, using MinCount = thrust::pair; tie_break_ranks_transform( group_keys, - cudf::detail::make_counting_transform_iterator(1, - [] __device__(auto i) { - return MinCount{i, 1}; - }), + // Use device functor with return type. Cannot use device lambda due to limitation. + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions + cudf::detail::make_counting_transform_iterator(1, index_counter{}), sorted_order_view, rank_mutable_view.begin(), [] __device__(auto rank_count1, auto rank_count2) { diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index 94e389fc7ce..926ad1e203e 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -410,5 +410,19 @@ TYPED_TEST(Rank, min_desc_bottom_pct) this->run_all_tests(rank_method::MIN, desc_bottom, col1_rank, col2_rank, col3_rank, true); } +struct RankLarge : public BaseFixture { +}; + +TEST_F(RankLarge, average_large) +{ + // testcase of https://github.com/rapidsai/cudf/issues/9703 + auto iter = thrust::counting_iterator(0); + fixed_width_column_wrapper col1(iter, iter + 10558); + auto result = + cudf::rank(col1, rank_method::AVERAGE, {}, null_policy::EXCLUDE, null_order::AFTER, false); + fixed_width_column_wrapper expected(iter + 1, iter + 10559); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + } // namespace test } // namespace cudf From c10966cc3847ca9837ddc7ce5df9c4d9b7c743d8 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 2 Dec 2021 18:48:03 +0800 Subject: [PATCH 05/25] Fix make_empty_scalar_like on list_type (#9759) Fixes #9758 In `make_empty_scalar_like`, we create list scalar with the list column itself, which is wrong. The correct way is with the child of list column. 
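For illustration, a minimal sketch of the corrected construction (the wrapper function below is hypothetical; `empty_like`, `lists_column_view` and `make_list_scalar` are the existing libcudf APIs the patch relies on, and the actual fix additionally marks the resulting scalar as invalid):

```cpp
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar_factories.hpp>

// An empty scalar for a LIST column must wrap an empty copy of the child
// (element) column, because a list_scalar's underlying column holds the
// elements of a single list row -- not a list column itself.
std::unique_ptr<cudf::scalar> empty_list_scalar_sketch(cudf::column_view const& col)
{
  auto const empty_child = cudf::empty_like(cudf::lists_column_view(col).child());
  return cudf::make_list_scalar(empty_child->view());
}
```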
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9759 --- cpp/src/scalar/scalar_factories.cpp | 7 +++++-- cpp/tests/reductions/reduction_tests.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index d2876435780..c18b57d220f 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -21,6 +21,7 @@ #include #include +#include #include namespace cudf { @@ -184,10 +185,12 @@ std::unique_ptr make_empty_scalar_like(column_view const& column, { std::unique_ptr result; switch (column.type().id()) { - case type_id::LIST: - result = make_list_scalar(empty_like(column)->view(), stream, mr); + case type_id::LIST: { + auto const empty_child = empty_like(lists_column_view(column).child()); + result = make_list_scalar(empty_child->view(), stream, mr); result->set_valid_async(false, stream); break; + } case type_id::STRUCT: // The input column must have at least 1 row to extract a scalar (row) from it. result = detail::get_element(column, 0, stream, mr); diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index d8ee8f9d08d..e138cd6f68e 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1961,7 +1961,11 @@ struct ListReductionTest : public cudf::test::BaseFixture { cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST)); auto list_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, list_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); } + if (is_valid) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); + } else { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_value, list_result->view()); + } }; if (succeeded_condition) { @@ -2047,7 +2051,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) // test against empty input this->reduction_test(LCW{}, - ElementCol{{0}, {0}}, // expected_value, + ElementCol{}, // expected_value, true, false, cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); From 582cc6e466c7d941e1b34893fd56fbd42fe90d68 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 2 Dec 2021 21:12:01 +0800 Subject: [PATCH 06/25] Add sample JNI API (#9728) Add sample JNI Signed-off-by: Chong Gao Authors: - Chong Gao (https://github.com/res-life) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9728 --- java/src/main/java/ai/rapids/cudf/Table.java | 30 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 15 ++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 21 +++++++++++++ 3 files changed, 66 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b0791fb440f..b11808ed023 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -678,6 +678,8 @@ private static native ContiguousTable[] contiguousSplitGroups(long inputTable, boolean[] keysDescending, boolean[] keysNullSmallest); + private static native long[] sample(long tableHandle, long n, boolean replacement, long seed); + ///////////////////////////////////////////////////////////////////////////// // TABLE 
CREATION APIs ///////////////////////////////////////////////////////////////////////////// @@ -2801,6 +2803,34 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data return result; } + + /** + * Gather `n` samples from table randomly + * Note: does not preserve the ordering + * Example: + * input: {col1: {1, 2, 3, 4, 5}, col2: {6, 7, 8, 9, 10}} + * n: 3 + * replacement: false + * + * output: {col1: {3, 1, 4}, col2: {8, 6, 9}} + * + * replacement: true + * + * output: {col1: {3, 1, 1}, col2: {8, 6, 6}} + * + * throws "logic_error" if `n` > table rows and `replacement` == FALSE. + * throws "logic_error" if `n` < 0. + * + * @param n non-negative number of samples expected from table + * @param replacement Allow or disallow sampling of the same row more than once. + * @param seed Seed value to initiate random number generator. + * + * @return Table containing samples + */ + public Table sample(long n, boolean replacement, long seed) { + return new Table(sample(nativeHandle, n, replacement, seed)); + } + ///////////////////////////////////////////////////////////////////////////// // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a78d40a58f7..f3377bb002d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -3147,4 +3148,18 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input, + jlong n, jboolean replacement, + jlong seed) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input = reinterpret_cast(j_input); + auto sample_with_replacement = + replacement ? 
cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; + std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fa221e19387..0b2f56895e9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7584,4 +7584,25 @@ void testExplodeOuterPosition() { } } } + + @Test + void testSample() { + try (Table t = new Table.TestBuilder().column("s1", "s2", "s3", "s4", "s5").build()) { + try (Table ret = t.sample(3, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(5, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5", "s2", "s1").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(8, true, 0); + Table expected = new Table.TestBuilder() + .column("s1", "s1", "s4", "s5", "s5", "s1", "s3", "s2").build()) { + assertTablesAreEqual(expected, ret); + } + } + } } From 1077daeaad8ff710de6f4fbb99f2e7371b4af8de Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:51:04 -0600 Subject: [PATCH 07/25] Fix caching in `Series.applymap` (#9821) The cache key we were generating for these functions didn't take into account the constants that could be different in the bytecode. Hence certain functions were causing cache hits when they actually differ by a constant value somewhere in the logic. Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9821 --- python/cudf/cudf/tests/test_udf_masked_ops.py | 19 +++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index dc126546f15..c9c2c440632 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -593,3 +593,22 @@ def func(row, c, k): return y run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + expect = data ** 2 + got = data.applymap(lambda x: x ** 2) + + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data ** 3 + got = data.applymap(lambda x: x ** 3) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 5fa091a0081..f0533dcaa72 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -216,12 +216,14 @@ def make_cache_key(udf, sig): recompiling the same function for the same set of types """ codebytes = udf.__code__.co_code + constants = udf.__code__.co_consts if udf.__closure__ is not None: cvars = tuple([x.cell_contents for x in udf.__closure__]) cvarbytes = 
dumps(cvars) else: cvarbytes = b"" - return codebytes, cvarbytes, sig + + return constants, codebytes, cvarbytes, sig def compile_udf(udf, type_signature): From 50acf076d4a35bc57dc00a416f0d9507b1992c0f Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 2 Dec 2021 14:07:31 -0800 Subject: [PATCH 08/25] Fix stream usage in `segmented_gather()` (#9679) `detail::segmented_gather()` inadvertently uses `cuda_default_stream` in some parts of its implementation, while using the user-specified stream in others. This applies to the calls to `copy_range_in_place()`, `allocate_like()`, and `make_lists_column()`. ~This might produce race conditions, which might explain NVIDIA/spark-rapids/issues/4060. It's a rare failure that's quite hard to reproduce.~ This might lead to over-synchronization, though bad output is unlikely. The commit here should sort this out, by switching to the `detail` APIs corresponding to the calls above. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9679 --- cpp/src/lists/copying/segmented_gather.cu | 21 ++++++++++++--------- cpp/src/lists/extract.cu | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 8cbcddc1c58..41187b96cdb 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include -#include #include #include #include @@ -88,14 +88,15 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child = std::move(child_table->release().front()); // Create list offsets from gather_map. - auto output_offset = cudf::allocate_like( - gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, mr); + auto output_offset = cudf::detail::allocate_like( + gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, stream, mr); auto output_offset_view = output_offset->mutable_view(); - cudf::copy_range_in_place(gather_map.offsets(), - output_offset_view, - gather_map.offset(), - gather_map.offset() + output_offset_view.size(), - 0); + cudf::detail::copy_range_in_place(gather_map.offsets(), + output_offset_view, + gather_map.offset(), + gather_map.offset() + output_offset_view.size(), + 0, + stream); // Assemble list column & return auto null_mask = cudf::detail::copy_bitmask(value_column.parent(), stream, mr); size_type null_count = value_column.null_count(); @@ -103,7 +104,9 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::move(output_offset), std::move(child), null_count, - std::move(null_mask)); + std::move(null_mask), + stream, + mr); } } // namespace detail diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 381864e1a68..7c6c612eb25 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -53,7 +53,7 @@ std::unique_ptr make_index_child(column_view const& indices, // `segmented_gather()` on a null index should produce a null row. 
if (not indices.nullable()) { return std::make_unique(indices, stream); } - auto const d_indices = column_device_view::create(indices); + auto const d_indices = column_device_view::create(indices, stream); // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them. auto const null_replaced_iter_begin = cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits::max()); From b848dd5c9cfef7e3523810d67296e037f31945c1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 2 Dec 2021 14:40:57 -0800 Subject: [PATCH 09/25] Fix ORC writer crash with empty input columns (#9808) Fixes https://github.com/rapidsai/cudf/issues/9783 Skip some parts of writing when the input table was zero rows. Add is_empty to `hostdevice_2dvector`. Add Python test with empty columns. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9808 --- cpp/src/io/orc/writer_impl.cu | 338 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 + python/cudf/cudf/tests/test_orc.py | 15 + 3 files changed, 188 insertions(+), 166 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e53fb3589bc..db02125ce77 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -579,12 +579,15 @@ orc_streams writer::impl::create_streams(host_span columns, } auto const direct_data_size = - std::accumulate(segmentation.stripes.front().cbegin(), - segmentation.stripes.back().cend(), - size_t{0}, - [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); + segmentation.num_stripes() == 0 + ? 
0 + : std::accumulate(segmentation.stripes.front().cbegin(), + segmentation.stripes.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { @@ -988,17 +991,19 @@ encoded_data encode_columns(orc_table_view const& orc_table, } chunk_streams.host_to_device(stream); - if (orc_table.num_string_columns() != 0) { - auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); - gpu::EncodeStripeDictionaries(d_stripe_dict, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); - } + if (orc_table.num_rows() > 0) { + if (orc_table.num_string_columns() != 0) { + auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); + gpu::EncodeStripeDictionaries(d_stripe_dict, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); + } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + } dictionaries.data.clear(); dictionaries.index.clear(); stream.synchronize(); @@ -1803,7 +1808,7 @@ void writer::impl::write(table_view const& table) auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream); hostdevice_2dvector dict( rowgroup_bounds.size().first, orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not dict.is_empty()) { init_dictionaries(orc_table, rowgroup_bounds, dictionaries.d_data_view, @@ -1819,7 +1824,7 @@ void writer::impl::write(table_view const& table) // Build stripe-level dictionaries hostdevice_2dvector stripe_dict( segmentation.num_stripes(), orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not stripe_dict.is_empty()) { build_dictionaries(orc_table, segmentation.stripes, dict, @@ -1842,165 +1847,166 @@ void writer::impl::write(table_view const& table) segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - // Gather column statistics - std::vector column_stats; - if (enable_statistics_ && table.num_columns() > 0 && num_rows > 0) { - column_stats = gather_statistic_blobs(orc_table, segmentation); - } + if (num_rows > 0) { + // Gather column statistics + auto const column_stats = enable_statistics_ && table.num_columns() > 0 + ? 
gather_statistic_blobs(orc_table, segmentation) + : std::vector{}; - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - size_t max_compressed_block_size = 0; - if (compression_kind_ != NONE) { - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); - } - auto stream_output = [&]() { - size_t max_stream_size = 0; - bool all_device_write = true; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + } + max_stream_size = std::max(max_stream_size, stream_size); + } - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * BLOCK_HEADER_SIZE; - num_compressed_blocks += num_blocks; - compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } - max_stream_size = std::max(max_stream_size, stream_size); - } + }(); - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); + if (compression_kind_ != NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind_, + compression_blocksize_, + max_compressed_block_size, + strm_descs, + enc_data.streams, + comp_in, + comp_out, + stream); + strm_descs.device_to_host(stream); + comp_out.device_to_host(stream, true); } - }(); - - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, 
stream); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - strm_descs, - enc_data.streams, - comp_in, - comp_out, - stream); - strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); - } - ProtobufWriter pbw_(&buffer_); - - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const& rowgroups_range = segmentation.stripes[stripe_id]; - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - rowgroups_range, - enc_data.streams, - strm_descs, - comp_out, - &stripe, - &streams, - &pbw_); - } + ProtobufWriter pbw_(&buffer_); + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto const& rowgroups_range = segmentation.stripes[stripe_id]; + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + rowgroups_range, + enc_data.streams, + strm_descs, + comp_out, + &stripe, + &streams, + &pbw_); + } - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back( - write_data_stream(strm_desc, - enc_data.streams[strm_desc.column_id][rowgroups_range.first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back( + write_data_stream(strm_desc, + enc_data.streams[strm_desc.column_id][rowgroups_range.first], + static_cast(compressed_data.data()), + stream_output.get(), + &stripe, + &streams)); + } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + (sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + buffer_.resize((compression_kind_ != NONE) ? 
3 : 0); + pbw_.write(sf); + stripe.footerLength = buffer_.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + buffer_[0] = static_cast(uncomp_sf_len >> 0); + buffer_[1] = static_cast(uncomp_sf_len >> 8); + buffer_[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(buffer_.data(), buffer_.size()); } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(sf); - stripe.footerLength = buffer_.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_sf_len >> 0); - buffer_[1] = static_cast(uncomp_sf_len >> 8); - buffer_[2] = static_cast(uncomp_sf_len >> 16); + for (auto const& task : write_tasks) { + task.wait(); } - out_sink_->host_write(buffer_.data(), buffer_.size()); - } - for (auto const& task : write_tasks) { - task.wait(); - } - if (column_stats.size() != 0) { - // File-level statistics - // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls - if (single_write_mode) { - // First entry contains total number of rows - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(num_rows); - ff.statistics.reserve(1 + orc_table.num_columns()); - ff.statistics.emplace_back(std::move(buffer_)); - // Add file stats, stored after stripe stats in `column_stats` - ff.statistics.insert( - ff.statistics.end(), - std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), - std::make_move_iterator(column_stats.end())); - } - // Stripe-level statistics - size_t first_stripe = md.stripeStats.size(); - md.stripeStats.resize(first_stripe + stripes.size()); - for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { - md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(stripes[stripe_id].numberOfRows); - md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); - for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { - size_t idx = stripes.size() * col_idx + stripe_id; - if (idx < column_stats.size()) { - md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = - std::move(column_stats[idx]); + if (not column_stats.empty()) { + // File-level statistics + // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls + if (single_write_mode) { + // First entry contains total number of rows + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(num_rows); + ff.statistics.reserve(1 + orc_table.num_columns()); + ff.statistics.emplace_back(std::move(buffer_)); + // Add file stats, stored after stripe stats in `column_stats` + ff.statistics.insert( + ff.statistics.end(), + std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), + std::make_move_iterator(column_stats.end())); + } + // Stripe-level statistics + size_t first_stripe = md.stripeStats.size(); + md.stripeStats.resize(first_stripe + stripes.size()); + for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { + md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(stripes[stripe_id].numberOfRows); + md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); + for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { + size_t 
idx = stripes.size() * col_idx + stripe_id; + if (idx < column_stats.size()) { + md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = + std::move(column_stats[idx]); + } } } } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 283715478a0..a7f9aec7bb4 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -179,6 +179,7 @@ class hostdevice_2dvector { auto size() const noexcept { return _size; } auto count() const noexcept { return _size.first * _size.second; } + auto is_empty() const noexcept { return count() == 0; } T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 6b02874146e..dc176992434 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1526,3 +1526,18 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): # Segfaults when RLE stream sizes don't account for varint length pa_out = pa.orc.ORCFile(reencoded).read() assert_eq(df.to_pandas(), pa_out) + + +def test_empty_columns(): + buffer = BytesIO() + # string and decimal columns have additional steps that need to be skipped + expected = cudf.DataFrame( + { + "string": cudf.Series([], dtype="str"), + "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)), + } + ) + expected.to_orc(buffer, compression="snappy") + + got_df = cudf.read_orc(buffer) + assert_eq(expected, got_df) From 0c08543955a01470baa4fbdbab927298dcf6afd9 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 3 Dec 2021 04:53:37 +0530 Subject: [PATCH 10/25] Update cmake and conda to 22.02 (#9746) Changes related to update to 22.02 in one conda environment recipe (only 11.5) was missed. This adds that. Also makes project version changes in cmake related to update from 21.12 to 22.02. 
Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9746 --- ci/release/update-version.sh | 6 +++--- cpp/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eeb76a15fcc..86432a92128 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -30,13 +30,13 @@ function sed_runner() { } # cpp update -sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt # cpp libcudf_kafka update -sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt # cpp cudf_jni update -sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50bdc30b292..e2b317f2e03 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 21.12.00 + VERSION 22.02.00 LANGUAGES C CXX CUDA ) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 435ff3b5987..d0874b57c2d 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 21.12.00 + VERSION 22.02.00 LANGUAGES CXX ) From ce64e53264d21c6e59fe98548796a7b6bae24c07 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 2 Dec 2021 20:19:12 -0600 Subject: [PATCH 11/25] Add directory-partitioned data support to cudf.read_parquet (#9720) Closes #9684 Closes #9690 This PR refactors path handling in `cudf.read_parquet` and uses `pyarrow.dataset` to add support for directory-partitioned datasets (with full filtering support at row-group granularity). Since it is my understanding that some users may wish for directory-partitioned columns to be represented as a raw dtype (rather than always becoming categorical), I also added an optional `categorical_partitions` argument (open to suggestions on a better name). 
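A minimal usage sketch (illustrative only: the dataset path, column names, and data below are placeholders, while the `to_parquet`/`read_parquet` arguments mirror the tests added in this PR):

```python
import numpy as np
import cudf

# NOTE: "dataset_dir" and the column names are placeholders for illustration.
# Write a small directory-partitioned dataset: one subdirectory per unique
# value of the partition column (e.g. dataset_dir/b=a/..., dataset_dir/b=b/...).
df = cudf.DataFrame(
    {
        "a": np.arange(100, dtype="int64"),
        "b": np.random.choice(list("abcd"), size=100),
    }
)
df.to_parquet("dataset_dir", partition_cols=["b"])

# Read the directory back. The partition column "b" is reconstructed from the
# directory names and returned as a categorical column by default, and
# `filters` can reference it directly.
got = cudf.read_parquet("dataset_dir", filters=[("b", "==", "a")])

# Opt out of the categorical reconstruction to get the raw dtype instead.
raw = cudf.read_parquet("dataset_dir", categorical_partitions=False)
```

With `categorical_partitions=False` the partition keys come back with their raw dtype (`object` for the string column here), which is the behaviour exercised in `test_read_parquet_partitioned_filtered` below.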
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9720 --- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/io/parquet.py | 286 +++++++++++++++++++---- python/cudf/cudf/tests/test_parquet.py | 94 +++++++- python/cudf/cudf/tests/test_s3.py | 9 +- python/cudf/cudf/utils/ioutils.py | 26 ++- python/dask_cudf/dask_cudf/io/parquet.py | 7 +- 7 files changed, 355 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index a48cfd07d3f..1f876214b16 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -37,7 +37,7 @@ def read_json( for source in path_or_buf: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs ) source = ioutils.stringify_pathlike(source) source = fs.sep.join([source, "*.json"]) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index ecb1b0cd185..c1cce3f996f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -316,7 +316,7 @@ def read_orc( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9d665d9a0a5..04d64969a16 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -12,6 +12,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like +from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils @@ -80,7 +81,7 @@ def write_to_dataset( kwargs for to_parquet function. """ - fs = ioutils._ensure_filesystem(fs, root_path) + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) fs.mkdirs(root_path, exist_ok=True) metadata = [] @@ -163,11 +164,19 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -def _process_row_groups(paths, fs, filters=None, row_groups=None): +def _process_dataset( + paths, fs, filters=None, row_groups=None, categorical_partitions=True, +): + # Returns: + # file_list - Expanded/filtered list of paths + # row_groups - Filtered list of row-group selections + # partition_keys - list of partition keys for each file + # partition_categories - Categories for each partition # The general purpose of this function is to (1) expand # directory input into a list of paths (using the pyarrow - # dataset API), and (2) to apply row-group filters. 
+ # dataset API), (2) to apply row-group filters, and (3) + # to discover directory-partitioning information # Deal with case that the user passed in a directory name file_list = paths @@ -186,28 +195,107 @@ def _process_row_groups(paths, fs, filters=None, row_groups=None): if len(file_list) == 0: raise FileNotFoundError(f"{paths} could not be resolved to any files") - if filters is not None: - # Load IDs of filtered row groups for each file in dataset - filtered_rg_ids = defaultdict(list) - for fragment in dataset.get_fragments(filter=filters): - for rg_fragment in fragment.split_by_row_group(filters): - for rg_info in rg_fragment.row_groups: - filtered_rg_ids[rg_fragment.path].append(rg_info.id) - - # Initialize row_groups to be selected - if row_groups is None: - row_groups = [None for _ in dataset.files] - - # Store IDs of selected row groups for each file - for i, file in enumerate(dataset.files): - if row_groups[i] is None: - row_groups[i] = filtered_rg_ids[file] - else: - row_groups[i] = filter( - lambda id: id in row_groups[i], filtered_rg_ids[file] + # Deal with directory partitioning + # Get all partition keys (without filters) + partition_categories = defaultdict(list) + file_fragment = None + for file_fragment in dataset.get_fragments(): + keys = ds._get_partition_keys(file_fragment.partition_expression) + if not (keys or partition_categories): + # Bail - This is not a directory-partitioned dataset + break + for k, v in keys.items(): + if v not in partition_categories[k]: + partition_categories[k].append(v) + if not categorical_partitions: + # Bail - We don't need to discover all categories. + # We only need to save the partition keys from this + # first `file_fragment` + break + + if partition_categories and file_fragment is not None: + # Check/correct order of `categories` using last file_frag, + # because `_get_partition_keys` does NOT preserve the + # partition-hierarchy order of the keys. + cat_keys = [ + part.split("=")[0] + for part in file_fragment.path.split(fs.sep) + if "=" in part + ] + if set(partition_categories) == set(cat_keys): + partition_categories = { + k: partition_categories[k] + for k in cat_keys + if k in partition_categories + } + + # If we do not have partitioned data and + # are not filtering, we can return here + if filters is None and not partition_categories: + return file_list, row_groups, [], {} + + # Record initial row_groups input + row_groups_map = {} + if row_groups is not None: + # Make sure paths and row_groups map 1:1 + # and save the initial mapping + if len(paths) != len(file_list): + raise ValueError( + "Cannot specify a row_group selection for a directory path." 
+ ) + row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)} + + # Apply filters and discover partition columns + partition_keys = [] + if partition_categories or filters is not None: + file_list = [] + if filters is not None: + row_groups = [] + for file_fragment in dataset.get_fragments(filter=filters): + path = file_fragment.path + + # Extract hive-partition keys, and make sure they + # are orederd the same as they are in `partition_categories` + if partition_categories: + raw_keys = ds._get_partition_keys( + file_fragment.partition_expression + ) + partition_keys.append( + [ + (name, raw_keys[name]) + for name in partition_categories.keys() + ] ) - return file_list, row_groups + # Apply row-group filtering + selection = row_groups_map.get(path, None) + if selection is not None or filters is not None: + filtered_row_groups = [ + rg_info.id + for rg_fragment in file_fragment.split_by_row_group( + filters, schema=dataset.schema, + ) + for rg_info in rg_fragment.row_groups + ] + file_list.append(path) + if filters is not None: + if selection is None: + row_groups.append(filtered_row_groups) + else: + row_groups.append( + [ + rg_id + for rg_id in filtered_row_groups + if rg_id in selection + ] + ) + + return ( + file_list, + row_groups, + partition_keys, + partition_categories if categorical_partitions else {}, + ) def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs): @@ -319,6 +407,7 @@ def read_parquet( strings_to_categorical=False, use_pandas_metadata=True, use_python_file_object=False, + categorical_partitions=True, *args, **kwargs, ): @@ -345,17 +434,29 @@ def read_parquet( # Start by trying construct a filesystem object, so we # can apply filters on remote file-systems fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs) - filepath_or_buffer = paths if paths else filepath_or_buffer - if fs is None and filters is not None: - raise ValueError("cudf cannot apply filters to open file objects.") - # Apply filters now (before converting non-local paths to buffers). - # Note that `_process_row_groups` will also expand `filepath_or_buffer` - # into a full list of files if it is a directory. - if fs is not None: - filepath_or_buffer, row_groups = _process_row_groups( - filepath_or_buffer, fs, filters=filters, row_groups=row_groups, + # Use pyarrow dataset to detect/process directory-partitioned + # data and apply filters. Note that we can only support partitioned + # data and filtering if the input is a single directory or list of + # paths. + partition_keys = [] + partition_categories = {} + if fs and paths: + ( + paths, + row_groups, + partition_keys, + partition_categories, + ) = _process_dataset( + paths, + fs, + filters=filters, + row_groups=row_groups, + categorical_partitions=categorical_partitions, ) + elif filters is not None: + raise ValueError("cudf cannot apply filters to open file objects.") + filepath_or_buffer = paths if paths else filepath_or_buffer # Check if we should calculate the specific byte-ranges # needed for each parquet file. 
We always do this when we @@ -380,15 +481,6 @@ def read_parquet( filepaths_or_buffers = [] for i, source in enumerate(filepath_or_buffer): - if ioutils.is_directory(source, **kwargs): - # Note: For now, we know `fs` is an fsspec filesystem - # object, but it may be an arrow object in the future - fsspec_fs = ioutils._ensure_filesystem( - passed_filesystem=fs, path=source - ) - source = ioutils.stringify_pathlike(source) - source = fsspec_fs.sep.join([source, "*.parquet"]) - tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, @@ -410,6 +502,117 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) + # Warn user if they are not using cudf for IO + # (There is a good chance this was not the intention) + if engine != "cudf": + warnings.warn( + "Using CPU via PyArrow to read Parquet dataset." + "This option is both inefficient and unstable!" + ) + if filters is not None: + warnings.warn( + "Parquet row-group filtering is only supported with " + "'engine=cudf'. Use pandas or pyarrow API directly " + "for full CPU-based filtering functionality." + ) + + return _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + skiprows=skiprows, + num_rows=num_rows, + strings_to_categorical=strings_to_categorical, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + **kwargs, + ) + + +def _parquet_to_frame( + paths_or_buffers, + *args, + row_groups=None, + partition_keys=None, + partition_categories=None, + **kwargs, +): + + # If this is not a partitioned read, only need + # one call to `_read_parquet` + if not partition_keys: + return _read_parquet( + paths_or_buffers, *args, row_groups=row_groups, **kwargs, + ) + + # For partitioned data, we need a distinct read for each + # unique set of partition keys. Therefore, we start by + # aggregating all paths with matching keys using a dict + plan = {} + for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)): + rgs = row_groups[i] if row_groups else None + tkeys = tuple(keys) + if tkeys in plan: + plan[tkeys][0].append(path) + if rgs is not None: + plan[tkeys][1].append(rgs) + else: + plan[tkeys] = ([path], None if rgs is None else [rgs]) + + dfs = [] + for part_key, (key_paths, key_row_groups) in plan.items(): + # Add new DataFrame to our list + dfs.append( + _read_parquet( + key_paths, *args, row_groups=key_row_groups, **kwargs, + ) + ) + # Add partition columns to the last DataFrame + for (name, value) in part_key: + if partition_categories and name in partition_categories: + # Build the categorical column from `codes` + codes = as_column( + partition_categories[name].index(value), + length=len(dfs[-1]), + ) + dfs[-1][name] = build_categorical_column( + categories=partition_categories[name], + codes=codes, + size=codes.size, + offset=codes.offset, + ordered=False, + ) + else: + # Not building categorical columns, so + # `value` is already what we want + dfs[-1][name] = as_column(value, length=len(dfs[-1])) + + # Concatenate dfs and return. + # Assume we can ignore the index if it has no name. 
+ return ( + cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + if len(dfs) > 1 + else dfs[0] + ) + + +def _read_parquet( + filepaths_or_buffers, + engine, + columns=None, + row_groups=None, + skiprows=None, + num_rows=None, + strings_to_categorical=None, + use_pandas_metadata=None, + *args, + **kwargs, +): + # Simple helper function to dispatch between + # cudf and pyarrow to read parquet data if engine == "cudf": return libparquet.read_parquet( filepaths_or_buffers, @@ -421,7 +624,6 @@ def read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - warnings.warn("Using CPU via PyArrow to read Parquet dataset.") return cudf.DataFrame.from_arrow( pq.ParquetDataset(filepaths_or_buffers).read_pandas( columns=columns, *args, **kwargs diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b6595be9566..516ee0d17d3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1578,7 +1578,7 @@ def test_parquet_writer_bytes_io(simple_gdf): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) -def test_parquet_write_partitioned(tmpdir_factory, cols, filename): +def test_parquet_partitioned(tmpdir_factory, cols, filename): # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1597,10 +1597,14 @@ def test_parquet_write_partitioned(tmpdir_factory, cols, filename): gdf_dir, index=False, partition_cols=cols, partition_file_name=filename ) - # Use pandas since dataset may be partitioned - expect = pd.read_parquet(pdf_dir) - got = pd.read_parquet(gdf_dir) - assert_eq(expect, got) + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct if filename: @@ -1629,9 +1633,9 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) - # cudf read_parquet cannot handle partitioned dataset - expect = pd.read_parquet(dir1) - got = pd.read_parquet(dir2) + # Read back with cudf + expect = cudf.read_parquet(dir1) + got = cudf.read_parquet(dir2) assert_eq(expect, got) gdf = cudf.DataFrame( @@ -1645,6 +1649,80 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) +@pytest.mark.parametrize( + "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], +) +@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) +@pytest.mark.parametrize("use_cat", [True, False]) +def test_read_parquet_partitioned_filtered( + tmpdir, pfilters, selection, use_cat +): + path = str(tmpdir) + size = 100 + df = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + df.to_parquet(path, partition_cols=["c", "b"]) + + if selection == "files": + # Pass in a list of paths + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = None + elif selection == "row-groups": + # Pass in a list of paths AND row-group ids + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = [[0] for p in read_path] + else: + # Pass in a directory path + # 
(row-group selection not allowed in this case) + read_path = path + row_groups = None + + # Filter on partitioned columns + expect = pd.read_parquet(read_path, filters=pfilters) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) + if use_cat: + assert got.dtypes["b"] == "category" + assert got.dtypes["c"] == "category" + else: + # Check that we didn't get categorical + # columns, but convert back to categorical + # for comparison with pandas + assert got.dtypes["b"] == "object" + assert got.dtypes["c"] == "int" + got["b"] = pd.Categorical( + got["b"].to_pandas(), categories=list("abcd") + ) + got["c"] = pd.Categorical( + got["c"].to_pandas(), categories=np.arange(4) + ) + assert_eq(expect, got) + + # Filter on non-partitioned column. + # Cannot compare to pandas, since the pyarrow + # backend will filter by row (and cudf can + # only filter by column, for now) + filters = [("a", "==", 10)] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and 10 in got["a"] + + # Filter on both kinds of columns + filters = [[("a", "==", 10)], [("c", "==", 1)]] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"]) + + def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): gdf_fname = tmpdir.join("gdf.parquet") test_path = "test/path" diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index dea876891f8..5738e1f0d00 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -346,12 +346,17 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) -def test_write_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("partition_cols", [None, ["String"]]) +def test_write_parquet(s3_base, s3so, pdf, partition_cols): fname = "test_parquet_writer.parquet" bname = "parquet" gdf = cudf.from_pandas(pdf) with s3_context(s3_base=s3_base, bucket=bname) as s3fs: - gdf.to_parquet("s3://{}/{}".format(bname, fname), storage_options=s3so) + gdf.to_parquet( + "s3://{}/{}".format(bname, fname), + partition_cols=partition_cols, + storage_options=s3so, + ) assert s3fs.exists("s3://{}/{}".format(bname, fname)) got = pd.read_parquet(s3fs.open("s3://{}/{}".format(bname, fname))) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0f9d9d53b23..e6c031acac7 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -154,6 +154,9 @@ strings_to_categorical : boolean, default False If True, return string columns as GDF_CATEGORY dtype; if False, return a as GDF_STRING dtype. +categorical_partitions : boolean, default True + Whether directory-partitioned columns should be interpreted as categorical + or raw dtypes. use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. 
@@ -1129,7 +1132,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs, _, paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) except ValueError as e: @@ -1153,9 +1156,9 @@ def is_directory(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options - ) + )[0] except ValueError as e: if str(e).startswith("Protocol not known"): return False @@ -1189,10 +1192,8 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): else: path_or_data = [path_or_data] - # Pyarrow did not support the protocol or storage options. - # Fall back to fsspec try: - fs, _, fs_paths = fsspec.get_fs_token_paths( + fs, _, fs_paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) return_paths = fs_paths @@ -1322,9 +1323,9 @@ def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs): if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options", {}) path_or_data = os.path.expanduser(path_or_data) - fs, _, _ = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode=mode or "w", storage_options=storage_options - ) + )[0] if not _is_local_filesystem(fs): filepath_or_buffer = fsspec.open( @@ -1513,11 +1514,12 @@ def _prepare_filters(filters): return filters -def _ensure_filesystem(passed_filesystem, path): +def _ensure_filesystem(passed_filesystem, path, **kwargs): if passed_filesystem is None: - return get_fs_token_paths(path[0] if isinstance(path, list) else path)[ - 0 - ] + return get_fs_token_paths( + path[0] if isinstance(path, list) else path, + storage_options=kwargs.get("storage_options", {}), + )[0] return passed_filesystem diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index b47a5e78095..a49d73493ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,11 +126,8 @@ def _read_paths( # Build the column from `codes` directly # (since the category is often a larger dtype) - codes = ( - as_column(partitions[i].keys.index(index2)) - .as_frame() - .repeat(len(df)) - ._data[None] + codes = as_column( + partitions[i].keys.index(index2), length=len(df), ) df[name] = build_categorical_column( categories=partitions[i].keys, From e82cc62e2ea61211c64ba4784cb131d5b535644c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 04:46:25 -0800 Subject: [PATCH 12/25] Fix join of MultiIndex to Index with one column and overlapping name. 
(#9830) This PR resolves #9823 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9830 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..2fcc976d8e1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1147,14 +1147,14 @@ def join( if isinstance(lhs, cudf.MultiIndex): if level is not None and isinstance(level, int): on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names + right_names = (on,) if on is not None else right_names on = right_names[0] if how == "outer": how = "left" elif how == "right": how = "inner" else: - # Both are nomal indices + # Both are normal indices right_names = left_names on = right_names[0] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0518cc2c9b9..d25c6130bfb 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2150,3 +2150,16 @@ def test_join_redundant_params(): lhs.merge(rhs, right_on="a", left_index=True, right_index=True) with pytest.raises(ValueError): lhs.merge(rhs, left_on="c", right_on="b") + + +def test_join_multiindex_index(): + # test joining a MultiIndex with an Index with overlapping name + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) + rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index + expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") + got = lhs.join(rhs, how="inner") + assert_join_results_equal(expect, got, how="inner") From 69e6dbbf447a951e4b08f15c737eedcbaf3291da Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 3 Dec 2021 10:18:04 -0500 Subject: [PATCH 13/25] Move the binary_ops common dispatcher logic to be executed on the CPU (#9816) * move NullEquals to separate file * To improve runtime performance move more binary_ops dispatch to host * make sure to forceinline the operator_dispatcher * Correct style issues found by ci * Expand the binary-op compiled benchmark suite * Ensure forceinline is on binary ops device dispatch functions * Correct style issues found by ci Co-authored-by: Karthikeyan Natarajan Co-authored-by: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> --- cpp/CMakeLists.txt | 1 + .../binaryop/compiled_binaryop_benchmark.cpp | 66 ++++++++++--------- .../cudf/utilities/type_dispatcher.hpp | 14 ++-- cpp/src/binaryop/compiled/NullEquals.cu | 26 ++++++++ cpp/src/binaryop/compiled/binary_ops.cu | 2 +- cpp/src/binaryop/compiled/binary_ops.cuh | 63 ++++++++++++------ cpp/src/binaryop/compiled/equality_ops.cu | 41 ++++++++---- 7 files changed, 141 insertions(+), 72 deletions(-) create mode 100644 cpp/src/binaryop/compiled/NullEquals.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 59dc3c74af2..37f93f1868b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,6 +185,7 @@ add_library( src/binaryop/compiled/LogicalOr.cu src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullEquals.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu diff --git 
a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index bc0818ace4b..8d04f8bdcb2 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -50,14 +50,14 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ +#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \ BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ (::benchmark::State & st) \ { \ BM_compiled_binaryop(st, cudf::binary_operator::binop); \ } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ ->Unit(benchmark::kMicrosecond) \ ->UseManualTime() \ ->Arg(10000) /* 10k */ \ @@ -70,30 +70,36 @@ using namespace cudf; using namespace numeric; // clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s); +BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms); +BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, 
int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms); +BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index a04b8309142..d7d38aba4f3 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -531,7 +531,7 @@ template struct double_type_dispatcher_second_type { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const + CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -541,9 +541,7 @@ template