From 11b81262634a12e520a653e0bb861da698d353da Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Jan 2024 14:59:59 -0800 Subject: [PATCH 01/16] Rewrite hash groupby with hash set --- cpp/src/groupby/hash/groupby.cu | 101 ++++++++++---------- cpp/src/groupby/hash/groupby_kernels.cuh | 18 ++-- cpp/src/groupby/hash/multi_pass_kernels.cuh | 13 ++- 3 files changed, 66 insertions(+), 66 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 32693487c32..59c416e2892 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,9 +45,13 @@ #include #include #include -#include +#include #include +#include + + +#include #include #include @@ -67,15 +71,24 @@ namespace detail { namespace hash { namespace { -// TODO: replace it with `cuco::static_map` -// https://github.com/rapidsai/cudf/issues/10401 +int constexpr cg_size = 1; ///< Number of threads used to handle each input key +int constexpr window_size = 1; ///< Number of slots checked per thread +cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot + +using probing_scheme_type = cuco::experimental::linear_probing>; +using allocator_type = rmm::mr::stream_allocator_adaptor>; + template -using map_type = concurrent_unordered_map< - cudf::size_type, +using set_type = cuco::experimental::static_set< cudf::size_type, - cudf::experimental::row::hash::device_row_hasher, - ComparatorType>; + cuco::experimental::extent, + cuda::thread_scope_device, + ComparatorType, + probing_scheme_type, + allocator_type, + cuco::experimental::storage>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -191,14 +204,14 @@ class 
groupby_simple_aggregations_collector final } }; -template +template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; data_type result_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - map_type const& map; + SetType set; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -210,7 +223,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -218,7 +231,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), - map(map), + set(set), row_bitmask(row_bitmask), stream(stream), mr(mr) @@ -341,8 +354,8 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor>{ - map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + ::cudf::detail::var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); } @@ -399,13 +412,13 @@ flatten_single_pass_aggs(host_span requests) * * @see groupby_null_templated() */ -template +template void sparse_to_dense_results(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bool keys_have_nulls, null_policy 
include_null_keys, rmm::cuda_stream_view stream, @@ -424,7 +437,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -468,11 +481,11 @@ auto create_sparse_results_table(table_view const& flattened_values, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template +template void compute_single_pass_aggs(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, - map_type& map, + SetType set, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream) @@ -498,8 +511,8 @@ void compute_single_pass_aggs(table_view const& keys, thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), keys.num_rows(), - hash::compute_single_pass_aggs_fn>{ - map, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), @@ -518,23 +531,15 @@ void compute_single_pass_aggs(table_view const& keys, * @brief Computes and returns a device vector containing all populated keys in * `map`. 
*/ -template -rmm::device_uvector extract_populated_keys(map_type const& map, +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, size_type num_keys, rmm::cuda_stream_view stream) { rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - auto const get_key = cuda::proclaim_return_type::key_type>( - [] __device__(auto const& element) { return element.first; }); // first = key - auto const key_used = [unused = map.get_unused_key()] __device__(auto key) { - return key != unused; - }; - auto const key_itr = thrust::make_transform_iterator(map.data(), get_key); - auto const end_it = cudf::detail::copy_if_safe( - key_itr, key_itr + map.capacity(), populated_keys.begin(), key_used, stream); - - populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream); + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); return populated_keys; } @@ -581,30 +586,26 @@ std::unique_ptr groupby(table_view const& keys, auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - size_type constexpr unused_key{std::numeric_limits::max()}; - size_type constexpr unused_value{std::numeric_limits::max()}; - // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - using allocator_type = typename map_type::allocator_type; - - auto const map = map_type::create(compute_hash_table_size(num_keys), - stream, - unused_key, - unused_value, - d_row_hash, - d_key_equal, - allocator_type()); - // Compute all single pass aggs first + auto const set = set_type{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + 
allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first compute_single_pass_aggs( - keys, requests, &sparse_results, *map, keys_have_nulls, include_null_keys, stream); + keys, requests, &sparse_results, set.ref(cuco::experimental::insert_and_find), keys_have_nulls, include_null_keys, stream); // Extract the populated indices from the hash map and create a gather map. // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(*map, keys.num_rows(), stream); + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -612,7 +613,7 @@ std::unique_ptr
groupby(table_view const& keys, &sparse_results, cache, gather_map, - *map, + set.ref(cuco::experimental::find), keys_have_nulls, include_null_keys, stream, diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index eedb07200a5..1ffe870eeb4 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,11 +60,11 @@ namespace hash { * rows. In this way, after all rows are aggregated, `output_values` will likely * be "sparse", meaning that not all rows contain the result of an aggregation. * - * @tparam Map The type of the hash map + * @tparam SetType The type of the hash set device ref */ -template +template struct compute_single_pass_aggs_fn { - Map map; + SetType set; table_device_view input_values; mutable_table_device_view output_values; aggregation::Kind const* __restrict__ aggs; @@ -74,7 +74,7 @@ struct compute_single_pass_aggs_fn { /** * @brief Construct a new compute_single_pass_aggs_fn functor object * - * @param map Hash map object to insert key,value pairs into. + * @param set_ref Hash map object to insert key,value pairs into. * @param input_values The table whose rows will be aggregated in the values * of the hash map * @param output_values Table that stores the results of aggregating rows of @@ -87,13 +87,13 @@ struct compute_single_pass_aggs_fn { * null values should be skipped. It `true`, it is assumed `row_bitmask` is a * bitmask where bit `i` indicates the presence of a null value in row `i`. 
*/ - compute_single_pass_aggs_fn(Map map, + compute_single_pass_aggs_fn(SetType set, table_device_view input_values, mutable_table_device_view output_values, aggregation::Kind const* aggs, bitmask_type const* row_bitmask, bool skip_rows_with_nulls) - : map(map), + : set(set), input_values(input_values), output_values(output_values), aggs(aggs), @@ -105,10 +105,10 @@ struct compute_single_pass_aggs_fn { __device__ void operator()(size_type i) { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { - auto result = map.insert(thrust::make_pair(i, i)); + auto const result = set.insert_and_find(i); cudf::detail::aggregate_row( - output_values, result.first->second, input_values, i, aggs); + output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 4bc73631732..51432ed5de8 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,23 +31,23 @@ namespace cudf { namespace detail { -template +template struct var_hash_functor { - Map const map; + SetType set; bitmask_type const* __restrict__ row_bitmask; mutable_column_device_view target; column_device_view source; column_device_view sum; column_device_view count; size_type ddof; - var_hash_functor(Map const map, + var_hash_functor(SetType set, bitmask_type const* row_bitmask, mutable_column_device_view target, column_device_view source, column_device_view sum, column_device_view count, size_type ddof) - : map(map), + : set(set), row_bitmask(row_bitmask), target(target), source(source), @@ -96,8 +96,7 @@ struct var_hash_functor { __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto result = map.find(source_index); - auto target_index = result->second; + auto const target_index = *set.find(source_index); auto col = source; auto source_type = source.type(); From 166ed494d85ced274f1e813038d2d34dfcab7a70 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Jan 2024 15:06:22 -0800 Subject: [PATCH 02/16] Formatting --- cpp/src/groupby/hash/groupby.cu | 79 +++++++++++---------- cpp/src/groupby/hash/groupby_kernels.cuh | 3 +- cpp/src/groupby/hash/multi_pass_kernels.cuh | 2 +- 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 59c416e2892..6e83a081130 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,10 @@ #include #include #include -#include #include #include - #include #include @@ -71,24 +70,24 @@ namespace detail { namespace hash { namespace { -int constexpr cg_size = 1; ///< Number of threads used to handle each input key -int constexpr window_size = 1; ///< Number of slots checked per thread -cudf::size_type constexpr key_sentinel = -1; 
///< Sentinel value indicating an empty slot +int constexpr cg_size = 1; ///< Number of threads used to handle each input key +int constexpr window_size = 1; ///< Number of slots checked per thread +cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot -using probing_scheme_type = cuco::experimental::linear_probing>; +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cudf::experimental::row::hash::device_row_hasher>; using allocator_type = rmm::mr::stream_allocator_adaptor>; template -using set_type = cuco::experimental::static_set< - cudf::size_type, - cuco::experimental::extent, - cuda::thread_scope_device, - ComparatorType, - probing_scheme_type, - allocator_type, - cuco::experimental::storage>; +using set_type = cuco::experimental::static_set, + cuda::thread_scope_device, + ComparatorType, + probing_scheme_type, + allocator_type, + cuco::experimental::storage>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -508,16 +507,16 @@ void compute_single_pass_aggs(table_view const& keys, ? cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first : rmm::device_buffer{}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{ - set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{set, + *d_values, + *d_sparse_table, + d_aggs.data(), + static_cast(row_bitmask.data()), + skip_key_rows_with_nulls}); // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -591,17 +590,23 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = set_type{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs( - keys, requests, &sparse_results, set.ref(cuco::experimental::insert_and_find), keys_have_nulls, include_null_keys, stream); + auto const set = + set_type{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + compute_single_pass_aggs(keys, + requests, + &sparse_results, + set.ref(cuco::experimental::insert_and_find), + keys_have_nulls, + include_null_keys, + stream); // Extract the populated indices from the hash map and create a gather map. // Gathering using this map from sparse results will give dense results. 
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 1ffe870eeb4..a6a09435589 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -107,8 +107,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row( - output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 51432ed5de8..7043eafdc10 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -96,7 +96,7 @@ struct var_hash_functor { __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto const target_index = *set.find(source_index); + auto const target_index = *set.find(source_index); auto col = source; auto source_type = source.type(); From b1db24397ffbc9533464fcdf1592e945f936773a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 22 Jan 2024 16:11:29 -0800 Subject: [PATCH 03/16] Minor cleanups --- cpp/src/groupby/hash/groupby.cu | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 6e83a081130..c5a0f30bf77 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -71,7 +71,6 @@ namespace hash { namespace { int constexpr cg_size = 1; ///< Number of threads used to handle each input key -int constexpr window_size = 1; ///< Number of slots checked per thread cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot using probing_scheme_type = cuco::experimental::linear_probing< @@ -80,15 +79,6 
@@ using probing_scheme_type = cuco::experimental::linear_probing< cudf::nullate::DYNAMIC>>; using allocator_type = rmm::mr::stream_allocator_adaptor>; -template -using set_type = cuco::experimental::static_set, - cuda::thread_scope_device, - ComparatorType, - probing_scheme_type, - allocator_type, - cuco::experimental::storage>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -591,13 +581,13 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = - set_type{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; + cuco::experimental::static_set{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + allocator_type{default_allocator{}, stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, From ca6829d62d926565e2fef8a01ae7eaa8b3855bb3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 12:44:56 -0800 Subject: [PATCH 04/16] Update cuco code --- cpp/src/groupby/hash/groupby.cu | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c5a0f30bf77..1718848ae7e 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,7 +48,6 @@ #include #include -#include #include @@ -73,11 +72,10 @@ namespace { int constexpr cg_size = 1; ///< Number of threads used to handle each input key cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot -using probing_scheme_type = cuco::experimental::linear_probing< +using probing_scheme_type = cuco::linear_probing< cg_size, cudf::experimental::row::hash::device_row_hasher>; -using allocator_type = rmm::mr::stream_allocator_adaptor>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -580,20 +578,21 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = - cuco::experimental::static_set{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; + auto const set = cuco::static_set{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + cuco::thread_scope_device, + cuco::storage<1>{}, + cudf::detail::cuco_allocator{stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, requests, &sparse_results, - set.ref(cuco::experimental::insert_and_find), + set.ref(cuco::insert_and_find), keys_have_nulls, include_null_keys, stream); @@ -608,7 +607,7 @@ std::unique_ptr
groupby(table_view const& keys, &sparse_results, cache, gather_map, - set.ref(cuco::experimental::find), + set.ref(cuco::find), keys_have_nulls, include_null_keys, stream, From 0c10a0b875dc28fa5e2a9bef49ef7a27ef8c1b2a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:09:12 -0800 Subject: [PATCH 05/16] Add CUCO_CUDF_SIZE_TYPE_SENTINEL --- cpp/include/cudf/detail/cuco_helpers.hpp | 6 ++++++ cpp/src/groupby/hash/groupby.cu | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index 5f3c31479de..a030354fb3c 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -16,11 +16,17 @@ #pragma once +#include + #include #include namespace cudf::detail { +/// Sentinel value for `cudf::size_type` +cudf::size_type constexpr CUCO_CUDF_SIZE_TYPE_SENTINEL = -1; +// TODO: is it a mouthful? Maybe `CUCO_SIZE_TYPE_SENTINEL`? + /** * @brief Stream-ordered allocator adaptor used for cuco data structures * diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 1718848ae7e..821e760704a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -69,11 +69,8 @@ namespace detail { namespace hash { namespace { -int constexpr cg_size = 1; ///< Number of threads used to handle each input key -cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot - using probing_scheme_type = cuco::linear_probing< - cg_size, + 1, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher>; @@ -580,7 +577,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = cuco::static_set{num_keys, 0.5, // desired load factor - cuco::empty_key{key_sentinel}, + cuco::empty_key{cudf::detail::CUCO_CUDF_SIZE_TYPE_SENTINEL}, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, From 2470c684f15ce4d0f8aeeba3927540887baf67af Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:26:47 -0800 Subject: [PATCH 06/16] Header cleanups --- cpp/src/groupby/hash/groupby.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 821e760704a..ec3b7fc2760 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -30,15 +29,11 @@ #include #include #include -#include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -51,13 +46,8 @@ #include -#include #include #include -#include - -#include -#include #include #include From 7da8c5561de48f046665db644c3de8b26ec2817c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:42:53 -0800 Subject: [PATCH 07/16] Update docs --- cpp/src/groupby/hash/groupby.cu | 4 ++-- cpp/src/groupby/hash/groupby_kernels.cuh | 30 +++++++++--------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ec3b7fc2760..ad5f43342d4 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -561,7 +561,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const d_row_hash = row_hash.device_hasher(has_null); // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash map + // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { @@ -584,7 +584,7 @@ std::unique_ptr
groupby(table_view const& keys, include_null_keys, stream); - // Extract the populated indices from the hash map and create a gather map. + // Extract the populated indices from the hash set and create a gather map. // Gathering using this map from sparse results will give dense results. auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index a6a09435589..a8e02266593 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -30,30 +30,22 @@ namespace detail { namespace hash { /** * @brief Compute single-pass aggregations and store results into a sparse - * `output_values` table, and populate `map` with indices of unique keys + * `output_values` table, and populate `set` with indices of unique keys * - * The hash map is built by inserting every row `i` from the `keys` and - * `values` tables as a single (key,value) pair. When the pair is inserted, if - * the key was not already present in the map, then the corresponding value is - * simply copied to the output. If the key was already present in the map, - * then the inserted `values` row is aggregated with the existing row. This - * aggregation is done for every element `j` in the row by applying aggregation - * operation `j` between the new and existing element. + * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If + * the index was not present in the set, insert the index and then copy it to the output. If the + * key was already present in the set, then the inserted index is aggregated with the existing row. + * This aggregation is done for every element `j` in the row by applying aggregation operation `j` + * between the new and existing element. * * Instead of storing the entire rows from `input_keys` and `input_values` in - * the hashmap, we instead store the row indices. 
For example, when inserting - * row at index `i` from `input_keys` into the hash map, the value `i` is what - * gets stored for the hash map's "key". It is assumed the `map` was constructed + * the hashset, we instead store the row indices. For example, when inserting + * row at index `i` from `input_keys` into the hash set, the value `i` is what + * gets stored for the hash set's "key". It is assumed the `set` was constructed * with a custom comparator that uses these row indices to check for equality * between key rows. For example, comparing two keys `k0` and `k1` will compare * the two rows `input_keys[k0] ?= input_keys[k1]` * - * Likewise, we store the row indices for the hash maps "values". These indices - * index into the `output_values` table. For a given key `k` (which is an index - * into `input_keys`), the corresponding value `v` indexes into `output_values` - * and stores the result of aggregating rows from `input_values` from rows of - * `input_keys` equivalent to the row at `k`. - * * The exact size of the result is not known a priori, but can be upper bounded * by the number of rows in `input_keys` & `input_values`. Therefore, it is * assumed `output_values` has sufficient storage for an equivalent number of @@ -74,9 +66,9 @@ struct compute_single_pass_aggs_fn { /** * @brief Construct a new compute_single_pass_aggs_fn functor object * - * @param set_ref Hash map object to insert key,value pairs into. + * @param set_ref Hash set object to insert key,value pairs into. * @param input_values The table whose rows will be aggregated in the values - * of the hash map + * of the hash set * @param output_values Table that stores the results of aggregating rows of * `input_values`. 
* @param aggs The set of aggregation operations to perform across the From 7dd59a6c10a88eee37a33304b2bf4d53ec686916 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:44:43 -0800 Subject: [PATCH 08/16] Minor doc updates --- cpp/src/groupby/hash/groupby_kernels.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index a8e02266593..f4d2e63af4e 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -29,8 +29,8 @@ namespace groupby { namespace detail { namespace hash { /** - * @brief Compute single-pass aggregations and store results into a sparse - * `output_values` table, and populate `set` with indices of unique keys + * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, + * and populate `set` with indices of unique keys * * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If * the index was not present in the set, insert they index and then copy it to the output. If the From 3cbdb7c4b7fa4e5de684c79be105eefe1ae37d0b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:53:25 -0800 Subject: [PATCH 09/16] Add peak memory usage metrics to groupby NV bencmarks --- cpp/benchmarks/groupby/group_max.cpp | 7 ++++++- cpp/benchmarks/groupby/group_nunique.cpp | 7 ++++++- cpp/benchmarks/groupby/group_rank.cpp | 7 ++++++- cpp/benchmarks/groupby/group_struct_keys.cpp | 9 +++++++-- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index e65c37f001d..b7b330f02e5 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include @@ -50,9 +51,13 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_max, diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 63d738b2951..8206c739e01 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include +#include #include @@ -63,9 +64,13 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list)) diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index 2122720a421..c00a2cef937 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include @@ -53,11 +54,15 @@ static void nvbench_groupby_rank(nvbench::state& state, cudf::groupby::groupby gb_obj( keys, cudf::null_policy::EXCLUDE, is_sorted ? 
cudf::sorted::YES : cudf::sorted::NO); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; // groupby scan uses sort implementation auto result = gb_obj.scan(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } enum class rank_method : int32_t {}; diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 44a12c1c30e..cadd9c2d137 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include @@ -80,11 +81,15 @@ void bench_groupby_struct_keys(nvbench::state& state) requests[0].aggregations.push_back(cudf::make_min_aggregation()); // Set up nvbench default stream - auto stream = cudf::get_default_stream(); + auto const mem_stats_logger = cudf::memory_stats_logger(); + auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH(bench_groupby_struct_keys) From 82aa0ce00b9748ae2d71aaf2a72cd45fdf0285dd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 14:09:51 -0800 Subject: [PATCH 10/16] Revert some benchmark changes --- cpp/benchmarks/groupby/group_nunique.cpp | 7 +------ cpp/benchmarks/groupby/group_rank.cpp | 7 +------ 2 files changed, 2 insertions(+), 12 
deletions(-) diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 8206c739e01..63d738b2951 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include @@ -64,13 +63,9 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); - auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); - - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list)) diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index c00a2cef937..2122720a421 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ #include -#include #include #include @@ -54,15 +53,11 @@ static void nvbench_groupby_rank(nvbench::state& state, cudf::groupby::groupby gb_obj( keys, cudf::null_policy::EXCLUDE, is_sorted ? 
cudf::sorted::YES : cudf::sorted::NO); - auto const mem_stats_logger = cudf::memory_stats_logger(); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; // groupby scan uses sort implementation auto result = gb_obj.scan(requests); }); - - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } enum class rank_method : int32_t {}; From 4193c759f439b92b31faae33b87bec835bf0a2bf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 12:36:05 -0800 Subject: [PATCH 11/16] Fix pytests --- python/cudf/cudf/tests/test_groupby.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e8dbdd35352..232fc057702 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -55,12 +55,12 @@ def assert_groupby_results_equal( if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): expect = expect.sort_values(by=by).reset_index(drop=True) else: - expect = expect.sort_values().reset_index(drop=True) + expect = expect.sort_values(by=by).reset_index(drop=True) if isinstance(got, cudf.DataFrame): got = got.sort_values(by=by).reset_index(drop=True) else: - got = got.sort_values().reset_index(drop=True) + got = got.sort_values(by=by).reset_index(drop=True) assert_eq(expect, got, **kwargs) @@ -179,7 +179,7 @@ def test_groupby_agg_min_max_dictlist(nelem): def test_groupby_as_index_single_agg(pdf, gdf, as_index): gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") @pytest.mark.parametrize("engine", ["cudf", "jit"]) @@ -192,7 +192,7 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine): if PANDAS_GE_220: 
kwargs["include_groups"] = False pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs) - assert_groupby_results_equal(pdf, gdf) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") @pytest.mark.parametrize("as_index", [True, False]) @@ -3764,7 +3764,13 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` assert_groupby_results_equal( - actual, expected, check_names=False, check_index_type=False + actual, + expected, + check_names=False, + check_index_type=False, + as_index=as_index, + by=["gender", "education"], + sort=sort, ) From 574f628c36fa69b8658d7478ea40da17f019b4a0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 12:42:02 -0800 Subject: [PATCH 12/16] Renaming --- cpp/include/cudf/detail/cuco_helpers.hpp | 3 +-- cpp/src/groupby/hash/groupby.cu | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index a030354fb3c..11a6f9a909b 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -24,8 +24,7 @@ namespace cudf::detail { /// Sentinel value for `cudf::size_type` -cudf::size_type constexpr CUCO_CUDF_SIZE_TYPE_SENTINEL = -1; -// TODO: is it a mouthful? Maybe `CUCO_SIZE_TYPE_SENTINEL`? +cudf::size_type constexpr CUDF_SIZE_TYPE_SENTINEL = -1; /** * @brief Stream-ordered allocator adaptor used for cuco data structures diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 458821cffae..dcb40edbddc 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -567,7 +567,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = cuco::static_set{num_keys, 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUCO_CUDF_SIZE_TYPE_SENTINEL}, + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, From 75a8e6461accd2434e9b3311c8c3c01691bc4a93 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 16:36:17 -0800 Subject: [PATCH 13/16] Fix several docstring tests --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 28 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 89abd7be0ba..0a90b535040 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7693,7 +7693,7 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 - >>> df.value_counts() + >>> df.value_counts(sort=True) num_legs num_wings 4 0 2 2 2 1 diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a236a9b6abf..1bbacb29a1e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -109,11 +109,11 @@ def _is_row_of(chunk, obj): Parrot 30.0 Parrot 20.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(level=0).mean() +>>> ser.groupby(level=0, sort=True).mean() Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(ser > 100).mean() +>>> ser.groupby(ser > 100, sort=True).mean() Max Speed False 25.0 True 370.0 @@ -133,7 +133,7 @@ def _is_row_of(chunk, obj): 1 Falcon 370.0 2 Parrot 24.0 3 Parrot 26.0 ->>> df.groupby(['Animal']).mean() +>>> df.groupby(['Animal'], sort=True).mean() Max Speed Animal Falcon 375.0 @@ -151,22 +151,22 @@ def _is_row_of(chunk, obj): Wild 350.0 Parrot Captive 30.0 Wild 20.0 ->>> df.groupby(level=0).mean() +>>> df.groupby(level=0, 
sort=True).mean() Max Speed Animal Falcon 370.0 Parrot 25.0 ->>> df.groupby(level="Type").mean() +>>> df.groupby(level="Type", sort=True).mean() Max Speed Type -Wild 185.0 Captive 210.0 +Wild 185.0 >>> df = cudf.DataFrame({{'A': 'a a b'.split(), ... 'B': [1,2,3], ... 'C': [4,6,5]}}) ->>> g1 = df.groupby('A', group_keys=False) ->>> g2 = df.groupby('A', group_keys=True) +>>> g1 = df.groupby('A', group_keys=False, sort=True) +>>> g2 = df.groupby('A', group_keys=True, sort=True) Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only differ in their ``group_keys`` argument. Calling `apply` in various ways, @@ -539,11 +539,11 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg('sum') + >>> a.groupby('a', sort=True).agg('sum') b c a - 2 3 1 1 3 4 + 2 3 1 Specifying a list of aggregations to perform on each column. @@ -553,12 +553,12 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg(['sum', 'min']) + >>> a.groupby('a', sort=True).agg(['sum', 'min']) b c sum min sum min a - 2 3 3 1 1 1 3 1 4 2 + 2 3 3 1 1 Using a dict to specify aggregations to perform per column. @@ -568,12 +568,12 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg({'a': 'max', 'b': ['min', 'mean']}) + >>> a.groupby('a', sort=True).agg({'a': 'max', 'b': ['min', 'mean']}) a b max min mean a - 2 2 3 3.0 1 1 1 1.5 + 2 2 3 3.0 Using lambdas/callables to specify aggregations taking parameters. 
From 85a47db938b8ffc5e7dd9646eac38272458c37d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 23 Feb 2024 12:57:50 -0800 Subject: [PATCH 14/16] Make value_counts docstring test deterministic --- python/cudf/cudf/core/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0a90b535040..8f22b990443 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7693,10 +7693,10 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 - >>> df.value_counts(sort=True) + >>> df.value_counts().sort_index() num_legs num_wings - 4 0 2 2 2 1 + 4 0 2 6 0 1 Name: count, dtype: int64 """ From f79f1d6a08eb6bb6ce68cfcbea43544ddcb2a556 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Feb 2024 17:01:27 -0800 Subject: [PATCH 15/16] Update docs --- docs/cudf/source/user_guide/pandas-comparison.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 03ce58ea9e3..549d91b771a 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -87,7 +87,7 @@ using `.from_arrow()` or `.from_pandas()`. ## Result ordering -By default, `join` (or `merge`) and `groupby` operations in cuDF +By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF do *not* guarantee output ordering. 
Compare the results obtained from Pandas and cuDF below: From 56a222996273eb229d9aa0fd8a914012bcc21592 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Feb 2024 09:10:48 -0800 Subject: [PATCH 16/16] Add TODO reminder for future performance tuning --- cpp/src/groupby/hash/groupby.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index dcb40edbddc..acc1b087510 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -59,6 +59,8 @@ namespace detail { namespace hash { namespace { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1` for flat data to improve performance using probing_scheme_type = cuco::linear_probing< 1, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher