From 1d1a35bd327650742e51c8cc33bc1c9e659006a4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 26 Oct 2021 10:16:12 -0600
Subject: [PATCH 01/48] Add condition to fallback to sort-based aggregates if
 the input values column has structs

---
 cpp/src/groupby/groupby.cu | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 7ac7c199b05..69d7f5860db 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -67,13 +67,21 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
 {
   using namespace cudf::structs::detail;
 
+  // Currently, structs are not supported in hash-based aggregates.
+  // Therefore, if any request contains structs then we must fallback to sort-based aggregates.
+  // TODO: Support structs in hash-based aggregates.
+  auto const has_struct =
+    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+      return r.values.type().id() == type_id::STRUCT;
+    });
+
   // If sort groupby has been called once on this groupby object, then
   // always use sort groupby from now on. Because once keys are sorted,
   // all the aggs that can be done by hash groupby are efficiently done by
   // sort groupby as well.
   // Only use hash groupby if the keys aren't sorted and all requests can be
   // satisfied with a hash implementation
-  if (_keys_are_sorted == sorted::NO and not _helper and
+  if (_keys_are_sorted == sorted::NO and not _helper and (not has_struct) and
       detail::hash::can_use_hash_groupby(_keys, requests)) {
     // Optionally flatten nested key columns.
     auto flattened             = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);

From f36abed8820d5e6a9877e82d20772f22717b7ffa Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 26 Oct 2021 13:32:12 -0600
Subject: [PATCH 02/48] Rename function

---
 cpp/src/groupby/sort/group_single_pass_reduction_util.cuh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index db2ae5b5d8e..59748110a83 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -124,7 +124,7 @@ struct null_replaced_value_accessor : value_accessor<T> {
 template <aggregation::Kind K>
 struct reduce_functor {
   template <typename T>
-  static constexpr bool is_supported()
+  static constexpr bool is_natively_supported()
   {
     switch (K) {
       case aggregation::SUM:
@@ -140,7 +140,7 @@ struct reduce_functor {
   }
 
   template <typename T>
-  std::enable_if_t<is_supported<T>(), std::unique_ptr<column>> operator()(
+  std::enable_if_t<is_natively_supported<T>(), std::unique_ptr<column>> operator()(
     column_view const& values,
     size_type num_groups,
     cudf::device_span<size_type const> group_labels,
@@ -207,7 +207,8 @@ struct reduce_functor {
   }
 
   template <typename T, typename... Args>
-  std::enable_if_t<not is_supported<T>(), std::unique_ptr<column>> operator()(Args&&... args)
+  std::enable_if_t<not is_natively_supported<T>(), std::unique_ptr<column>> operator()(
+    Args&&... args)
   {
     CUDF_FAIL("Unsupported type-agg combination");
   }

From b0b45355f8034779ec2165c75ad7bb72ee7e5a00 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 26 Oct 2021 15:02:13 -0600
Subject: [PATCH 03/48] Implement argmin/argmax for structs

---
 .../sort/group_single_pass_reduction_util.cuh | 81 ++++++++++++++++++-
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 59748110a83..98894ee4370 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -21,7 +21,9 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/table/row_operators.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -172,7 +174,7 @@ struct reduce_functor {
                             thrust::make_counting_iterator<ResultType>(0),
                             thrust::make_discard_iterator(),
                             resultview->begin<ResultType>(),
-                            thrust::equal_to<ResultType>{},
+                            thrust::equal_to<size_type>{},
                             OpType{*valuesview});
     } else {
       auto init  = OpType::template identity<DeviceType>();
@@ -206,9 +208,82 @@ struct reduce_functor {
     return result;
   }
 
+  template <typename T>
+  std::enable_if_t<not is_natively_supported<T>() and std::is_same_v<T, struct_view> and
+                     (K == aggregation::ARGMIN or K == aggregation::ARGMAX),
+                   std::unique_ptr<column>>
+  operator()(column_view const& values,
+             size_type num_groups,
+             cudf::device_span<size_type const> group_labels,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr)
+  {
+    using ResultType = cudf::detail::target_type_t<T, K>;
+    auto result      = make_fixed_width_column(
+      data_type{type_to_id<ResultType>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
+
+    if (values.is_empty()) { return result; }
+
+    // The comparison and null orders for finding arg_min/arg_max for the min/max elements.
+    auto const comp_order      = K == aggregation::ARGMIN ? order::ASCENDING : order::DESCENDING;
+    auto const null_precedence = K == aggregation::ARGMIN ? null_order::AFTER : null_order::BEFORE;
+
+    auto const flattened_values =
+      structs::detail::flatten_nested_columns(table_view{{values}},
+                                              {comp_order},
+                                              {null_precedence},
+                                              structs::detail::column_nullability::MATCH_INCOMING);
+    auto const values_ptr = table_device_view::create(flattened_values, stream);
+
+    // Perform reduction to find arg_min/arg_max.
+    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
+      thrust::reduce_by_key(rmm::exec_policy(stream),
+                            group_labels.data(),
+                            group_labels.data() + group_labels.size(),
+                            inp_iter,
+                            thrust::make_discard_iterator(),
+                            out_iter,
+                            thrust::equal_to<size_type>{},
+                            comp);
+    };
+
+    auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
+    auto const result_begin = result->mutable_view().template begin<ResultType>();
+    if (!values.has_nulls()) {
+      auto const comp = row_lexicographic_comparator<false>(*values_ptr,
+                                                            *values_ptr,
+                                                            flattened_values.orders().data(),
+                                                            flattened_values.null_orders().data());
+      do_reduction(count_iter, result_begin, comp);
+    } else {
+      auto const comp = row_lexicographic_comparator<true>(*values_ptr,
+                                                           *values_ptr,
+                                                           flattened_values.orders().data(),
+                                                           flattened_values.null_orders().data());
+      do_reduction(count_iter, result_begin, comp);
+
+      // Generate bitmask for the output from the input.
+      auto const values_ptr = column_device_view::create(values, stream);
+      auto validity         = rmm::device_uvector<bool>(num_groups, stream);
+      do_reduction(cudf::detail::make_validity_iterator(*values_ptr),
+                   validity.begin(),
+                   thrust::logical_or<bool>{});
+
+      auto [null_mask, null_count] = cudf::detail::valid_if(
+        validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
+      result->set_null_mask(std::move(null_mask));
+      result->set_null_count(null_count);
+    }
+
+    return result;
+  }
+
   template <typename T, typename... Args>
-  std::enable_if_t<not is_natively_supported<T>(), std::unique_ptr<column>> operator()(
-    Args&&... args)
+  std::enable_if_t<not is_natively_supported<T>() and
+                     (not std::is_same_v<T, struct_view> or
+                      (K != aggregation::ARGMIN or K != aggregation::ARGMAX)),
+                   std::unique_ptr<column>>
+  operator()(Args&&... args)
   {
     CUDF_FAIL("Unsupported type-agg combination");
   }

From 6387ae25115459c09fd3e2bda350d5f5ac62f1e8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 10:52:30 -0600
Subject: [PATCH 04/48] Add comments and  cleanup

---
 .../sort/group_single_pass_reduction_util.cuh | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 98894ee4370..812647fbb93 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -208,6 +208,10 @@ struct reduce_functor {
     return result;
   }
 
+  // This specialization handles the cases when the input values type:
+  //  - Is not natively supported, and
+  //  - Is struct type, and
+  //  - Aggregation is either ARGMIN or ARGMAX.
   template <typename T>
   std::enable_if_t<not is_natively_supported<T>() and std::is_same_v<T, struct_view> and
                      (K == aggregation::ARGMIN or K == aggregation::ARGMAX),
@@ -218,13 +222,15 @@ struct reduce_functor {
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr)
   {
+    // This is be expected to be size_type.
     using ResultType = cudf::detail::target_type_t<T, K>;
-    auto result      = make_fixed_width_column(
+
+    auto result = make_fixed_width_column(
       data_type{type_to_id<ResultType>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
 
     if (values.is_empty()) { return result; }
 
-    // The comparison and null orders for finding arg_min/arg_max for the min/max elements.
+    // The comparison order and null order for finding ARGMIN/ARGMAX.
     auto const comp_order      = K == aggregation::ARGMIN ? order::ASCENDING : order::DESCENDING;
     auto const null_precedence = K == aggregation::ARGMIN ? null_order::AFTER : null_order::BEFORE;
 
@@ -233,9 +239,9 @@ struct reduce_functor {
                                               {comp_order},
                                               {null_precedence},
                                               structs::detail::column_nullability::MATCH_INCOMING);
-    auto const values_ptr = table_device_view::create(flattened_values, stream);
+    auto const flattened_values_ptr = table_device_view::create(flattened_values, stream);
 
-    // Perform reduction to find arg_min/arg_max.
+    // Perform segmented reduction to find ARGMIN/ARGMAX.
     auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
       thrust::reduce_by_key(rmm::exec_policy(stream),
                             group_labels.data(),
@@ -250,19 +256,19 @@ struct reduce_functor {
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (!values.has_nulls()) {
-      auto const comp = row_lexicographic_comparator<false>(*values_ptr,
-                                                            *values_ptr,
+      auto const comp = row_lexicographic_comparator<false>(*flattened_values_ptr,
+                                                            *flattened_values_ptr,
                                                             flattened_values.orders().data(),
                                                             flattened_values.null_orders().data());
       do_reduction(count_iter, result_begin, comp);
     } else {
-      auto const comp = row_lexicographic_comparator<true>(*values_ptr,
-                                                           *values_ptr,
+      auto const comp = row_lexicographic_comparator<true>(*flattened_values_ptr,
+                                                           *flattened_values_ptr,
                                                            flattened_values.orders().data(),
                                                            flattened_values.null_orders().data());
       do_reduction(count_iter, result_begin, comp);
 
-      // Generate bitmask for the output from the input.
+      // Generate bitmask for the output by segmented reduction of the input bitmask.
       auto const values_ptr = column_device_view::create(values, stream);
       auto validity         = rmm::device_uvector<bool>(num_groups, stream);
       do_reduction(cudf::detail::make_validity_iterator(*values_ptr),
@@ -278,6 +284,10 @@ struct reduce_functor {
     return result;
   }
 
+  // Throw exception if the input values type:
+  //  - Is not natively supported, or
+  //  - Is not struct type, or
+  //  - Is struct type but aggregation is not neither ARGMIN nor ARGMAX.
   template <typename T, typename... Args>
   std::enable_if_t<not is_natively_supported<T>() and
                      (not std::is_same_v<T, struct_view> or

From fddbac9388225ae966b681b2bed76565b54d15f6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 16:22:18 -0600
Subject: [PATCH 05/48] Cleanup

---
 .../sort/group_single_pass_reduction_util.cuh | 49 +++++++++++++------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 812647fbb93..5efa64f5c27 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -77,6 +77,34 @@ struct ArgMax {
   }
 };
 
+/**
+ * @brief Binary operator ArgMin/ArgMax with index values into the input table.
+ */
+template <bool has_nulls, bool arg_min>
+struct ArgMinMax {
+  size_type const num_rows;
+  row_lexicographic_comparator<has_nulls> const comp;
+
+  ArgMinMax(size_type const num_rows_, table_device_view const& table_)
+    : num_rows(num_rows_), comp(table_, table_)
+  {
+  }
+
+  CUDA_DEVICE_CALLABLE auto operator()(size_type lhs_idx, size_type rhs_idx) const
+  {
+    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
+    // github.com/NVIDIA/thrust/issues/1525
+    // where invalid random values may be passed here by thrust::reduce_by_key
+    if (lhs_idx < 0 || lhs_idx >= num_rows) { return rhs_idx; }
+    if (rhs_idx < 0 || rhs_idx >= num_rows) { return lhs_idx; }
+
+    // Return `lhs_idx` iff:
+    //   row(lhs_idx) <  row(rhs_idx) and finding ArgMin, or
+    //   row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
+    return comp(lhs_idx, rhs_idx) == arg_min ? lhs_idx : rhs_idx;
+  }
+};
+
 /**
  * @brief Value accessor for column which supports dictionary column too.
  *
@@ -230,15 +258,8 @@ struct reduce_functor {
 
     if (values.is_empty()) { return result; }
 
-    // The comparison order and null order for finding ARGMIN/ARGMAX.
-    auto const comp_order      = K == aggregation::ARGMIN ? order::ASCENDING : order::DESCENDING;
-    auto const null_precedence = K == aggregation::ARGMIN ? null_order::AFTER : null_order::BEFORE;
-
     auto const flattened_values =
-      structs::detail::flatten_nested_columns(table_view{{values}},
-                                              {comp_order},
-                                              {null_precedence},
-                                              structs::detail::column_nullability::MATCH_INCOMING);
+      structs::detail::flatten_nested_columns(table_view{{values}}, {}, {});
     auto const flattened_values_ptr = table_device_view::create(flattened_values, stream);
 
     // Perform segmented reduction to find ARGMIN/ARGMAX.
@@ -256,16 +277,12 @@ struct reduce_functor {
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (!values.has_nulls()) {
-      auto const comp = row_lexicographic_comparator<false>(*flattened_values_ptr,
-                                                            *flattened_values_ptr,
-                                                            flattened_values.orders().data(),
-                                                            flattened_values.null_orders().data());
+      auto const comp =
+        ArgMinMax<false, K == aggregation::ARGMIN>(values.size(), *flattened_values_ptr);
       do_reduction(count_iter, result_begin, comp);
     } else {
-      auto const comp = row_lexicographic_comparator<true>(*flattened_values_ptr,
-                                                           *flattened_values_ptr,
-                                                           flattened_values.orders().data(),
-                                                           flattened_values.null_orders().data());
+      auto const comp =
+        ArgMinMax<true, K == aggregation::ARGMIN>(values.size(), *flattened_values_ptr);
       do_reduction(count_iter, result_begin, comp);
 
       // Generate bitmask for the output by segmented reduction of the input bitmask.

From b86665ee298e4a38886b9c5021bc2a91ece38dd4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 17:59:47 -0600
Subject: [PATCH 06/48] Simplify code

---
 .../sort/group_single_pass_reduction_util.cuh | 135 +++++++++---------
 1 file changed, 65 insertions(+), 70 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 5efa64f5c27..0f798fa6ba7 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -40,48 +40,44 @@ namespace groupby {
 namespace detail {
 
 /**
- * @brief ArgMin binary operator with index values into input column.
+ * @brief Binary operator with index values into the input column.
  *
  * @tparam T Type of the underlying column. Must support '<' operator.
  */
-template <typename T>
-struct ArgMin {
+template <typename T, bool has_nulls, bool arg_min, typename Enable = void>
+struct ArgMinMax {
   column_device_view const d_col;
-  CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
+  CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
   {
     // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
     // github.com/NVIDIA/thrust/issues/1525
     // where invalid random values may be passed here by thrust::reduce_by_key
-    if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
-    if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
-    return d_col.element<T>(lhs) < d_col.element<T>(rhs) ? lhs : rhs;
-  }
-};
+    if (lhs_idx < 0 || lhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(lhs_idx))) {
+      return rhs_idx;
+    }
+    if (rhs_idx < 0 || rhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(rhs_idx))) {
+      return lhs_idx;
+    }
 
-/**
- * @brief ArgMax binary operator with index values into input column.
- *
- * @tparam T Type of the underlying column. Must support '<' operator.
- */
-template <typename T>
-struct ArgMax {
-  column_device_view const d_col;
-  CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const
-  {
-    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
-    // github.com/NVIDIA/thrust/issues/1525
-    // where invalid random values may be passed here by thrust::reduce_by_key
-    if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
-    if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
-    return d_col.element<T>(rhs) < d_col.element<T>(lhs) ? lhs : rhs;
+    // Return `lhs_idx` iff:
+    //   row(lhs_idx) <  row(rhs_idx) and finding ArgMin, or
+    //   row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
+    auto const less = d_col.element<T>(lhs_idx) < d_col.element<T>(rhs_idx);
+    return less == arg_min ? lhs_idx : rhs_idx;
   }
 };
 
 /**
  * @brief Binary operator ArgMin/ArgMax with index values into the input table.
+ *
+ * @tparam T Type of the underlying data. This is the fallback for the cases when T does not support
+ * '<' operator.
  */
-template <bool has_nulls, bool arg_min>
-struct ArgMinMax {
+template <typename T, bool has_nulls, bool arg_min>
+struct ArgMinMax<T,
+                 has_nulls,
+                 arg_min,
+                 std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>> {
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;
 
@@ -178,7 +174,6 @@ struct reduce_functor {
     rmm::mr::device_memory_resource* mr)
   {
     using DeviceType  = device_storage_type_t<T>;
-    using OpType      = cudf::detail::corresponding_operator_t<K>;
     using ResultType  = cudf::detail::target_type_t<T, K>;
     using ResultDType = device_storage_type_t<ResultType>;
 
@@ -191,43 +186,44 @@ struct reduce_functor {
 
     if (values.is_empty()) { return result; }
 
-    auto resultview = mutable_column_device_view::create(result->mutable_view(), stream);
-    auto valuesview = column_device_view::create(values, stream);
-    if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
-      using OpType =
-        std::conditional_t<(K == aggregation::ARGMAX), ArgMax<DeviceType>, ArgMin<DeviceType>>;
+    // Perform segmented reduction.
+    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
       thrust::reduce_by_key(rmm::exec_policy(stream),
                             group_labels.data(),
                             group_labels.data() + group_labels.size(),
-                            thrust::make_counting_iterator<ResultType>(0),
+                            inp_iter,
                             thrust::make_discard_iterator(),
-                            resultview->begin<ResultType>(),
+                            out_iter,
                             thrust::equal_to<size_type>{},
-                            OpType{*valuesview});
+                            comp);
+    };
+
+    auto const d_values_ptr = column_device_view::create(values, stream);
+    auto const result_begin = result->mutable_view().template begin<ResultDType>();
+
+    if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
+      auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
+      if (values.has_nulls()) {
+        using OpType = ArgMinMax<T, true, K == aggregation::ARGMIN>;
+        do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
+      } else {
+        using OpType = ArgMinMax<T, false, K == aggregation::ARGMIN>;
+        do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
+      }
     } else {
-      auto init  = OpType::template identity<DeviceType>();
-      auto begin = cudf::detail::make_counting_transform_iterator(
-        0, null_replaced_value_accessor{*valuesview, init, values.has_nulls()});
-      thrust::reduce_by_key(rmm::exec_policy(stream),
-                            group_labels.data(),
-                            group_labels.data() + group_labels.size(),
-                            begin,
-                            thrust::make_discard_iterator(),
-                            resultview->begin<ResultDType>(),
-                            thrust::equal_to<size_type>{},
-                            OpType{});
+      using OpType = cudf::detail::corresponding_operator_t<K>;
+      auto init    = OpType::template identity<DeviceType>();
+      auto begin   = cudf::detail::make_counting_transform_iterator(
+        0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()});
+      do_reduction(begin, result_begin, OpType{});
     }
 
     if (values.has_nulls()) {
       rmm::device_uvector<bool> validity(num_groups, stream);
-      thrust::reduce_by_key(rmm::exec_policy(stream),
-                            group_labels.data(),
-                            group_labels.data() + group_labels.size(),
-                            cudf::detail::make_validity_iterator(*valuesview),
-                            thrust::make_discard_iterator(),
-                            validity.begin(),
-                            thrust::equal_to<size_type>{},
-                            thrust::logical_or<bool>{});
+      do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
+                   validity.begin(),
+                   thrust::logical_or<bool>{});
+
       auto [null_mask, null_count] = cudf::detail::valid_if(
         validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
       result->set_null_mask(std::move(null_mask));
@@ -260,7 +256,7 @@ struct reduce_functor {
 
     auto const flattened_values =
       structs::detail::flatten_nested_columns(table_view{{values}}, {}, {});
-    auto const flattened_values_ptr = table_device_view::create(flattened_values, stream);
+    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
 
     // Perform segmented reduction to find ARGMIN/ARGMAX.
     auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
@@ -276,19 +272,15 @@ struct reduce_functor {
 
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
-    if (!values.has_nulls()) {
-      auto const comp =
-        ArgMinMax<false, K == aggregation::ARGMIN>(values.size(), *flattened_values_ptr);
-      do_reduction(count_iter, result_begin, comp);
-    } else {
-      auto const comp =
-        ArgMinMax<true, K == aggregation::ARGMIN>(values.size(), *flattened_values_ptr);
-      do_reduction(count_iter, result_begin, comp);
+    if (values.has_nulls()) {
+      auto const op =
+        ArgMinMax<T, true, K == aggregation::ARGMIN>(values.size(), *d_flattened_values_ptr);
+      do_reduction(count_iter, result_begin, op);
 
       // Generate bitmask for the output by segmented reduction of the input bitmask.
-      auto const values_ptr = column_device_view::create(values, stream);
-      auto validity         = rmm::device_uvector<bool>(num_groups, stream);
-      do_reduction(cudf::detail::make_validity_iterator(*values_ptr),
+      auto const d_values_ptr = column_device_view::create(values, stream);
+      auto validity           = rmm::device_uvector<bool>(num_groups, stream);
+      do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
                    validity.begin(),
                    thrust::logical_or<bool>{});
 
@@ -296,15 +288,18 @@ struct reduce_functor {
         validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
       result->set_null_mask(std::move(null_mask));
       result->set_null_count(null_count);
+    } else {
+      auto const op =
+        ArgMinMax<T, false, K == aggregation::ARGMIN>(values.size(), *d_flattened_values_ptr);
+      do_reduction(count_iter, result_begin, op);
     }
 
     return result;
   }
 
   // Throw exception if the input values type:
-  //  - Is not natively supported, or
-  //  - Is not struct type, or
-  //  - Is struct type but aggregation is not neither ARGMIN nor ARGMAX.
+  //  - Is not natively supported, and
+  //  - Is not struct type, or is struct type but aggregation is not neither ARGMIN nor ARGMAX.
   template <typename T, typename... Args>
   std::enable_if_t<not is_natively_supported<T>() and
                      (not std::is_same_v<T, struct_view> or

From 96683ac1d77f92a1b764d8a6b20b2641de0dd075 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 21:19:55 -0600
Subject: [PATCH 07/48] Fix null order

---
 .../sort/group_single_pass_reduction_util.cuh | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 0f798fa6ba7..b292c74f37f 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -22,6 +22,7 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/structs/utilities.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/types.hpp>
@@ -81,8 +82,10 @@ struct ArgMinMax<T,
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;
 
-  ArgMinMax(size_type const num_rows_, table_device_view const& table_)
-    : num_rows(num_rows_), comp(table_, table_)
+  ArgMinMax(size_type const num_rows_,
+            table_device_view const& table_,
+            null_order const* null_precedence)
+    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
   {
   }
 
@@ -254,9 +257,17 @@ struct reduce_functor {
 
     if (values.is_empty()) { return result; }
 
-    auto const flattened_values =
-      structs::detail::flatten_nested_columns(table_view{{values}}, {}, {});
+    // When finding ARGMIN, we need to consider nulls as larger than non-null elements.
+    // Thing is opposite for ARGMAX.
+    auto const null_precedence =
+      (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE;
+    auto const flattened_values = structs::detail::flatten_nested_columns(
+      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
     auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+    auto const flattened_null_precedences =
+      (K == aggregation::ARGMIN)
+        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+        : rmm::device_uvector<null_order>(0, stream);
 
     // Perform segmented reduction to find ARGMIN/ARGMAX.
     auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
@@ -273,8 +284,8 @@ struct reduce_functor {
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (values.has_nulls()) {
-      auto const op =
-        ArgMinMax<T, true, K == aggregation::ARGMIN>(values.size(), *d_flattened_values_ptr);
+      auto const op = ArgMinMax<T, true, K == aggregation::ARGMIN>(
+        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
 
       // Generate bitmask for the output by segmented reduction of the input bitmask.
@@ -289,8 +300,8 @@ struct reduce_functor {
       result->set_null_mask(std::move(null_mask));
       result->set_null_count(null_count);
     } else {
-      auto const op =
-        ArgMinMax<T, false, K == aggregation::ARGMIN>(values.size(), *d_flattened_values_ptr);
+      auto const op = ArgMinMax<T, false, K == aggregation::ARGMIN>(
+        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
     }
 

From b26cc9343cf856c0ff7bc0912b2d727af9d23215 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 21:28:58 -0600
Subject: [PATCH 08/48] Add unit tests

---
 cpp/tests/groupby/argmax_tests.cpp | 74 ++++++++++++++++++++++++-
 cpp/tests/groupby/argmin_tests.cpp | 74 ++++++++++++++++++++++++-
 cpp/tests/groupby/max_tests.cpp    | 88 +++++++++++++++++++++++++++++-
 cpp/tests/groupby/min_tests.cpp    | 88 +++++++++++++++++++++++++++++-
 4 files changed, 318 insertions(+), 6 deletions(-)

diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp
index 7cf693f7b08..0b06c184b75 100644
--- a/cpp/tests/groupby/argmax_tests.cpp
+++ b/cpp/tests/groupby/argmax_tests.cpp
@@ -32,7 +32,7 @@ struct groupby_argmax_test : public cudf::test::BaseFixture {
 };
 using K = int32_t;
 
-TYPED_TEST_CASE(groupby_argmax_test, cudf::test::FixedWidthTypes);
+TYPED_TEST_SUITE(groupby_argmax_test, cudf::test::FixedWidthTypes);
 
 TYPED_TEST(groupby_argmax_test, basic)
 {
@@ -182,6 +182,78 @@ TEST_F(groupby_dictionary_argmax_test, basic)
                   force_use_sort_impl::YES);
 }
 
+struct groupby_argmax_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_argmax_struct_test, basic)
+{
+  auto const keys = fixed_width_column_wrapper<int32_t>{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{0, 4, 2};
+
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
+TEST_F(groupby_argmax_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original = fixed_width_column_wrapper<int32_t>{
+    dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{
+      dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys           = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals           = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{0, 4, 2};
+
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
+TEST_F(groupby_argmax_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys =
+    fixed_width_column_wrapper<int32_t>{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}, no_nulls()};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{{0, 4, 2, null}, null_at(3)};
+
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf
 
diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp
index 915575546c9..67235a64066 100644
--- a/cpp/tests/groupby/argmin_tests.cpp
+++ b/cpp/tests/groupby/argmin_tests.cpp
@@ -32,7 +32,7 @@ struct groupby_argmin_test : public cudf::test::BaseFixture {
 };
 using K = int32_t;
 
-TYPED_TEST_CASE(groupby_argmin_test, cudf::test::FixedWidthTypes);
+TYPED_TEST_SUITE(groupby_argmin_test, cudf::test::FixedWidthTypes);
 
 TYPED_TEST(groupby_argmin_test, basic)
 {
@@ -183,5 +183,77 @@ TEST_F(groupby_dictionary_argmin_test, basic)
                   force_use_sort_impl::YES);
 }
 
+struct groupby_argmin_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_argmin_struct_test, basic)
+{
+  auto const keys = fixed_width_column_wrapper<int32_t>{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{3, 5, 7};
+
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
+TEST_F(groupby_argmin_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original = fixed_width_column_wrapper<int32_t>{
+    dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{
+      dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys           = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals           = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{3, 5, 7};
+
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
+TEST_F(groupby_argmin_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys =
+    fixed_width_column_wrapper<int32_t>{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys    = fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}, no_nulls()};
+  auto const expect_indices = fixed_width_column_wrapper<int32_t>{{3, 1, 8, null}, null_at(3)};
+
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 491e6927304..8d15401aa09 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -33,7 +33,7 @@ struct groupby_max_test : public cudf::test::BaseFixture {
 };
 
 using K = int32_t;
-TYPED_TEST_CASE(groupby_max_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
+TYPED_TEST_SUITE(groupby_max_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
 
 TYPED_TEST(groupby_max_test, basic)
 {
@@ -255,7 +255,7 @@ template <typename T>
 struct FixedPointTestBothReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
 
 TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue)
 {
@@ -304,5 +304,89 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue)
   }
 }
 
+struct groupby_max_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_max_struct_test, basic)
+{
+  auto const keys = fixed_width_column_wrapper<int32_t>{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"año", "zit", "₹1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 5, 3};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_max_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original = fixed_width_column_wrapper<int32_t>{
+    dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{
+      dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys        = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals        = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"año", "zit", "₹1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 5, 3};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_max_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys =
+    fixed_width_column_wrapper<int32_t>{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}, no_nulls()};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"año", "zit", "₹1", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 5, 7, null};
+    return structs_column_wrapper{{child1, child2}, null_at(3)};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 4f8db1d750c..c2cfca83b29 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -33,7 +33,7 @@ struct groupby_min_test : public cudf::test::BaseFixture {
 };
 
 using K = int32_t;
-TYPED_TEST_CASE(groupby_min_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
+TYPED_TEST_SUITE(groupby_min_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
 
 TYPED_TEST(groupby_min_test, basic)
 {
@@ -255,7 +255,7 @@ template <typename T>
 struct FixedPointTestBothReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
 
 TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue)
 {
@@ -303,5 +303,89 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue)
   }
 }
 
+struct groupby_min_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_min_struct_test, basic)
+{
+  auto const keys = fixed_width_column_wrapper<int32_t>{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"aaa", "bat", "$1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{4, 6, 8};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_min_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original = fixed_width_column_wrapper<int32_t>{
+    dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{
+      dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys        = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals        = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{1, 2, 3};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"aaa", "bat", "$1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{4, 6, 8};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_min_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys =
+    fixed_width_column_wrapper<int32_t>{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys = fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}, no_nulls()};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{"aaa", "bit", "€1", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{6, 8, 1, null};
+    return structs_column_wrapper{{child1, child2}, null_at(3)};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf

From 895cabb21fdd6799751db08638359735ebb2e9a1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 27 Oct 2021 22:18:21 -0600
Subject: [PATCH 09/48] Rename functor

---
 .../sort/group_single_pass_reduction_util.cuh | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index b292c74f37f..df779062cfd 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -46,7 +46,7 @@ namespace detail {
  * @tparam T Type of the underlying column. Must support '<' operator.
  */
 template <typename T, bool has_nulls, bool arg_min, typename Enable = void>
-struct ArgMinMax {
+struct arg_minmax_fn {
   column_device_view const d_col;
   CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
   {
@@ -75,16 +75,16 @@ struct ArgMinMax {
  * '<' operator.
  */
 template <typename T, bool has_nulls, bool arg_min>
-struct ArgMinMax<T,
-                 has_nulls,
-                 arg_min,
-                 std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>> {
+struct arg_minmax_fn<T,
+                     has_nulls,
+                     arg_min,
+                     std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>> {
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;
 
-  ArgMinMax(size_type const num_rows_,
-            table_device_view const& table_,
-            null_order const* null_precedence)
+  arg_minmax_fn(size_type const num_rows_,
+                table_device_view const& table_,
+                null_order const* null_precedence)
     : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
   {
   }
@@ -190,7 +190,7 @@ struct reduce_functor {
     if (values.is_empty()) { return result; }
 
     // Perform segmented reduction.
-    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
+    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
       thrust::reduce_by_key(rmm::exec_policy(stream),
                             group_labels.data(),
                             group_labels.data() + group_labels.size(),
@@ -198,7 +198,7 @@ struct reduce_functor {
                             thrust::make_discard_iterator(),
                             out_iter,
                             thrust::equal_to<size_type>{},
-                            comp);
+                            binop);
     };
 
     auto const d_values_ptr = column_device_view::create(values, stream);
@@ -207,10 +207,10 @@ struct reduce_functor {
     if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
       auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
       if (values.has_nulls()) {
-        using OpType = ArgMinMax<T, true, K == aggregation::ARGMIN>;
+        using OpType = arg_minmax_fn<T, true, K == aggregation::ARGMIN>;
         do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
       } else {
-        using OpType = ArgMinMax<T, false, K == aggregation::ARGMIN>;
+        using OpType = arg_minmax_fn<T, false, K == aggregation::ARGMIN>;
         do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
       }
     } else {
@@ -270,7 +270,7 @@ struct reduce_functor {
         : rmm::device_uvector<null_order>(0, stream);
 
     // Perform segmented reduction to find ARGMIN/ARGMAX.
-    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& comp) {
+    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
       thrust::reduce_by_key(rmm::exec_policy(stream),
                             group_labels.data(),
                             group_labels.data() + group_labels.size(),
@@ -278,13 +278,13 @@ struct reduce_functor {
                             thrust::make_discard_iterator(),
                             out_iter,
                             thrust::equal_to<size_type>{},
-                            comp);
+                            binop);
     };
 
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (values.has_nulls()) {
-      auto const op = ArgMinMax<T, true, K == aggregation::ARGMIN>(
+      auto const op = arg_minmax_fn<T, true, K == aggregation::ARGMIN>(
         values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
 
@@ -300,7 +300,7 @@ struct reduce_functor {
       result->set_null_mask(std::move(null_mask));
       result->set_null_count(null_count);
     } else {
-      auto const op = ArgMinMax<T, false, K == aggregation::ARGMIN>(
+      auto const op = arg_minmax_fn<T, false, K == aggregation::ARGMIN>(
         values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
     }

From bfc0585151d3d9dcd3bc638fe925f497f474b273 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 28 Oct 2021 09:03:36 -0600
Subject: [PATCH 10/48] Move `has_struct` condition check into
 `can_use_has_groupby`

---
 cpp/src/groupby/groupby.cu      | 10 +---------
 cpp/src/groupby/hash/groupby.cu | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 508353e7759..e8b4a8b1cbf 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -67,21 +67,13 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
 {
   using namespace cudf::structs::detail;
 
-  // Currently, structs are not supported in hash-based aggregates.
-  // Therefore, if any request contains structs then we must fallback to sort-based aggregates.
-  // TODO: Support structs in hash-based aggregates.
-  auto const has_struct =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return r.values.type().id() == type_id::STRUCT;
-    });
-
   // If sort groupby has been called once on this groupby object, then
   // always use sort groupby from now on. Because once keys are sorted,
   // all the aggs that can be done by hash groupby are efficiently done by
   // sort groupby as well.
   // Only use hash groupby if the keys aren't sorted and all requests can be
   // satisfied with a hash implementation
-  if (_keys_are_sorted == sorted::NO and not _helper and (not has_struct) and
+  if (_keys_are_sorted == sorted::NO and not _helper and
       detail::hash::can_use_hash_groupby(_keys, requests)) {
     // Optionally flatten nested key columns.
     auto flattened             = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index e7024c80a68..ef640256927 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -632,11 +632,22 @@ std::unique_ptr<table> groupby_null_templated(table_view const& keys,
  */
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
-  return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-    return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-      return is_hash_aggregation(a->kind);
+  auto const all_hash_aggregations =
+    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+      return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
+        return is_hash_aggregation(a->kind);
+      });
     });
-  });
+
+  // Currently, structs are not supported in any of hash-based aggregations.
+  // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
+  // TODO: Support structs in hash-based aggregations.
+  auto const has_struct =
+    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+      return r.values.type().id() == type_id::STRUCT;
+    });
+
+  return all_hash_aggregations && !has_struct;
 }
 
 // Hash-based groupby

From cc5c8c416250531e9dc26a8d35839ffdc5ddf4b7 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 29 Oct 2021 14:39:08 -0600
Subject: [PATCH 11/48] Rename structs and function

---
 .../sort/group_single_pass_reduction_util.cuh | 36 +++++++++----------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index df779062cfd..22a01fb6cb3 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -45,8 +45,8 @@ namespace detail {
  *
  * @tparam T Type of the underlying column. Must support '<' operator.
  */
-template <typename T, bool has_nulls, bool arg_min, typename Enable = void>
-struct arg_minmax_fn {
+template <typename T, bool has_nulls, bool arg_min>
+struct element_arg_minmax_fn {
   column_device_view const d_col;
   CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
   {
@@ -75,16 +75,13 @@ struct arg_minmax_fn {
  * '<' operator.
  */
 template <typename T, bool has_nulls, bool arg_min>
-struct arg_minmax_fn<T,
-                     has_nulls,
-                     arg_min,
-                     std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>> {
+struct row_arg_minmax_fn {
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;
 
-  arg_minmax_fn(size_type const num_rows_,
-                table_device_view const& table_,
-                null_order const* null_precedence)
+  row_arg_minmax_fn(size_type const num_rows_,
+                    table_device_view const& table_,
+                    null_order const* null_precedence)
     : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
   {
   }
@@ -153,7 +150,7 @@ struct null_replaced_value_accessor : value_accessor<T> {
 template <aggregation::Kind K>
 struct reduce_functor {
   template <typename T>
-  static constexpr bool is_natively_supported()
+  static constexpr bool is_trivially_supported()
   {
     switch (K) {
       case aggregation::SUM:
@@ -169,7 +166,7 @@ struct reduce_functor {
   }
 
   template <typename T>
-  std::enable_if_t<is_natively_supported<T>(), std::unique_ptr<column>> operator()(
+  std::enable_if_t<is_trivially_supported<T>(), std::unique_ptr<column>> operator()(
     column_view const& values,
     size_type num_groups,
     cudf::device_span<size_type const> group_labels,
@@ -207,10 +204,10 @@ struct reduce_functor {
     if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
       auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
       if (values.has_nulls()) {
-        using OpType = arg_minmax_fn<T, true, K == aggregation::ARGMIN>;
+        using OpType = element_arg_minmax_fn<T, true, K == aggregation::ARGMIN>;
         do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
       } else {
-        using OpType = arg_minmax_fn<T, false, K == aggregation::ARGMIN>;
+        using OpType = element_arg_minmax_fn<T, false, K == aggregation::ARGMIN>;
         do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
       }
     } else {
@@ -236,11 +233,12 @@ struct reduce_functor {
   }
 
   // This specialization handles the cases when the input values type:
-  //  - Is not natively supported, and
   //  - Is struct type, and
   //  - Aggregation is either ARGMIN or ARGMAX.
+  // Since `is_trivially_supported` returns false for `struct_view`, we don't have to cover it in
+  // the SFINAE condition.
   template <typename T>
-  std::enable_if_t<not is_natively_supported<T>() and std::is_same_v<T, struct_view> and
+  std::enable_if_t<std::is_same_v<T, struct_view> and
                      (K == aggregation::ARGMIN or K == aggregation::ARGMAX),
                    std::unique_ptr<column>>
   operator()(column_view const& values,
@@ -284,7 +282,7 @@ struct reduce_functor {
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (values.has_nulls()) {
-      auto const op = arg_minmax_fn<T, true, K == aggregation::ARGMIN>(
+      auto const op = row_arg_minmax_fn<T, true, K == aggregation::ARGMIN>(
         values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
 
@@ -300,7 +298,7 @@ struct reduce_functor {
       result->set_null_mask(std::move(null_mask));
       result->set_null_count(null_count);
     } else {
-      auto const op = arg_minmax_fn<T, false, K == aggregation::ARGMIN>(
+      auto const op = row_arg_minmax_fn<T, false, K == aggregation::ARGMIN>(
         values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
       do_reduction(count_iter, result_begin, op);
     }
@@ -309,10 +307,10 @@ struct reduce_functor {
   }
 
   // Throw exception if the input values type:
-  //  - Is not natively supported, and
+  //  - Is not trivially supported, and
   //  - Is not struct type, or is struct type but aggregation is not neither ARGMIN nor ARGMAX.
   template <typename T, typename... Args>
-  std::enable_if_t<not is_natively_supported<T>() and
+  std::enable_if_t<not is_trivially_supported<T>() and
                      (not std::is_same_v<T, struct_view> or
                       (K != aggregation::ARGMIN or K != aggregation::ARGMAX)),
                    std::unique_ptr<column>>

From cd7f7a4ba4fa48410a23cee9e2be01fbf009c537 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 1 Nov 2021 11:12:48 -0600
Subject: [PATCH 12/48] Fix SFINAE condition, and extract a struct functor

---
 .../sort/group_single_pass_reduction_util.cuh | 37 +-----------
 cpp/src/groupby/sort/group_util.cuh           | 60 +++++++++++++++++++
 2 files changed, 63 insertions(+), 34 deletions(-)
 create mode 100644 cpp/src/groupby/sort/group_util.cuh

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 22a01fb6cb3..3e1f994cf7b 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <groupby/sort/group_util.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
@@ -68,39 +70,6 @@ struct element_arg_minmax_fn {
   }
 };
 
-/**
- * @brief Binary operator ArgMin/ArgMax with index values into the input table.
- *
- * @tparam T Type of the underlying data. This is the fallback for the cases when T does not support
- * '<' operator.
- */
-template <typename T, bool has_nulls, bool arg_min>
-struct row_arg_minmax_fn {
-  size_type const num_rows;
-  row_lexicographic_comparator<has_nulls> const comp;
-
-  row_arg_minmax_fn(size_type const num_rows_,
-                    table_device_view const& table_,
-                    null_order const* null_precedence)
-    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
-  {
-  }
-
-  CUDA_DEVICE_CALLABLE auto operator()(size_type lhs_idx, size_type rhs_idx) const
-  {
-    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
-    // github.com/NVIDIA/thrust/issues/1525
-    // where invalid random values may be passed here by thrust::reduce_by_key
-    if (lhs_idx < 0 || lhs_idx >= num_rows) { return rhs_idx; }
-    if (rhs_idx < 0 || rhs_idx >= num_rows) { return lhs_idx; }
-
-    // Return `lhs_idx` iff:
-    //   row(lhs_idx) <  row(rhs_idx) and finding ArgMin, or
-    //   row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
-    return comp(lhs_idx, rhs_idx) == arg_min ? lhs_idx : rhs_idx;
-  }
-};
-
 /**
  * @brief Value accessor for column which supports dictionary column too.
  *
@@ -312,7 +281,7 @@ struct reduce_functor {
   template <typename T, typename... Args>
   std::enable_if_t<not is_trivially_supported<T>() and
                      (not std::is_same_v<T, struct_view> or
-                      (K != aggregation::ARGMIN or K != aggregation::ARGMAX)),
+                      (K != aggregation::ARGMIN and K != aggregation::ARGMAX)),
                    std::unique_ptr<column>>
   operator()(Args&&... args)
   {
diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh
new file mode 100644
index 00000000000..43de1b4a9d8
--- /dev/null
+++ b/cpp/src/groupby/sort/group_util.cuh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/table/row_operators.cuh>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+
+/**
+ * @brief Binary operator ArgMin/ArgMax with index values into the input table.
+ *
+ * @tparam T Type of the underlying data. This is the fallback for the cases when T does not support
+ * '<' operator.
+ */
+template <typename T, bool has_nulls, bool arg_min>
+struct row_arg_minmax_fn {
+  size_type const num_rows;
+  row_lexicographic_comparator<has_nulls> const comp;
+
+  row_arg_minmax_fn(size_type const num_rows_,
+                    table_device_view const& table_,
+                    null_order const* null_precedence)
+    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
+  {
+  }
+
+  CUDA_DEVICE_CALLABLE auto operator()(size_type lhs_idx, size_type rhs_idx) const
+  {
+    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
+    // github.com/NVIDIA/thrust/issues/1525
+    // where invalid random values may be passed here by thrust::reduce_by_key
+    if (lhs_idx < 0 || lhs_idx >= num_rows) { return rhs_idx; }
+    if (rhs_idx < 0 || rhs_idx >= num_rows) { return lhs_idx; }
+
+    // Return `lhs_idx` iff:
+    //   row(lhs_idx) <  row(rhs_idx) and finding ArgMin, or
+    //   row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
+    return comp(lhs_idx, rhs_idx) == arg_min ? lhs_idx : rhs_idx;
+  }
+};
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf

From f7d1b3e027aefe80825f5662bd26cf3d186d500a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 1 Nov 2021 11:13:02 -0600
Subject: [PATCH 13/48] Implement groupby scan for struct min/max

---
 cpp/src/groupby/sort/group_scan_util.cuh | 153 ++++++++++++++++++-----
 1 file changed, 119 insertions(+), 34 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index ef9df937fc5..dba066fc55c 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -16,12 +16,18 @@
 
 #pragma once
 
+#include <groupby/sort/group_util.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
@@ -39,7 +45,7 @@ namespace detail {
 template <aggregation::Kind K>
 struct scan_functor {
   template <typename T>
-  static constexpr bool is_supported()
+  static constexpr bool is_trivially_supported()
   {
     if (K == aggregation::SUM)
       return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
@@ -50,7 +56,7 @@ struct scan_functor {
   }
 
   template <typename T>
-  std::enable_if_t<is_supported<T>() and not std::is_same_v<T, cudf::string_view>,
+  std::enable_if_t<is_trivially_supported<T>() and not std::is_same_v<T, cudf::string_view>,
                    std::unique_ptr<column>>
   operator()(column_view const& values,
              size_type num_groups,
@@ -78,34 +84,33 @@ struct scan_functor {
     auto result_view = mutable_column_device_view::create(result->mutable_view(), stream);
     auto values_view = column_device_view::create(values, stream);
 
-    if (values.has_nulls()) {
-      auto input = thrust::make_transform_iterator(
-        make_null_replacement_iterator(*values_view, OpType::template identity<DeviceType>()),
-        thrust::identity<ResultDeviceType>{});
+    // Perform segmented scan.
+    auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
       thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                     group_labels.begin(),
                                     group_labels.end(),
-                                    input,
-                                    result_view->begin<ResultDeviceType>(),
+                                    inp_iter,
+                                    out_iter,
                                     thrust::equal_to<size_type>{},
-                                    OpType{});
+                                    binop);
+    };
+
+    if (values.has_nulls()) {
+      auto input = thrust::make_transform_iterator(
+        make_null_replacement_iterator(*values_view, OpType::template identity<DeviceType>()),
+        thrust::identity<ResultDeviceType>{});
+      do_scan(input, result_view->begin<ResultDeviceType>(), OpType{});
       result->set_null_mask(cudf::detail::copy_bitmask(values, stream));
     } else {
       auto input = thrust::make_transform_iterator(values_view->begin<DeviceType>(),
                                                    thrust::identity<ResultDeviceType>{});
-      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                    group_labels.begin(),
-                                    group_labels.end(),
-                                    input,
-                                    result_view->begin<ResultDeviceType>(),
-                                    thrust::equal_to<size_type>{},
-                                    OpType{});
+      do_scan(input, result_view->begin<ResultDeviceType>(), OpType{});
     }
     return result;
   }
 
   template <typename T>
-  std::enable_if_t<is_supported<T>() and std::is_same_v<T, cudf::string_view>,
+  std::enable_if_t<is_trivially_supported<T>() and std::is_same_v<T, cudf::string_view>,
                    std::unique_ptr<column>>
   operator()(column_view const& values,
              size_type num_groups,
@@ -115,33 +120,30 @@ struct scan_functor {
   {
     using OpType = cudf::detail::corresponding_operator_t<K>;
 
-    if (values.is_empty()) {
-      return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-    }
+    if (values.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); }
 
     // create an empty output vector we can fill with string_view instances
     auto results_vector = rmm::device_uvector<string_view>(values.size(), stream);
 
     auto values_view = column_device_view::create(values, stream);
 
-    if (values.has_nulls()) {
-      auto input = make_null_replacement_iterator(
-        *values_view, OpType::template identity<string_view>(), values.has_nulls());
+    // Perform segmented scan.
+    auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
       thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                     group_labels.begin(),
                                     group_labels.end(),
-                                    input,
-                                    results_vector.begin(),
+                                    inp_iter,
+                                    out_iter,
                                     thrust::equal_to<size_type>{},
-                                    OpType{});
+                                    binop);
+    };
+
+    if (values.has_nulls()) {
+      auto input = make_null_replacement_iterator(
+        *values_view, OpType::template identity<string_view>(), values.has_nulls());
+      do_scan(input, results_vector.begin(), OpType{});
     } else {
-      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                    group_labels.begin(),
-                                    group_labels.end(),
-                                    values_view->begin<string_view>(),
-                                    results_vector.begin(),
-                                    thrust::equal_to<size_type>{},
-                                    OpType{});
+      do_scan(values_view->begin<string_view>(), results_vector.begin(), OpType{});
     }
 
     // turn the string_view vector into a strings column
@@ -151,8 +153,91 @@ struct scan_functor {
     return results;
   }
 
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::struct_view> and
+                     (K == aggregation::MIN or K == aggregation::MAX),
+                   std::unique_ptr<column>>
+  operator()(column_view const& values,
+             size_type num_groups,
+             cudf::device_span<cudf::size_type const> group_labels,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr)
+  {
+    if (values.is_empty()) { return cudf::empty_like(values); }
+
+    // When finding MIN, we need to consider nulls as larger than non-null elements.
+    // Thing is opposite when finding MAX.
+    auto const null_precedence  = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE;
+    auto const flattened_values = structs::detail::flatten_nested_columns(
+      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
+    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+    auto const flattened_null_precedences =
+      (K == aggregation::MIN)
+        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+        : rmm::device_uvector<null_order>(0, stream);
+
+    // Create a gather map contaning indices of the prefix min/max elements.
+    auto gather_map      = rmm::device_uvector<size_type>(values.size(), stream);
+    auto const map_begin = gather_map.begin();
+
+    // Perform segmented scan.
+    auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    inp_iter,
+                                    out_iter,
+                                    thrust::equal_to<size_type>{},
+                                    binop);
+    };
+
+    // Find the indices of the prefix min/max elements within each group.
+    auto const count_iter = thrust::make_counting_iterator<size_type>(0);
+    if (values.has_nulls()) {
+      auto const op = row_arg_minmax_fn<T, true, K == aggregation::MIN>(
+        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
+      do_scan(count_iter, map_begin, op);
+    } else {
+      auto const op = row_arg_minmax_fn<T, false, K == aggregation::MIN>(
+        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
+      do_scan(count_iter, map_begin, op);
+    }
+
+    auto gather_map_view =
+      column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
+
+    // Gather the children elements of the prefix min/max struct elements first.
+    auto scanned_children =
+      cudf::detail::gather(
+        table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
+        gather_map_view,
+        cudf::out_of_bounds_policy::DONT_CHECK,
+        cudf::detail::negative_index_policy::NOT_ALLOWED,
+        stream,
+        mr)
+        ->release();
+
+    // After gathering the children elements, we need to push down nulls from the root structs
+    // column to them (so we will have null row in => null row out).
+    if (values.has_nulls()) {
+      for (std::unique_ptr<column>& child : scanned_children) {
+        structs::detail::superimpose_parent_nulls(
+          values.null_mask(), values.null_count(), *child, stream, mr);
+      }
+    }
+
+    return make_structs_column(values.size(),
+                               std::move(scanned_children),
+                               values.null_count(),
+                               cudf::detail::copy_bitmask(values, stream));
+  }
+
   template <typename T, typename... Args>
-  std::enable_if_t<not is_supported<T>(), std::unique_ptr<column>> operator()(Args&&... args)
+  std::enable_if_t<not is_trivially_supported<T>() and
+                     (not std::is_same_v<T, cudf::struct_view> or
+                      (K != aggregation::MIN and K != aggregation::MAX)),
+                   std::unique_ptr<column>>
+  operator()(Args&&... args)
   {
     CUDF_FAIL("Unsupported groupby scan type-agg combination");
   }

From 5d77d4f140fabefee7da190f1f717a01191b68c3 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 1 Nov 2021 14:09:46 -0600
Subject: [PATCH 14/48] Implement unit tests

---
 cpp/tests/groupby/max_scan_tests.cpp | 89 +++++++++++++++++++++++++++-
 cpp/tests/groupby/min_scan_tests.cpp | 89 +++++++++++++++++++++++++++-
 2 files changed, 174 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index 5108f91b31c..bb2f87fd424 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -39,7 +39,7 @@ struct groupby_max_scan_test : public cudf::test::BaseFixture {
   using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
 };
 
-TYPED_TEST_CASE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
+TYPED_TEST_SUITE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
 
 TYPED_TEST(groupby_max_scan_test, basic)
 {
@@ -148,7 +148,7 @@ template <typename T>
 struct FixedPointTestBothReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
 
 TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue)
 {
@@ -173,5 +173,90 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue)
   }
 }
 
+struct groupby_max_scan_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_max_scan_struct_test, basic)
+{
+  auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  auto const expect_vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "año", "año", "bit", "zit", "zit", "zit", "₹1", "₹1", "₹1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 1, 1, 2, 5, 5, 5, 3, 3, 3};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_max_scan_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original =
+    key_wrapper{dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = key_wrapper{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys        = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals        = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  auto const expect_vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "año", "año", "bit", "zit", "zit", "zit", "₹1", "₹1", "₹1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 1, 1, 2, 5, 5, 5, 3, 3, 3};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_max_scan_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys = key_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys = key_wrapper{{1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, no_nulls()};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "año", "" /*NULL*/, "bit", "zit", "" /*NULL*/, "zit", "₹1", "₹1", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 9, null, 8, 5, null, 5, 7, 7, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({2, 5, 9})};
+  }();
+
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp
index 59b6382616f..06c0f5ceb3b 100644
--- a/cpp/tests/groupby/min_scan_tests.cpp
+++ b/cpp/tests/groupby/min_scan_tests.cpp
@@ -38,7 +38,7 @@ struct groupby_min_scan_test : public cudf::test::BaseFixture {
   using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
 };
 
-TYPED_TEST_CASE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
+TYPED_TEST_SUITE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);
 
 TYPED_TEST(groupby_min_scan_test, basic)
 {
@@ -146,7 +146,7 @@ template <typename T>
 struct FixedPointTestBothReps : public cudf::test::BaseFixture {
 };
 
-TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
 
 TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue)
 {
@@ -172,5 +172,90 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue)
   }
 }
 
+struct groupby_min_scan_struct_test : public cudf::test::BaseFixture {
+};
+
+TEST_F(groupby_min_scan_struct_test, basic)
+{
+  auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  auto const expect_vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "aaa", "aaa", "bit", "bit", "bat", "bat", "₹1", "$1", "$1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 4, 4, 2, 2, 6, 6, 3, 8, 8};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_min_scan_struct_test, slice_input)
+{
+  constexpr int32_t dont_care{1};
+  auto const keys_original =
+    key_wrapper{dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care};
+  auto const vals_original = [] {
+    auto child1 = strings_column_wrapper{"dont_care",
+                                         "dont_care",
+                                         "año",
+                                         "bit",
+                                         "₹1",
+                                         "aaa",
+                                         "zit",
+                                         "bat",
+                                         "aab",
+                                         "$1",
+                                         "€1",
+                                         "wut",
+                                         "dont_care"};
+    auto child2 = key_wrapper{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto const keys        = cudf::slice(keys_original, {2, 12})[0];
+  auto const vals        = cudf::slice(vals_original, {2, 12})[0];
+  auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  auto const expect_vals = [] {
+    auto child1 =
+      strings_column_wrapper{"año", "aaa", "aaa", "bit", "bit", "bat", "bat", "₹1", "$1", "$1"};
+    auto child2 = fixed_width_column_wrapper<int32_t>{1, 4, 4, 2, 2, 6, 6, 3, 8, 8};
+    return structs_column_wrapper{{child1, child2}};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TEST_F(groupby_min_scan_struct_test, null_keys_and_values)
+{
+  constexpr int32_t null{0};
+  auto const keys = key_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)};
+  auto const vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 8, 7, 6, 5, null, null, 2, 1, 0, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})};
+  }();
+
+  auto const expect_keys = key_wrapper{{1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, no_nulls()};
+  auto const expect_vals = [] {
+    auto child1 = strings_column_wrapper{
+      "año", "aaa", "" /*NULL*/, "bit", "bit", "" /*NULL*/, "bit", "₹1", "€1", "" /*NULL*/};
+    auto child2 = fixed_width_column_wrapper<int32_t>{9, 6, null, 8, 8, null, 8, 7, 1, null};
+    return structs_column_wrapper{{child1, child2}, nulls_at({2, 5, 9})};
+  }();
+
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 }  // namespace test
 }  // namespace cudf

From bce93e40934b0d7dc654040a606690eb524e54d4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 1 Nov 2021 14:31:22 -0600
Subject: [PATCH 15/48] Rewrite SFINAE style

---
 cpp/src/groupby/sort/group_scan_util.cuh      | 109 +++++++++++-------
 .../sort/group_single_pass_reduction_util.cuh | 104 +++++++++--------
 2 files changed, 124 insertions(+), 89 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index dba066fc55c..c899052b058 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -42,27 +42,55 @@
 namespace cudf {
 namespace groupby {
 namespace detail {
+// Error case when no other overload or specialization is available
+template <aggregation::Kind K, typename T, typename Enable = void>
+struct scan_functor_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported groupby scan type-agg combination.");
+  }
+};
+
 template <aggregation::Kind K>
 struct scan_functor {
   template <typename T>
-  static constexpr bool is_trivially_supported()
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
-    if (K == aggregation::SUM)
-      return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
-    else if (K == aggregation::MIN or K == aggregation::MAX)
-      return !cudf::is_dictionary<T>() and is_relationally_comparable<T, T>();
-    else
-      return false;
+    return scan_functor_impl<K, T>{}(values, num_groups, group_labels, stream, mr);
   }
+};
 
-  template <typename T>
-  std::enable_if_t<is_trivially_supported<T>() and not std::is_same_v<T, cudf::string_view>,
-                   std::unique_ptr<column>>
-  operator()(column_view const& values,
-             size_type num_groups,
-             cudf::device_span<cudf::size_type const> group_labels,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+/**
+ * @brief Check if the given aggregation K with data type T is supported in groupby scan.
+ */
+template <aggregation::Kind K, typename T>
+static constexpr bool is_scan_supported()
+{
+  if (K == aggregation::SUM)
+    return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
+  else if (K == aggregation::MIN or K == aggregation::MAX)
+    return not cudf::is_dictionary<T>() and
+           (is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>);
+  else
+    return false;
+}
+
+template <aggregation::Kind K, typename T>
+struct scan_functor_impl<
+  K,
+  T,
+  std::enable_if_t<is_scan_supported<K, T>() and not std::is_same_v<T, cudf::string_view> and
+                   not std::is_same_v<T, cudf::struct_view>>> {
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
     using DeviceType       = device_storage_type_t<T>;
     using OpType           = cudf::detail::corresponding_operator_t<K>;
@@ -108,15 +136,18 @@ struct scan_functor {
     }
     return result;
   }
+};
 
-  template <typename T>
-  std::enable_if_t<is_trivially_supported<T>() and std::is_same_v<T, cudf::string_view>,
-                   std::unique_ptr<column>>
-  operator()(column_view const& values,
-             size_type num_groups,
-             cudf::device_span<cudf::size_type const> group_labels,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+template <aggregation::Kind K, typename T>
+struct scan_functor_impl<
+  K,
+  T,
+  std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::string_view>>> {
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
     using OpType = cudf::detail::corresponding_operator_t<K>;
 
@@ -152,16 +183,18 @@ struct scan_functor {
       results->set_null_mask(cudf::detail::copy_bitmask(values, stream), values.null_count());
     return results;
   }
+};
 
-  template <typename T>
-  std::enable_if_t<std::is_same_v<T, cudf::struct_view> and
-                     (K == aggregation::MIN or K == aggregation::MAX),
-                   std::unique_ptr<column>>
-  operator()(column_view const& values,
-             size_type num_groups,
-             cudf::device_span<cudf::size_type const> group_labels,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+template <aggregation::Kind K, typename T>
+struct scan_functor_impl<
+  K,
+  T,
+  std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
     if (values.is_empty()) { return cudf::empty_like(values); }
 
@@ -218,7 +251,7 @@ struct scan_functor {
         ->release();
 
     // After gathering the children elements, we need to push down nulls from the root structs
-    // column to them (so we will have null row in => null row out).
+    // column to them.
     if (values.has_nulls()) {
       for (std::unique_ptr<column>& child : scanned_children) {
         structs::detail::superimpose_parent_nulls(
@@ -231,16 +264,6 @@ struct scan_functor {
                                values.null_count(),
                                cudf::detail::copy_bitmask(values, stream));
   }
-
-  template <typename T, typename... Args>
-  std::enable_if_t<not is_trivially_supported<T>() and
-                     (not std::is_same_v<T, cudf::struct_view> or
-                      (K != aggregation::MIN and K != aggregation::MAX)),
-                   std::unique_ptr<column>>
-  operator()(Args&&... args)
-  {
-    CUDF_FAIL("Unsupported groupby scan type-agg combination");
-  }
 };
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 3e1f994cf7b..e3dbfe23e57 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -116,31 +116,59 @@ struct null_replaced_value_accessor : value_accessor<T> {
   }
 };
 
+// Error case when no other overload or specialization is available
+template <aggregation::Kind K, typename T, typename Enable = void>
+struct reduce_functor_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported groupby reduction type-agg combination.");
+  }
+};
+
 template <aggregation::Kind K>
 struct reduce_functor {
   template <typename T>
-  static constexpr bool is_trivially_supported()
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
-    switch (K) {
-      case aggregation::SUM:
-        return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
-      case aggregation::PRODUCT: return cudf::detail::is_product_supported<T>();
-      case aggregation::MIN:
-      case aggregation::MAX:
-        return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
-      case aggregation::ARGMIN:
-      case aggregation::ARGMAX: return is_relationally_comparable<T, T>();
-      default: return false;
-    }
+    return reduce_functor_impl<K, T>{}(values, num_groups, group_labels, stream, mr);
   }
+};
+
+/**
+ * @brief Check if the given aggregation K with data type T is supported in groupby reduction.
+ */
+template <aggregation::Kind K, typename T>
+static constexpr bool is_redution_supported()
+{
+  switch (K) {
+    case aggregation::SUM:
+      return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
+    case aggregation::PRODUCT: return cudf::detail::is_product_supported<T>();
+    case aggregation::MIN:
+    case aggregation::MAX: return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
+    case aggregation::ARGMIN:
+    case aggregation::ARGMAX:
+      return is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>;
+    default: return false;
+  }
+}
+
+template <aggregation::Kind K, typename T>
+struct reduce_functor_impl<
+  K,
+  T,
+  std::enable_if_t<is_redution_supported<K, T>() and not std::is_same_v<T, cudf::struct_view>>> {
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
 
-  template <typename T>
-  std::enable_if_t<is_trivially_supported<T>(), std::unique_ptr<column>> operator()(
-    column_view const& values,
-    size_type num_groups,
-    cudf::device_span<size_type const> group_labels,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
   {
     using DeviceType  = device_storage_type_t<T>;
     using ResultType  = cudf::detail::target_type_t<T, K>;
@@ -200,21 +228,18 @@ struct reduce_functor {
     }
     return result;
   }
+};
 
-  // This specialization handles the cases when the input values type:
-  //  - Is struct type, and
-  //  - Aggregation is either ARGMIN or ARGMAX.
-  // Since `is_trivially_supported` returns false for `struct_view`, we don't have to cover it in
-  // the SFINAE condition.
-  template <typename T>
-  std::enable_if_t<std::is_same_v<T, struct_view> and
-                     (K == aggregation::ARGMIN or K == aggregation::ARGMAX),
-                   std::unique_ptr<column>>
-  operator()(column_view const& values,
-             size_type num_groups,
-             cudf::device_span<size_type const> group_labels,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+template <aggregation::Kind K, typename T>
+struct reduce_functor_impl<
+  K,
+  T,
+  std::enable_if_t<is_redution_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+  std::unique_ptr<column> operator()(column_view const& values,
+                                     size_type num_groups,
+                                     cudf::device_span<cudf::size_type const> group_labels,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
     // This is be expected to be size_type.
     using ResultType = cudf::detail::target_type_t<T, K>;
@@ -274,19 +299,6 @@ struct reduce_functor {
 
     return result;
   }
-
-  // Throw exception if the input values type:
-  //  - Is not trivially supported, and
-  //  - Is not struct type, or is struct type but aggregation is not neither ARGMIN nor ARGMAX.
-  template <typename T, typename... Args>
-  std::enable_if_t<not is_trivially_supported<T>() and
-                     (not std::is_same_v<T, struct_view> or
-                      (K != aggregation::ARGMIN and K != aggregation::ARGMAX)),
-                   std::unique_ptr<column>>
-  operator()(Args&&... args)
-  {
-    CUDF_FAIL("Unsupported type-agg combination");
-  }
 };
 
 }  // namespace detail

From b1b916f2239fdd081255eccd334169cb74b382b6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 1 Nov 2021 16:21:47 -0600
Subject: [PATCH 16/48] Add missing `mr` parameter

---
 cpp/src/groupby/sort/group_scan_util.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index c899052b058..a1f4b6c9843 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -262,7 +262,7 @@ struct scan_functor_impl<
     return make_structs_column(values.size(),
                                std::move(scanned_children),
                                values.null_count(),
-                               cudf::detail::copy_bitmask(values, stream));
+                               cudf::detail::copy_bitmask(values, stream, mr));
   }
 };
 

From 75e201f5da16ac3907a9d161f8fc134f81c3736c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 2 Nov 2021 09:24:52 -0600
Subject: [PATCH 17/48] Refactor `row_arg_minmax`

---
 cpp/src/groupby/sort/group_scan_util.cuh      | 16 ++++---
 .../sort/group_single_pass_reduction_util.cuh | 45 ++++++++++---------
 cpp/src/groupby/sort/group_util.cuh           |  9 ++--
 3 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index a1f4b6c9843..743ef97d505 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -227,13 +227,17 @@ struct scan_functor_impl<
     // Find the indices of the prefix min/max elements within each group.
     auto const count_iter = thrust::make_counting_iterator<size_type>(0);
     if (values.has_nulls()) {
-      auto const op = row_arg_minmax_fn<T, true, K == aggregation::MIN>(
-        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
-      do_scan(count_iter, map_begin, op);
+      auto const binop = row_arg_minmax_fn<T, true>(values.size(),
+                                                    *d_flattened_values_ptr,
+                                                    flattened_null_precedences.data(),
+                                                    K == aggregation::MIN);
+      do_scan(count_iter, map_begin, binop);
     } else {
-      auto const op = row_arg_minmax_fn<T, false, K == aggregation::MIN>(
-        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
-      do_scan(count_iter, map_begin, op);
+      auto const binop = row_arg_minmax_fn<T, false>(values.size(),
+                                                     *d_flattened_values_ptr,
+                                                     flattened_null_precedences.data(),
+                                                     K == aggregation::MIN);
+      do_scan(count_iter, map_begin, binop);
     }
 
     auto gather_map_view =
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index e3dbfe23e57..9e8451f7794 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -47,9 +47,12 @@ namespace detail {
  *
  * @tparam T Type of the underlying column. Must support '<' operator.
  */
-template <typename T, bool has_nulls, bool arg_min>
+template <typename T>
 struct element_arg_minmax_fn {
   column_device_view const d_col;
+  bool const has_nulls;
+  bool const arg_min;
+
   CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
   {
     // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
@@ -200,19 +203,15 @@ struct reduce_functor_impl<
 
     if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) {
       auto const count_iter = thrust::make_counting_iterator<ResultType>(0);
-      if (values.has_nulls()) {
-        using OpType = element_arg_minmax_fn<T, true, K == aggregation::ARGMIN>;
-        do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
-      } else {
-        using OpType = element_arg_minmax_fn<T, false, K == aggregation::ARGMIN>;
-        do_reduction(count_iter, result_begin, OpType{*d_values_ptr});
-      }
+      auto const binop =
+        element_arg_minmax_fn<T>{*d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN};
+      do_reduction(count_iter, result_begin, binop);
     } else {
-      using OpType = cudf::detail::corresponding_operator_t<K>;
-      auto init    = OpType::template identity<DeviceType>();
-      auto begin   = cudf::detail::make_counting_transform_iterator(
+      using OpType    = cudf::detail::corresponding_operator_t<K>;
+      auto init       = OpType::template identity<DeviceType>();
+      auto inp_values = cudf::detail::make_counting_transform_iterator(
         0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()});
-      do_reduction(begin, result_begin, OpType{});
+      do_reduction(inp_values, result_begin, OpType{});
     }
 
     if (values.has_nulls()) {
@@ -223,8 +222,7 @@ struct reduce_functor_impl<
 
       auto [null_mask, null_count] = cudf::detail::valid_if(
         validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
-      result->set_null_mask(std::move(null_mask));
-      result->set_null_count(null_count);
+      result->set_null_mask(std::move(null_mask), null_count);
     }
     return result;
   }
@@ -276,9 +274,11 @@ struct reduce_functor_impl<
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (values.has_nulls()) {
-      auto const op = row_arg_minmax_fn<T, true, K == aggregation::ARGMIN>(
-        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
-      do_reduction(count_iter, result_begin, op);
+      auto const binop = row_arg_minmax_fn<T, true>(values.size(),
+                                                    *d_flattened_values_ptr,
+                                                    flattened_null_precedences.data(),
+                                                    K == aggregation::ARGMIN);
+      do_reduction(count_iter, result_begin, binop);
 
       // Generate bitmask for the output by segmented reduction of the input bitmask.
       auto const d_values_ptr = column_device_view::create(values, stream);
@@ -289,12 +289,13 @@ struct reduce_functor_impl<
 
       auto [null_mask, null_count] = cudf::detail::valid_if(
         validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
-      result->set_null_mask(std::move(null_mask));
-      result->set_null_count(null_count);
+      result->set_null_mask(std::move(null_mask), null_count);
     } else {
-      auto const op = row_arg_minmax_fn<T, false, K == aggregation::ARGMIN>(
-        values.size(), *d_flattened_values_ptr, flattened_null_precedences.data());
-      do_reduction(count_iter, result_begin, op);
+      auto const binop = row_arg_minmax_fn<T, false>(values.size(),
+                                                     *d_flattened_values_ptr,
+                                                     flattened_null_precedences.data(),
+                                                     K == aggregation::ARGMIN);
+      do_reduction(count_iter, result_begin, binop);
     }
 
     return result;
diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh
index 43de1b4a9d8..6475585a97e 100644
--- a/cpp/src/groupby/sort/group_util.cuh
+++ b/cpp/src/groupby/sort/group_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,15 +28,16 @@ namespace detail {
  * @tparam T Type of the underlying data. This is the fallback for the cases when T does not support
  * '<' operator.
  */
-template <typename T, bool has_nulls, bool arg_min>
+template <typename T, bool has_nulls>
 struct row_arg_minmax_fn {
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;
+  bool const arg_min;
 
   row_arg_minmax_fn(size_type const num_rows_,
                     table_device_view const& table_,
-                    null_order const* null_precedence)
-    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence)
+                    null_order const* null_precedence_, bool const arg_min_)
+    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence_), arg_min(arg_min_)
   {
   }
 

From 08a60f8f8b39f2d37c4b7313e32208e8c2db0f94 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 2 Nov 2021 13:00:39 -0600
Subject: [PATCH 18/48] Adopt "dispatch to static invoke" pattern

---
 cpp/src/groupby/sort/group_scan_util.cuh      | 34 +++++++++----------
 .../sort/group_single_pass_reduction_util.cuh | 24 ++++++-------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 743ef97d505..570f349642f 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -46,7 +46,7 @@ namespace detail {
 template <aggregation::Kind K, typename T, typename Enable = void>
 struct scan_functor_impl {
   template <typename... Args>
-  std::unique_ptr<column> operator()(Args&&...)
+  static std::unique_ptr<column> invoke(Args&&...)
   {
     CUDF_FAIL("Unsupported groupby scan type-agg combination.");
   }
@@ -61,7 +61,7 @@ struct scan_functor {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    return scan_functor_impl<K, T>{}(values, num_groups, group_labels, stream, mr);
+    return scan_functor_impl<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
 };
 
@@ -86,11 +86,11 @@ struct scan_functor_impl<
   T,
   std::enable_if_t<is_scan_supported<K, T>() and not std::is_same_v<T, cudf::string_view> and
                    not std::is_same_v<T, cudf::struct_view>>> {
-  std::unique_ptr<column> operator()(column_view const& values,
-                                     size_type num_groups,
-                                     cudf::device_span<cudf::size_type const> group_labels,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
   {
     using DeviceType       = device_storage_type_t<T>;
     using OpType           = cudf::detail::corresponding_operator_t<K>;
@@ -143,11 +143,11 @@ struct scan_functor_impl<
   K,
   T,
   std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::string_view>>> {
-  std::unique_ptr<column> operator()(column_view const& values,
-                                     size_type num_groups,
-                                     cudf::device_span<cudf::size_type const> group_labels,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
   {
     using OpType = cudf::detail::corresponding_operator_t<K>;
 
@@ -190,11 +190,11 @@ struct scan_functor_impl<
   K,
   T,
   std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
-  std::unique_ptr<column> operator()(column_view const& values,
-                                     size_type num_groups,
-                                     cudf::device_span<cudf::size_type const> group_labels,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
   {
     if (values.is_empty()) { return cudf::empty_like(values); }
 
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 9e8451f7794..dd2098c81ec 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -123,7 +123,7 @@ struct null_replaced_value_accessor : value_accessor<T> {
 template <aggregation::Kind K, typename T, typename Enable = void>
 struct reduce_functor_impl {
   template <typename... Args>
-  std::unique_ptr<column> operator()(Args&&...)
+  static std::unique_ptr<column> invoke(Args&&...)
   {
     CUDF_FAIL("Unsupported groupby reduction type-agg combination.");
   }
@@ -138,7 +138,7 @@ struct reduce_functor {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    return reduce_functor_impl<K, T>{}(values, num_groups, group_labels, stream, mr);
+    return reduce_functor_impl<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
 };
 
@@ -166,11 +166,11 @@ struct reduce_functor_impl<
   K,
   T,
   std::enable_if_t<is_redution_supported<K, T>() and not std::is_same_v<T, cudf::struct_view>>> {
-  std::unique_ptr<column> operator()(column_view const& values,
-                                     size_type num_groups,
-                                     cudf::device_span<cudf::size_type const> group_labels,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
 
   {
     using DeviceType  = device_storage_type_t<T>;
@@ -233,11 +233,11 @@ struct reduce_functor_impl<
   K,
   T,
   std::enable_if_t<is_redution_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
-  std::unique_ptr<column> operator()(column_view const& values,
-                                     size_type num_groups,
-                                     cudf::device_span<cudf::size_type const> group_labels,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
   {
     // This is be expected to be size_type.
     using ResultType = cudf::detail::target_type_t<T, K>;

From 3a0c580bdb7ae9339b9316f95ec6d49b624dbd12 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 2 Nov 2021 13:19:58 -0600
Subject: [PATCH 19/48] Rename functors to better expressive names

---
 cpp/src/groupby/sort/group_argmax.cu          |  2 +-
 cpp/src/groupby/sort/group_argmin.cu          |  2 +-
 cpp/src/groupby/sort/group_max.cu             |  9 +++++++--
 cpp/src/groupby/sort/group_max_scan.cu        |  2 +-
 cpp/src/groupby/sort/group_min.cu             |  9 +++++++--
 cpp/src/groupby/sort/group_min_scan.cu        |  9 +++++++--
 cpp/src/groupby/sort/group_product.cu         |  2 +-
 cpp/src/groupby/sort/group_scan_util.cuh      | 20 +++++++++----------
 .../sort/group_single_pass_reduction_util.cuh | 20 +++++++++----------
 cpp/src/groupby/sort/group_sum.cu             |  9 +++++++--
 cpp/src/groupby/sort/group_sum_scan.cu        |  9 +++++++--
 11 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu
index 6ce23ffc35b..466171ec80b 100644
--- a/cpp/src/groupby/sort/group_argmax.cu
+++ b/cpp/src/groupby/sort/group_argmax.cu
@@ -34,7 +34,7 @@ std::unique_ptr<column> group_argmax(column_view const& values,
                                      rmm::mr::device_memory_resource* mr)
 {
   auto indices = type_dispatcher(values.type(),
-                                 reduce_functor<aggregation::ARGMAX>{},
+                                 group_reduction_dispatcher<aggregation::ARGMAX>{},
                                  values,
                                  num_groups,
                                  group_labels,
diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu
index ab91c2c0d29..4f7b2b713e6 100644
--- a/cpp/src/groupby/sort/group_argmin.cu
+++ b/cpp/src/groupby/sort/group_argmin.cu
@@ -34,7 +34,7 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                      rmm::mr::device_memory_resource* mr)
 {
   auto indices = type_dispatcher(values.type(),
-                                 reduce_functor<aggregation::ARGMIN>{},
+                                 group_reduction_dispatcher<aggregation::ARGMIN>{},
                                  values,
                                  num_groups,
                                  group_labels,
diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu
index 7dd0e43ad28..5da15266233 100644
--- a/cpp/src/groupby/sort/group_max.cu
+++ b/cpp/src/groupby/sort/group_max.cu
@@ -30,8 +30,13 @@ std::unique_ptr<column> group_max(column_view const& values,
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
                        : values.type();
-  return type_dispatcher(
-    values_type, reduce_functor<aggregation::MAX>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values_type,
+                         group_reduction_dispatcher<aggregation::MAX>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu
index 303d606be9d..15d13f3f4b9 100644
--- a/cpp/src/groupby/sort/group_max_scan.cu
+++ b/cpp/src/groupby/sort/group_max_scan.cu
@@ -28,7 +28,7 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  rmm::mr::device_memory_resource* mr)
 {
   return type_dispatcher(
-    values.type(), scan_functor<aggregation::MAX>{}, values, num_groups, group_labels, stream, mr);
+    values.type(), group_scan_dispatcher<aggregation::MAX>{}, values, num_groups, group_labels, stream, mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu
index 4124ec0f6f6..c42a0b94de0 100644
--- a/cpp/src/groupby/sort/group_min.cu
+++ b/cpp/src/groupby/sort/group_min.cu
@@ -30,8 +30,13 @@ std::unique_ptr<column> group_min(column_view const& values,
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
                        : values.type();
-  return type_dispatcher(
-    values_type, reduce_functor<aggregation::MIN>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values_type,
+                         group_reduction_dispatcher<aggregation::MIN>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu
index 4a692cdf0bd..daaeb6bb6f7 100644
--- a/cpp/src/groupby/sort/group_min_scan.cu
+++ b/cpp/src/groupby/sort/group_min_scan.cu
@@ -27,8 +27,13 @@ std::unique_ptr<column> min_scan(column_view const& values,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  return type_dispatcher(
-    values.type(), scan_functor<aggregation::MIN>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values.type(),
+                         group_scan_dispatcher<aggregation::MIN>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu
index e9cf8611b58..74f5cbed041 100644
--- a/cpp/src/groupby/sort/group_product.cu
+++ b/cpp/src/groupby/sort/group_product.cu
@@ -33,7 +33,7 @@ std::unique_ptr<column> group_product(column_view const& values,
                        ? dictionary_column_view(values).keys().type()
                        : values.type();
   return type_dispatcher(values_type,
-                         reduce_functor<aggregation::PRODUCT>{},
+                         group_reduction_dispatcher<aggregation::PRODUCT>{},
                          values,
                          num_groups,
                          group_labels,
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 570f349642f..3b61cf5725b 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -44,7 +44,7 @@ namespace groupby {
 namespace detail {
 // Error case when no other overload or specialization is available
 template <aggregation::Kind K, typename T, typename Enable = void>
-struct scan_functor_impl {
+struct group_scan_functor {
   template <typename... Args>
   static std::unique_ptr<column> invoke(Args&&...)
   {
@@ -53,7 +53,7 @@ struct scan_functor_impl {
 };
 
 template <aggregation::Kind K>
-struct scan_functor {
+struct group_scan_dispatcher {
   template <typename T>
   std::unique_ptr<column> operator()(column_view const& values,
                                      size_type num_groups,
@@ -61,7 +61,7 @@ struct scan_functor {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    return scan_functor_impl<K, T>::invoke(values, num_groups, group_labels, stream, mr);
+    return group_scan_functor<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
 };
 
@@ -69,7 +69,7 @@ struct scan_functor {
  * @brief Check if the given aggregation K with data type T is supported in groupby scan.
  */
 template <aggregation::Kind K, typename T>
-static constexpr bool is_scan_supported()
+static constexpr bool is_group_scan_supported()
 {
   if (K == aggregation::SUM)
     return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
@@ -81,10 +81,10 @@ static constexpr bool is_scan_supported()
 }
 
 template <aggregation::Kind K, typename T>
-struct scan_functor_impl<
+struct group_scan_functor<
   K,
   T,
-  std::enable_if_t<is_scan_supported<K, T>() and not std::is_same_v<T, cudf::string_view> and
+  std::enable_if_t<is_group_scan_supported<K, T>() and not std::is_same_v<T, cudf::string_view> and
                    not std::is_same_v<T, cudf::struct_view>>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
@@ -139,10 +139,10 @@ struct scan_functor_impl<
 };
 
 template <aggregation::Kind K, typename T>
-struct scan_functor_impl<
+struct group_scan_functor<
   K,
   T,
-  std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::string_view>>> {
+  std::enable_if_t<is_group_scan_supported<K, T>() and std::is_same_v<T, cudf::string_view>>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -186,10 +186,10 @@ struct scan_functor_impl<
 };
 
 template <aggregation::Kind K, typename T>
-struct scan_functor_impl<
+struct group_scan_functor<
   K,
   T,
-  std::enable_if_t<is_scan_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+  std::enable_if_t<is_group_scan_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index dd2098c81ec..1c5e23cd5c8 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -121,7 +121,7 @@ struct null_replaced_value_accessor : value_accessor<T> {
 
 // Error case when no other overload or specialization is available
 template <aggregation::Kind K, typename T, typename Enable = void>
-struct reduce_functor_impl {
+struct group_reduction_functor {
   template <typename... Args>
   static std::unique_ptr<column> invoke(Args&&...)
   {
@@ -130,7 +130,7 @@ struct reduce_functor_impl {
 };
 
 template <aggregation::Kind K>
-struct reduce_functor {
+struct group_reduction_dispatcher {
   template <typename T>
   std::unique_ptr<column> operator()(column_view const& values,
                                      size_type num_groups,
@@ -138,7 +138,7 @@ struct reduce_functor {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    return reduce_functor_impl<K, T>::invoke(values, num_groups, group_labels, stream, mr);
+    return group_reduction_functor<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
 };
 
@@ -146,7 +146,7 @@ struct reduce_functor {
  * @brief Check if the given aggregation K with data type T is supported in groupby reduction.
  */
 template <aggregation::Kind K, typename T>
-static constexpr bool is_redution_supported()
+static constexpr bool is_group_redution_supported()
 {
   switch (K) {
     case aggregation::SUM:
@@ -162,10 +162,10 @@ static constexpr bool is_redution_supported()
 }
 
 template <aggregation::Kind K, typename T>
-struct reduce_functor_impl<
-  K,
-  T,
-  std::enable_if_t<is_redution_supported<K, T>() and not std::is_same_v<T, cudf::struct_view>>> {
+struct group_reduction_functor<K,
+                               T,
+                               std::enable_if_t<is_group_redution_supported<K, T>() and
+                                                not std::is_same_v<T, cudf::struct_view>>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -229,10 +229,10 @@ struct reduce_functor_impl<
 };
 
 template <aggregation::Kind K, typename T>
-struct reduce_functor_impl<
+struct group_reduction_functor<
   K,
   T,
-  std::enable_if_t<is_redution_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+  std::enable_if_t<is_group_redution_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu
index e9e6e985c54..e3c2ce7c864 100644
--- a/cpp/src/groupby/sort/group_sum.cu
+++ b/cpp/src/groupby/sort/group_sum.cu
@@ -32,8 +32,13 @@ std::unique_ptr<column> group_sum(column_view const& values,
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
                        : values.type();
-  return type_dispatcher(
-    values_type, reduce_functor<aggregation::SUM>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values_type,
+                         group_reduction_dispatcher<aggregation::SUM>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu
index ae9b1c321d4..632fde3b9d5 100644
--- a/cpp/src/groupby/sort/group_sum_scan.cu
+++ b/cpp/src/groupby/sort/group_sum_scan.cu
@@ -27,8 +27,13 @@ std::unique_ptr<column> sum_scan(column_view const& values,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  return type_dispatcher(
-    values.type(), scan_functor<aggregation::SUM>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values.type(),
+                         group_scan_dispatcher<aggregation::SUM>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail

From f4c53c274c721ebfb5eb937cfb93901b3bdef278 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 2 Nov 2021 14:26:34 -0600
Subject: [PATCH 20/48] Fix formatting style

---
 cpp/src/groupby/sort/group_util.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh
index 6475585a97e..08803ba7916 100644
--- a/cpp/src/groupby/sort/group_util.cuh
+++ b/cpp/src/groupby/sort/group_util.cuh
@@ -36,7 +36,8 @@ struct row_arg_minmax_fn {
 
   row_arg_minmax_fn(size_type const num_rows_,
                     table_device_view const& table_,
-                    null_order const* null_precedence_, bool const arg_min_)
+                    null_order const* null_precedence_,
+                    bool const arg_min_)
     : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence_), arg_min(arg_min_)
   {
   }

From b1a3628ede3fc1943c02f31dc32e0f768b76332d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 2 Nov 2021 14:30:40 -0600
Subject: [PATCH 21/48] Fix formatting style

---
 cpp/src/groupby/sort/group_max_scan.cu | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu
index 15d13f3f4b9..1551dc00a04 100644
--- a/cpp/src/groupby/sort/group_max_scan.cu
+++ b/cpp/src/groupby/sort/group_max_scan.cu
@@ -27,8 +27,13 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  return type_dispatcher(
-    values.type(), group_scan_dispatcher<aggregation::MAX>{}, values, num_groups, group_labels, stream, mr);
+  return type_dispatcher(values.type(),
+                         group_scan_dispatcher<aggregation::MAX>{},
+                         values,
+                         num_groups,
+                         group_labels,
+                         stream,
+                         mr);
 }
 
 }  // namespace detail

From d868d6628c4b5bc6c70926eec8dc344dc8485fd4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 3 Nov 2021 09:28:17 -0600
Subject: [PATCH 22/48] Remove redundant template argument

---
 cpp/src/groupby/sort/group_util.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh
index 08803ba7916..98fcb199fb7 100644
--- a/cpp/src/groupby/sort/group_util.cuh
+++ b/cpp/src/groupby/sort/group_util.cuh
@@ -28,7 +28,7 @@ namespace detail {
  * @tparam T Type of the underlying data. This is the fallback for the cases when T does not support
  * '<' operator.
  */
-template <typename T, bool has_nulls>
+template <bool has_nulls>
 struct row_arg_minmax_fn {
   size_type const num_rows;
   row_lexicographic_comparator<has_nulls> const comp;

From 94eed997c05236e049ca4d6114f91cdc63c3ad52 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 3 Nov 2021 09:29:05 -0600
Subject: [PATCH 23/48] Rewrite SFINAE into specialization

---
 cpp/src/groupby/sort/group_scan_util.cuh      | 40 ++++++++-----------
 .../sort/group_single_pass_reduction_util.cuh | 29 ++++++--------
 2 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 3b61cf5725b..013ea924cce 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -81,11 +81,7 @@ static constexpr bool is_group_scan_supported()
 }
 
 template <aggregation::Kind K, typename T>
-struct group_scan_functor<
-  K,
-  T,
-  std::enable_if_t<is_group_scan_supported<K, T>() and not std::is_same_v<T, cudf::string_view> and
-                   not std::is_same_v<T, cudf::struct_view>>> {
+struct group_scan_functor<K, T, std::enable_if_t<is_group_scan_supported<K, T>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -138,11 +134,10 @@ struct group_scan_functor<
   }
 };
 
-template <aggregation::Kind K, typename T>
-struct group_scan_functor<
-  K,
-  T,
-  std::enable_if_t<is_group_scan_supported<K, T>() and std::is_same_v<T, cudf::string_view>>> {
+template <aggregation::Kind K>
+struct group_scan_functor<K,
+                          cudf::string_view,
+                          std::enable_if_t<is_group_scan_supported<K, cudf::string_view>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -185,11 +180,10 @@ struct group_scan_functor<
   }
 };
 
-template <aggregation::Kind K, typename T>
-struct group_scan_functor<
-  K,
-  T,
-  std::enable_if_t<is_group_scan_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+template <aggregation::Kind K>
+struct group_scan_functor<K,
+                          cudf::struct_view,
+                          std::enable_if_t<is_group_scan_supported<K, cudf::struct_view>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -227,16 +221,16 @@ struct group_scan_functor<
     // Find the indices of the prefix min/max elements within each group.
     auto const count_iter = thrust::make_counting_iterator<size_type>(0);
     if (values.has_nulls()) {
-      auto const binop = row_arg_minmax_fn<T, true>(values.size(),
-                                                    *d_flattened_values_ptr,
-                                                    flattened_null_precedences.data(),
-                                                    K == aggregation::MIN);
+      auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                                 *d_flattened_values_ptr,
+                                                 flattened_null_precedences.data(),
+                                                 K == aggregation::MIN);
       do_scan(count_iter, map_begin, binop);
     } else {
-      auto const binop = row_arg_minmax_fn<T, false>(values.size(),
-                                                     *d_flattened_values_ptr,
-                                                     flattened_null_precedences.data(),
-                                                     K == aggregation::MIN);
+      auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                  *d_flattened_values_ptr,
+                                                  flattened_null_precedences.data(),
+                                                  K == aggregation::MIN);
       do_scan(count_iter, map_begin, binop);
     }
 
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 1c5e23cd5c8..73ca658ce81 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -162,10 +162,7 @@ static constexpr bool is_group_redution_supported()
 }
 
 template <aggregation::Kind K, typename T>
-struct group_reduction_functor<K,
-                               T,
-                               std::enable_if_t<is_group_redution_supported<K, T>() and
-                                                not std::is_same_v<T, cudf::struct_view>>> {
+struct group_reduction_functor<K, T, std::enable_if_t<is_group_redution_supported<K, T>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -228,11 +225,11 @@ struct group_reduction_functor<K,
   }
 };
 
-template <aggregation::Kind K, typename T>
+template <aggregation::Kind K>
 struct group_reduction_functor<
   K,
-  T,
-  std::enable_if_t<is_group_redution_supported<K, T>() and std::is_same_v<T, cudf::struct_view>>> {
+  cudf::struct_view,
+  std::enable_if_t<is_group_redution_supported<K, cudf::struct_view>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
@@ -240,7 +237,7 @@ struct group_reduction_functor<
                                         rmm::mr::device_memory_resource* mr)
   {
     // This is be expected to be size_type.
-    using ResultType = cudf::detail::target_type_t<T, K>;
+    using ResultType = cudf::detail::target_type_t<cudf::struct_view, K>;
 
     auto result = make_fixed_width_column(
       data_type{type_to_id<ResultType>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
@@ -274,10 +271,10 @@ struct group_reduction_functor<
     auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
     auto const result_begin = result->mutable_view().template begin<ResultType>();
     if (values.has_nulls()) {
-      auto const binop = row_arg_minmax_fn<T, true>(values.size(),
-                                                    *d_flattened_values_ptr,
-                                                    flattened_null_precedences.data(),
-                                                    K == aggregation::ARGMIN);
+      auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                                 *d_flattened_values_ptr,
+                                                 flattened_null_precedences.data(),
+                                                 K == aggregation::ARGMIN);
       do_reduction(count_iter, result_begin, binop);
 
       // Generate bitmask for the output by segmented reduction of the input bitmask.
@@ -291,10 +288,10 @@ struct group_reduction_functor<
         validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
       result->set_null_mask(std::move(null_mask), null_count);
     } else {
-      auto const binop = row_arg_minmax_fn<T, false>(values.size(),
-                                                     *d_flattened_values_ptr,
-                                                     flattened_null_precedences.data(),
-                                                     K == aggregation::ARGMIN);
+      auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                  *d_flattened_values_ptr,
+                                                  flattened_null_precedences.data(),
+                                                  K == aggregation::ARGMIN);
       do_reduction(count_iter, result_begin, binop);
     }
 

From cb6fb5f2dfcc0c0378f051039148588d0181c1e5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 10:12:04 -0600
Subject: [PATCH 24/48] Attempt to patch thrust

---
 cpp/cmake/thrust.patch | 73 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch
index 2f9201d8ab4..772b63d534d 100644
--- a/cpp/cmake/thrust.patch
+++ b/cpp/cmake/thrust.patch
@@ -1,3 +1,76 @@
+diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h
+index c9178628..b287b0d4 100644
+--- a/thrust/system/cuda/detail/scan_by_key.h
++++ b/thrust/system/cuda/detail/scan_by_key.h
+@@ -340,7 +340,7 @@ namespace __scan_by_key {
+                            size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+       {
+         // Zip values and segment_flags
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           // Set segment_flags for first out-of-bounds item, zero for others
+@@ -358,7 +358,7 @@ namespace __scan_by_key {
+           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+       {
+         // Zip values and segment_flags
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           values[ITEM] = scan_items[ITEM].value;
+@@ -616,7 +616,7 @@ namespace __scan_by_key {
+     operator()(T (&items)[ITEMS_PER_THREAD],
+                Size (&flags)[ITEMS_PER_THREAD])
+     {
+-#pragma unroll
++#pragma unroll 1
+       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+       {
+         items[ITEM] = flags[ITEM] ? init : scan_op(init, items[ITEM]);
+
+diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h
+index ba66f6d8..bb3bff56 100644
+--- a/thrust/system/cuda/detail/reduce_by_key.h
++++ b/thrust/system/cuda/detail/reduce_by_key.h
+@@ -369,7 +369,7 @@ namespace __reduce_by_key {
+                            size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+       {
+         // Zip values and segment_flags
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           // Set segment_flags for first out-of-bounds item, zero for others
+@@ -389,7 +389,7 @@ namespace __reduce_by_key {
+           key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD])
+       {
+         // Zip values and segment_flags
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           scatter_items[ITEM].key   = keys[ITEM];
+@@ -410,7 +410,7 @@ namespace __reduce_by_key {
+           size_type (&segment_indices)[ITEMS_PER_THREAD])
+       {
+         // Scatter flagged keys and values
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           if (segment_flags[ITEM])
+@@ -440,7 +440,7 @@ namespace __reduce_by_key {
+         sync_threadblock();
+ 
+         // Compact and scatter keys
+-#pragma unroll
++#pragma unroll 1
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+         {
+           if (segment_flags[ITEM])
+
 diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
 index 1ffeef0..5e80800 100644
 --- a/thrust/system/cuda/detail/sort.h

From e6885d4767449c02d3a51344767265779620c24d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 12:07:24 -0600
Subject: [PATCH 25/48] Revert "Attempt to patch thrust"

This reverts commit cb6fb5f2dfcc0c0378f051039148588d0181c1e5.
---
 cpp/cmake/thrust.patch | 73 ------------------------------------------
 1 file changed, 73 deletions(-)

diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch
index 772b63d534d..2f9201d8ab4 100644
--- a/cpp/cmake/thrust.patch
+++ b/cpp/cmake/thrust.patch
@@ -1,76 +1,3 @@
-diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h
-index c9178628..b287b0d4 100644
---- a/thrust/system/cuda/detail/scan_by_key.h
-+++ b/thrust/system/cuda/detail/scan_by_key.h
-@@ -340,7 +340,7 @@ namespace __scan_by_key {
-                            size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
-       {
-         // Zip values and segment_flags
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           // Set segment_flags for first out-of-bounds item, zero for others
-@@ -358,7 +358,7 @@ namespace __scan_by_key {
-           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
-       {
-         // Zip values and segment_flags
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           values[ITEM] = scan_items[ITEM].value;
-@@ -616,7 +616,7 @@ namespace __scan_by_key {
-     operator()(T (&items)[ITEMS_PER_THREAD],
-                Size (&flags)[ITEMS_PER_THREAD])
-     {
--#pragma unroll
-+#pragma unroll 1
-       for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-       {
-         items[ITEM] = flags[ITEM] ? init : scan_op(init, items[ITEM]);
-
-diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h
-index ba66f6d8..bb3bff56 100644
---- a/thrust/system/cuda/detail/reduce_by_key.h
-+++ b/thrust/system/cuda/detail/reduce_by_key.h
-@@ -369,7 +369,7 @@ namespace __reduce_by_key {
-                            size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
-       {
-         // Zip values and segment_flags
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           // Set segment_flags for first out-of-bounds item, zero for others
-@@ -389,7 +389,7 @@ namespace __reduce_by_key {
-           key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD])
-       {
-         // Zip values and segment_flags
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           scatter_items[ITEM].key   = keys[ITEM];
-@@ -410,7 +410,7 @@ namespace __reduce_by_key {
-           size_type (&segment_indices)[ITEMS_PER_THREAD])
-       {
-         // Scatter flagged keys and values
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           if (segment_flags[ITEM])
-@@ -440,7 +440,7 @@ namespace __reduce_by_key {
-         sync_threadblock();
- 
-         // Compact and scatter keys
--#pragma unroll
-+#pragma unroll 1
-         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-         {
-           if (segment_flags[ITEM])
-
 diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
 index 1ffeef0..5e80800 100644
 --- a/thrust/system/cuda/detail/sort.h

From d4d46447583b860a9f181d850240bfea7f1bece4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 13:29:54 -0600
Subject: [PATCH 26/48] Add declaration for new internal APIs

---
 cpp/src/groupby/sort/group_reductions.hpp | 20 ++++++++++++++++++++
 cpp/src/groupby/sort/group_scan.hpp       | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 75708c7b01c..268c088fe87 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -173,6 +173,26 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Internal API to calculate group-wise indices of minimum/maximum values, specialized for
+ * STRUCT type.
+ *
+ * @param K The aggregation kind, must be `aggregation::ARGMIN` or `aggregation::ARGMAX`
+ * @param values Grouped values to get minimum value's index from
+ * @param num_groups Number of groups
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param key_sort_order Indices indicating sort order of groupby keys
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
+                                               column_view const& values,
+                                               size_type num_groups,
+                                               cudf::device_span<size_type const> group_labels,
+                                               column_view const& key_sort_order,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Internal API to calculate number of non-null values in each group of
  *  @p values
diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp
index 82ef0e25380..2be779b6f3b 100644
--- a/cpp/src/groupby/sort/group_scan.hpp
+++ b/cpp/src/groupby/sort/group_scan.hpp
@@ -72,6 +72,24 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Internal API to calculate groupwise cumulative minimum/maximum value, specialized for
+ * STRUCT type.
+ *
+ * @param K The aggregation kind, must be `aggregation::MIN` or `aggregation::MAX`
+ * @param values Grouped values to get maximum from
+ * @param num_groups Number of groups
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<column> minmax_scan_struct(aggregation::Kind K,
+                                           column_view const& values,
+                                           size_type num_groups,
+                                           device_span<size_type const> group_labels,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Internal API to calculate cumulative number of values in each group
  *

From ad7998fcdc4ee96d8ecc43cdb07f0e77071a4319 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 13:30:18 -0600
Subject: [PATCH 27/48] Call the specialized functions for struct type values

---
 cpp/src/groupby/sort/aggregate.cpp | 48 ++++++++++++++++++++----------
 cpp/src/groupby/sort/scan.cpp      | 34 ++++++++++++++-------
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 83c6c1bca57..7bce5a3e0fa 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -117,14 +117,22 @@ void aggregate_result_functor::operator()<aggregation::ARGMAX>(aggregation const
 {
   if (cache.has_result(values, agg)) return;
 
-  cache.add_result(values,
-                   agg,
-                   detail::group_argmax(get_grouped_values(),
-                                        helper.num_groups(stream),
-                                        helper.group_labels(stream),
-                                        helper.key_sort_order(stream),
-                                        stream,
-                                        mr));
+  auto result = values.type().id() == type_id::STRUCT
+                  ? detail::group_argminmax_struct(aggregation::ARGMAX,
+                                                   get_grouped_values(),
+                                                   helper.num_groups(stream),
+                                                   helper.group_labels(stream),
+                                                   helper.key_sort_order(stream),
+                                                   stream,
+                                                   mr)
+                  : detail::group_argmax(get_grouped_values(),
+                                         helper.num_groups(stream),
+                                         helper.group_labels(stream),
+                                         helper.key_sort_order(stream),
+                                         stream,
+                                         mr);
+
+  cache.add_result(values, agg, std::move(result));
 };
 
 template <>
@@ -132,14 +140,22 @@ void aggregate_result_functor::operator()<aggregation::ARGMIN>(aggregation const
 {
   if (cache.has_result(values, agg)) return;
 
-  cache.add_result(values,
-                   agg,
-                   detail::group_argmin(get_grouped_values(),
-                                        helper.num_groups(stream),
-                                        helper.group_labels(stream),
-                                        helper.key_sort_order(stream),
-                                        stream,
-                                        mr));
+  auto result = values.type().id() == type_id::STRUCT
+                  ? detail::group_argminmax_struct(aggregation::ARGMIN,
+                                                   get_grouped_values(),
+                                                   helper.num_groups(stream),
+                                                   helper.group_labels(stream),
+                                                   helper.key_sort_order(stream),
+                                                   stream,
+                                                   mr)
+                  : detail::group_argmin(get_grouped_values(),
+                                         helper.num_groups(stream),
+                                         helper.group_labels(stream),
+                                         helper.key_sort_order(stream),
+                                         stream,
+                                         mr);
+
+  cache.add_result(values, agg, std::move(result));
 };
 
 template <>
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index b22f82ce7e4..eed6bd52faf 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -81,11 +81,18 @@ void scan_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
 {
   if (cache.has_result(values, agg)) return;
 
-  cache.add_result(
-    values,
-    agg,
-    detail::min_scan(
-      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
+  auto result =
+    values.type().id() == type_id::STRUCT
+      ? detail::minmax_scan_struct(aggregation::MIN,
+                                   get_grouped_values(),
+                                   helper.num_groups(stream),
+                                   helper.group_labels(stream),
+                                   stream,
+                                   mr)
+      : detail::min_scan(
+          get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
+
+  cache.add_result(values, agg, std::move(result));
 }
 
 template <>
@@ -93,11 +100,18 @@ void scan_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
 {
   if (cache.has_result(values, agg)) return;
 
-  cache.add_result(
-    values,
-    agg,
-    detail::max_scan(
-      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
+  auto result =
+    values.type().id() == type_id::STRUCT
+      ? detail::minmax_scan_struct(aggregation::MAX,
+                                   get_grouped_values(),
+                                   helper.num_groups(stream),
+                                   helper.group_labels(stream),
+                                   stream,
+                                   mr)
+      : detail::max_scan(
+          get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
+
+  cache.add_result(values, agg, std::move(result));
 }
 
 template <>

From 0c2b0b4f03ad14e47e22dc2c0788e65918bd2c33 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 13:30:28 -0600
Subject: [PATCH 28/48] Add new .cu files

---
 cpp/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e06d02b0bcb..c70d1cbe509 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -223,6 +223,7 @@ add_library(cudf
     src/groupby/sort/aggregate.cpp
     src/groupby/sort/group_argmax.cu
     src/groupby/sort/group_argmin.cu
+    src/groupby/sort/group_argminmax_struct.cu
     src/groupby/sort/group_collect.cu
     src/groupby/sort/group_correlation.cu
     src/groupby/sort/group_count.cu
@@ -241,6 +242,7 @@ add_library(cudf
     src/groupby/sort/group_count_scan.cu
     src/groupby/sort/group_max_scan.cu
     src/groupby/sort/group_min_scan.cu
+    src/groupby/sort/group_minmax_scan_struct.cu
     src/groupby/sort/group_rank_scan.cu
     src/groupby/sort/group_replace_nulls.cu
     src/groupby/sort/group_sum_scan.cu

From f063e266988541a220b2d2e7615c4a7a30a13766 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 13:57:03 -0600
Subject: [PATCH 29/48] Remove `struct_view` specialization

---
 cpp/src/groupby/sort/group_scan_util.cuh      | 87 +------------------
 .../sort/group_single_pass_reduction_util.cuh | 77 +---------------
 2 files changed, 2 insertions(+), 162 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 013ea924cce..f0d27830bfb 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -74,8 +74,7 @@ static constexpr bool is_group_scan_supported()
   if (K == aggregation::SUM)
     return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
   else if (K == aggregation::MIN or K == aggregation::MAX)
-    return not cudf::is_dictionary<T>() and
-           (is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>);
+    return not cudf::is_dictionary<T>() and is_relationally_comparable<T, T>();
   else
     return false;
 }
@@ -180,90 +179,6 @@ struct group_scan_functor<K,
   }
 };
 
-template <aggregation::Kind K>
-struct group_scan_functor<K,
-                          cudf::struct_view,
-                          std::enable_if_t<is_group_scan_supported<K, cudf::struct_view>()>> {
-  static std::unique_ptr<column> invoke(column_view const& values,
-                                        size_type num_groups,
-                                        cudf::device_span<cudf::size_type const> group_labels,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
-  {
-    if (values.is_empty()) { return cudf::empty_like(values); }
-
-    // When finding MIN, we need to consider nulls as larger than non-null elements.
-    // Thing is opposite when finding MAX.
-    auto const null_precedence  = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE;
-    auto const flattened_values = structs::detail::flatten_nested_columns(
-      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
-    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
-    auto const flattened_null_precedences =
-      (K == aggregation::MIN)
-        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
-        : rmm::device_uvector<null_order>(0, stream);
-
-    // Create a gather map contaning indices of the prefix min/max elements.
-    auto gather_map      = rmm::device_uvector<size_type>(values.size(), stream);
-    auto const map_begin = gather_map.begin();
-
-    // Perform segmented scan.
-    auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
-      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                    group_labels.begin(),
-                                    group_labels.end(),
-                                    inp_iter,
-                                    out_iter,
-                                    thrust::equal_to<size_type>{},
-                                    binop);
-    };
-
-    // Find the indices of the prefix min/max elements within each group.
-    auto const count_iter = thrust::make_counting_iterator<size_type>(0);
-    if (values.has_nulls()) {
-      auto const binop = row_arg_minmax_fn<true>(values.size(),
-                                                 *d_flattened_values_ptr,
-                                                 flattened_null_precedences.data(),
-                                                 K == aggregation::MIN);
-      do_scan(count_iter, map_begin, binop);
-    } else {
-      auto const binop = row_arg_minmax_fn<false>(values.size(),
-                                                  *d_flattened_values_ptr,
-                                                  flattened_null_precedences.data(),
-                                                  K == aggregation::MIN);
-      do_scan(count_iter, map_begin, binop);
-    }
-
-    auto gather_map_view =
-      column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
-
-    // Gather the children elements of the prefix min/max struct elements first.
-    auto scanned_children =
-      cudf::detail::gather(
-        table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
-        gather_map_view,
-        cudf::out_of_bounds_policy::DONT_CHECK,
-        cudf::detail::negative_index_policy::NOT_ALLOWED,
-        stream,
-        mr)
-        ->release();
-
-    // After gathering the children elements, we need to push down nulls from the root structs
-    // column to them.
-    if (values.has_nulls()) {
-      for (std::unique_ptr<column>& child : scanned_children) {
-        structs::detail::superimpose_parent_nulls(
-          values.null_mask(), values.null_count(), *child, stream, mr);
-      }
-    }
-
-    return make_structs_column(values.size(),
-                               std::move(scanned_children),
-                               values.null_count(),
-                               cudf::detail::copy_bitmask(values, stream, mr));
-  }
-};
-
 }  // namespace detail
 }  // namespace groupby
 }  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 73ca658ce81..8d6abaf3d69 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -155,8 +155,7 @@ static constexpr bool is_group_redution_supported()
     case aggregation::MIN:
     case aggregation::MAX: return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
     case aggregation::ARGMIN:
-    case aggregation::ARGMAX:
-      return is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>;
+    case aggregation::ARGMAX: return is_relationally_comparable<T, T>();
     default: return false;
   }
 }
@@ -225,80 +224,6 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_redution_supporte
   }
 };
 
-template <aggregation::Kind K>
-struct group_reduction_functor<
-  K,
-  cudf::struct_view,
-  std::enable_if_t<is_group_redution_supported<K, cudf::struct_view>()>> {
-  static std::unique_ptr<column> invoke(column_view const& values,
-                                        size_type num_groups,
-                                        cudf::device_span<cudf::size_type const> group_labels,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
-  {
-    // This is be expected to be size_type.
-    using ResultType = cudf::detail::target_type_t<cudf::struct_view, K>;
-
-    auto result = make_fixed_width_column(
-      data_type{type_to_id<ResultType>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
-
-    if (values.is_empty()) { return result; }
-
-    // When finding ARGMIN, we need to consider nulls as larger than non-null elements.
-    // Thing is opposite for ARGMAX.
-    auto const null_precedence =
-      (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE;
-    auto const flattened_values = structs::detail::flatten_nested_columns(
-      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
-    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
-    auto const flattened_null_precedences =
-      (K == aggregation::ARGMIN)
-        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
-        : rmm::device_uvector<null_order>(0, stream);
-
-    // Perform segmented reduction to find ARGMIN/ARGMAX.
-    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
-      thrust::reduce_by_key(rmm::exec_policy(stream),
-                            group_labels.data(),
-                            group_labels.data() + group_labels.size(),
-                            inp_iter,
-                            thrust::make_discard_iterator(),
-                            out_iter,
-                            thrust::equal_to<size_type>{},
-                            binop);
-    };
-
-    auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
-    auto const result_begin = result->mutable_view().template begin<ResultType>();
-    if (values.has_nulls()) {
-      auto const binop = row_arg_minmax_fn<true>(values.size(),
-                                                 *d_flattened_values_ptr,
-                                                 flattened_null_precedences.data(),
-                                                 K == aggregation::ARGMIN);
-      do_reduction(count_iter, result_begin, binop);
-
-      // Generate bitmask for the output by segmented reduction of the input bitmask.
-      auto const d_values_ptr = column_device_view::create(values, stream);
-      auto validity           = rmm::device_uvector<bool>(num_groups, stream);
-      do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
-                   validity.begin(),
-                   thrust::logical_or<bool>{});
-
-      auto [null_mask, null_count] = cudf::detail::valid_if(
-        validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
-      result->set_null_mask(std::move(null_mask), null_count);
-    } else {
-      auto const binop = row_arg_minmax_fn<false>(values.size(),
-                                                  *d_flattened_values_ptr,
-                                                  flattened_null_precedences.data(),
-                                                  K == aggregation::ARGMIN);
-      do_reduction(count_iter, result_begin, binop);
-    }
-
-    return result;
-  }
-};
-
 }  // namespace detail
 }  // namespace groupby
 }  // namespace cudf

From 9456ea332910bf826e701507f19bafb9ef6d31c9 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 13:59:15 -0600
Subject: [PATCH 30/48] Implement `struct_view` specialization

---
 .../groupby/sort/group_argminmax_struct.cu    | 110 ++++++++++++++++
 .../groupby/sort/group_minmax_scan_struct.cu  | 122 ++++++++++++++++++
 2 files changed, 232 insertions(+)
 create mode 100644 cpp/src/groupby/sort/group_argminmax_struct.cu
 create mode 100644 cpp/src/groupby/sort/group_minmax_scan_struct.cu

diff --git a/cpp/src/groupby/sort/group_argminmax_struct.cu b/cpp/src/groupby/sort/group_argminmax_struct.cu
new file mode 100644
index 00000000000..145f45c5890
--- /dev/null
+++ b/cpp/src/groupby/sort/group_argminmax_struct.cu
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/sort/group_util.cuh>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/valid_if.cuh>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/reduce.h>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
+                                               column_view const& values,
+                                               size_type num_groups,
+                                               cudf::device_span<size_type const> group_labels,
+                                               column_view const& key_sort_order,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(K == aggregation::ARGMIN || aggregation::ARGMAX,
+               "Only groupby ARGMIN/ARGMAX are supported for STRUCT type.");
+
+  auto result = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
+
+  if (values.is_empty()) { return result; }
+
+  // When finding ARGMIN, we need to consider nulls as larger than non-null elements.
+  // Thing is opposite for ARGMAX.
+  auto const null_precedence  = (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE;
+  auto const flattened_values = structs::detail::flatten_nested_columns(
+    table_view{{values}}, {}, std::vector<null_order>{null_precedence});
+  auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+  auto const flattened_null_precedences =
+    (K == aggregation::ARGMIN)
+      ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+      : rmm::device_uvector<null_order>(0, stream);
+
+  // Perform segmented reduction to find ARGMIN/ARGMAX.
+  auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
+    thrust::reduce_by_key(rmm::exec_policy(stream),
+                          group_labels.data(),
+                          group_labels.data() + group_labels.size(),
+                          inp_iter,
+                          thrust::make_discard_iterator(),
+                          out_iter,
+                          thrust::equal_to<size_type>{},
+                          binop);
+  };
+
+  auto const count_iter   = thrust::make_counting_iterator<size_type>(0);
+  auto const result_begin = result->mutable_view().template begin<size_type>();
+  if (values.has_nulls()) {
+    auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                               *d_flattened_values_ptr,
+                                               flattened_null_precedences.data(),
+                                               K == aggregation::ARGMIN);
+    do_reduction(count_iter, result_begin, binop);
+
+    // Generate bitmask for the output by segmented reduction of the input bitmask.
+    auto const d_values_ptr = column_device_view::create(values, stream);
+    auto validity           = rmm::device_uvector<bool>(num_groups, stream);
+    do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
+                 validity.begin(),
+                 thrust::logical_or<bool>{});
+
+    auto [null_mask, null_count] = cudf::detail::valid_if(
+      validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
+    result->set_null_mask(std::move(null_mask), null_count);
+  } else {
+    auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                *d_flattened_values_ptr,
+                                                flattened_null_precedences.data(),
+                                                K == aggregation::ARGMIN);
+    do_reduction(count_iter, result_begin, binop);
+  }
+
+  return result;
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_minmax_scan_struct.cu b/cpp/src/groupby/sort/group_minmax_scan_struct.cu
new file mode 100644
index 00000000000..e92fce36f52
--- /dev/null
+++ b/cpp/src/groupby/sort/group_minmax_scan_struct.cu
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/sort/group_util.cuh>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/functional.h>
+#include <thrust/scan.h>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> minmax_scan_struct(aggregation::Kind K,
+                                           column_view const& values,
+                                           size_type num_groups,
+                                           device_span<size_type const> group_labels,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+{
+  if (values.is_empty()) { return cudf::empty_like(values); }
+
+  // When finding MIN, we need to consider nulls as larger than non-null elements.
+  // Thing is opposite when finding MAX.
+  auto const null_precedence  = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE;
+  auto const flattened_values = structs::detail::flatten_nested_columns(
+    table_view{{values}}, {}, std::vector<null_order>{null_precedence});
+  auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+  auto const flattened_null_precedences =
+    (K == aggregation::MIN)
+      ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+      : rmm::device_uvector<null_order>(0, stream);
+
+  // Create a gather map contaning indices of the prefix min/max elements.
+  auto gather_map      = rmm::device_uvector<size_type>(values.size(), stream);
+  auto const map_begin = gather_map.begin();
+
+  // Perform segmented scan.
+  auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
+    thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                  group_labels.begin(),
+                                  group_labels.end(),
+                                  inp_iter,
+                                  out_iter,
+                                  thrust::equal_to<size_type>{},
+                                  binop);
+  };
+
+  // Find the indices of the prefix min/max elements within each group.
+  auto const count_iter = thrust::make_counting_iterator<size_type>(0);
+  if (values.has_nulls()) {
+    auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                               *d_flattened_values_ptr,
+                                               flattened_null_precedences.data(),
+                                               K == aggregation::MIN);
+    do_scan(count_iter, map_begin, binop);
+  } else {
+    auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                *d_flattened_values_ptr,
+                                                flattened_null_precedences.data(),
+                                                K == aggregation::MIN);
+    do_scan(count_iter, map_begin, binop);
+  }
+
+  auto gather_map_view =
+    column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
+
+  // Gather the children elements of the prefix min/max struct elements first.
+  auto scanned_children =
+    cudf::detail::gather(
+      table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
+      gather_map_view,
+      cudf::out_of_bounds_policy::DONT_CHECK,
+      cudf::detail::negative_index_policy::NOT_ALLOWED,
+      stream,
+      mr)
+      ->release();
+
+  // After gathering the children elements, we need to push down nulls from the root structs
+  // column to them.
+  if (values.has_nulls()) {
+    for (std::unique_ptr<column>& child : scanned_children) {
+      structs::detail::superimpose_parent_nulls(
+        values.null_mask(), values.null_count(), *child, stream, mr);
+    }
+  }
+
+  return make_structs_column(values.size(),
+                             std::move(scanned_children),
+                             values.null_count(),
+                             cudf::detail::copy_bitmask(values, stream, mr));
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf

From 43be509575703c235a39372a3dbf2f0cd05ad102 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 14:07:10 -0600
Subject: [PATCH 31/48] Fix output order

---
 cpp/src/groupby/sort/group_argminmax_struct.cu | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/src/groupby/sort/group_argminmax_struct.cu b/cpp/src/groupby/sort/group_argminmax_struct.cu
index 145f45c5890..4e97bd02804 100644
--- a/cpp/src/groupby/sort/group_argminmax_struct.cu
+++ b/cpp/src/groupby/sort/group_argminmax_struct.cu
@@ -29,6 +29,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/functional.h>
+#include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/reduce.h>
@@ -102,6 +103,14 @@ std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
     do_reduction(count_iter, result_begin, binop);
   }
 
+  // result now stores the indices of minimum elements in the sorted values.
+  // We need the indices of minimum elements in the original unsorted values.
+  thrust::gather(rmm::exec_policy(stream),
+                 result_begin,
+                 result_begin + num_groups,
+                 key_sort_order.template begin<size_type>(),
+                 result_begin);
+
   return result;
 }
 

From 7cee90c2aef86def92c330848142f7c4ddccb322 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 14:13:29 -0600
Subject: [PATCH 32/48] Fix EXPECT conditions

---
 cpp/src/groupby/sort/group_argminmax_struct.cu   | 2 +-
 cpp/src/groupby/sort/group_minmax_scan_struct.cu | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_argminmax_struct.cu b/cpp/src/groupby/sort/group_argminmax_struct.cu
index 4e97bd02804..96106770544 100644
--- a/cpp/src/groupby/sort/group_argminmax_struct.cu
+++ b/cpp/src/groupby/sort/group_argminmax_struct.cu
@@ -45,7 +45,7 @@ std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
                                                rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(K == aggregation::ARGMIN || aggregation::ARGMAX,
+  CUDF_EXPECTS(K == aggregation::ARGMIN || K == aggregation::ARGMAX,
                "Only groupby ARGMIN/ARGMAX are supported for STRUCT type.");
 
   auto result = make_fixed_width_column(
diff --git a/cpp/src/groupby/sort/group_minmax_scan_struct.cu b/cpp/src/groupby/sort/group_minmax_scan_struct.cu
index e92fce36f52..2d3c26d19f7 100644
--- a/cpp/src/groupby/sort/group_minmax_scan_struct.cu
+++ b/cpp/src/groupby/sort/group_minmax_scan_struct.cu
@@ -44,6 +44,9 @@ std::unique_ptr<column> minmax_scan_struct(aggregation::Kind K,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(K == aggregation::MIN || K == aggregation::MAX,
+               "Only groupby MIN/MAX scan are supported for STRUCT type.");
+
   if (values.is_empty()) { return cudf::empty_like(values); }
 
   // When finding MIN, we need to consider nulls as larger than non-null elements.

From fee55e33eee9434dfc42f5ef80d4b916038e08c3 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 4 Nov 2021 21:43:46 -0600
Subject: [PATCH 33/48] Refactor `row_operators.cuh`

---
 cpp/include/cudf/table/row_operators.cuh | 53 +++++++++++++++++-------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index c719c564a87..566c6fdbea2 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -268,8 +268,8 @@ class element_relational_comparator {
    * @param null_precedence Indicates how null values are ordered with other
    * values
    */
-  __host__ __device__ element_relational_comparator(column_device_view lhs,
-                                                    column_device_view rhs,
+  __host__ __device__ element_relational_comparator(column_device_view const& lhs,
+                                                    column_device_view const& rhs,
                                                     null_order null_precedence)
     : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence}
   {
@@ -315,6 +315,28 @@ class element_relational_comparator {
   null_order null_precedence;
 };
 
+/**
+ * @brief Performs a relational comparison between two elements in two columns at the
+ * corresponding given indices.
+ *
+ * This function is explicitly prevented from inlining because it is very heavy-weight due
+ * type-dispatching.
+ *
+ * @tparam has_nulls Indicates the potential for null values in either column.
+ */
+template <bool has_nulls>
+__attribute__((noinline)) __device__ weak_ordering
+compare_column_elements(data_type type,
+                        column_device_view const& lhs,
+                        column_device_view const& rhs,
+                        null_order null_precedence,
+                        size_type lhs_idx,
+                        size_type rhs_idx)
+{
+  auto const comp = element_relational_comparator<has_nulls>{lhs, rhs, null_precedence};
+  return cudf::type_dispatcher(type, comp, lhs_idx, rhs_idx);
+}
+
 /**
  * @brief Computes whether one row is lexicographically *less* than another row.
  *
@@ -350,8 +372,8 @@ class row_lexicographic_comparator {
    * it is nullptr, then null precedence would be `null_order::BEFORE` for all
    * columns.
    */
-  row_lexicographic_comparator(table_device_view lhs,
-                               table_device_view rhs,
+  row_lexicographic_comparator(table_device_view const& lhs,
+                               table_device_view const& rhs,
                                order const* column_order         = nullptr,
                                null_order const* null_precedence = nullptr)
     : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence}
@@ -373,20 +395,19 @@ class row_lexicographic_comparator {
   __device__ bool operator()(size_type lhs_index, size_type rhs_index) const noexcept
   {
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
-      bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);
-
-      weak_ordering state{weak_ordering::EQUIVALENT};
-      null_order null_precedence =
+      auto const ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);
+      auto const null_precedence =
         _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i];
 
-      auto comparator =
-        element_relational_comparator<has_nulls>{_lhs.column(i), _rhs.column(i), null_precedence};
-
-      state = cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);
-
-      if (state == weak_ordering::EQUIVALENT) { continue; }
-
-      return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
+      auto const state = compare_column_elements<has_nulls>(_lhs.column(i).type(),
+                                                            _lhs.column(i),
+                                                            _rhs.column(i),
+                                                            null_precedence,
+                                                            lhs_index,
+                                                            rhs_index);
+      if (state != weak_ordering::EQUIVALENT) {
+        return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
+      }
     }
     return false;
   }

From 2fde89a49ffa0e52503178091e46e7b06899ea10 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 14:56:48 -0600
Subject: [PATCH 34/48] Fix function name typo

---
 cpp/src/groupby/sort/group_single_pass_reduction_util.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 8d6abaf3d69..00d3605ffdf 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -146,7 +146,7 @@ struct group_reduction_dispatcher {
  * @brief Check if the given aggregation K with data type T is supported in groupby reduction.
  */
 template <aggregation::Kind K, typename T>
-static constexpr bool is_group_redution_supported()
+static constexpr bool is_group_reduction_supported()
 {
   switch (K) {
     case aggregation::SUM:
@@ -161,7 +161,7 @@ static constexpr bool is_group_redution_supported()
 }
 
 template <aggregation::Kind K, typename T>
-struct group_reduction_functor<K, T, std::enable_if_t<is_group_redution_supported<K, T>()>> {
+struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_supported<K, T>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,

From ac9c6035659067f1a988d485c4603ecf7664bbe1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 14:58:40 -0600
Subject: [PATCH 35/48] Remove redundant header

---
 cpp/src/groupby/sort/group_scan_util.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index f0d27830bfb..b8988c39077 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <groupby/sort/group_util.cuh>
-
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>

From f5d27ae3019bceac1930ef32e1d7cd61a319b1df Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 14:58:45 -0600
Subject: [PATCH 36/48] Revert "Refactor `row_operators.cuh`"

This reverts commit fee55e33eee9434dfc42f5ef80d4b916038e08c3.
---
 cpp/include/cudf/table/row_operators.cuh | 53 +++++++-----------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 566c6fdbea2..c719c564a87 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -268,8 +268,8 @@ class element_relational_comparator {
    * @param null_precedence Indicates how null values are ordered with other
    * values
    */
-  __host__ __device__ element_relational_comparator(column_device_view const& lhs,
-                                                    column_device_view const& rhs,
+  __host__ __device__ element_relational_comparator(column_device_view lhs,
+                                                    column_device_view rhs,
                                                     null_order null_precedence)
     : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence}
   {
@@ -315,28 +315,6 @@ class element_relational_comparator {
   null_order null_precedence;
 };
 
-/**
- * @brief Performs a relational comparison between two elements in two columns at the
- * corresponding given indices.
- *
- * This function is explicitly prevented from inlining because it is very heavy-weight due
- * type-dispatching.
- *
- * @tparam has_nulls Indicates the potential for null values in either column.
- */
-template <bool has_nulls>
-__attribute__((noinline)) __device__ weak_ordering
-compare_column_elements(data_type type,
-                        column_device_view const& lhs,
-                        column_device_view const& rhs,
-                        null_order null_precedence,
-                        size_type lhs_idx,
-                        size_type rhs_idx)
-{
-  auto const comp = element_relational_comparator<has_nulls>{lhs, rhs, null_precedence};
-  return cudf::type_dispatcher(type, comp, lhs_idx, rhs_idx);
-}
-
 /**
  * @brief Computes whether one row is lexicographically *less* than another row.
  *
@@ -372,8 +350,8 @@ class row_lexicographic_comparator {
    * it is nullptr, then null precedence would be `null_order::BEFORE` for all
    * columns.
    */
-  row_lexicographic_comparator(table_device_view const& lhs,
-                               table_device_view const& rhs,
+  row_lexicographic_comparator(table_device_view lhs,
+                               table_device_view rhs,
                                order const* column_order         = nullptr,
                                null_order const* null_precedence = nullptr)
     : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence}
@@ -395,19 +373,20 @@ class row_lexicographic_comparator {
   __device__ bool operator()(size_type lhs_index, size_type rhs_index) const noexcept
   {
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
-      auto const ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);
-      auto const null_precedence =
+      bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);
+
+      weak_ordering state{weak_ordering::EQUIVALENT};
+      null_order null_precedence =
         _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i];
 
-      auto const state = compare_column_elements<has_nulls>(_lhs.column(i).type(),
-                                                            _lhs.column(i),
-                                                            _rhs.column(i),
-                                                            null_precedence,
-                                                            lhs_index,
-                                                            rhs_index);
-      if (state != weak_ordering::EQUIVALENT) {
-        return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
-      }
+      auto comparator =
+        element_relational_comparator<has_nulls>{_lhs.column(i), _rhs.column(i), null_precedence};
+
+      state = cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);
+
+      if (state == weak_ordering::EQUIVALENT) { continue; }
+
+      return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
     }
     return false;
   }

From 7a7c7060a9567adc2578ec7ba8bcb44685e1a17e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:08:26 -0600
Subject: [PATCH 37/48] Prevent functor code from inlining

---
 cpp/src/groupby/sort/group_util.cuh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh
index 98fcb199fb7..31ff29ed4c3 100644
--- a/cpp/src/groupby/sort/group_util.cuh
+++ b/cpp/src/groupby/sort/group_util.cuh
@@ -42,7 +42,11 @@ struct row_arg_minmax_fn {
   {
   }
 
-  CUDA_DEVICE_CALLABLE auto operator()(size_type lhs_idx, size_type rhs_idx) const
+  // This function is explicitly prevented from inlining, because it calls to
+  // `row_lexicographic_comparator::operator()` which is inlined and very heavy-weight. As a result,
+  // instantiating this functor will result in huge code, and objects of this functor used with
+  // `thrust::reduce_by_key` or `thrust::scan_by_key` will result in significant compile time.
+  __attribute__((noinline)) __device__ auto operator()(size_type lhs_idx, size_type rhs_idx) const
   {
     // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
     // github.com/NVIDIA/thrust/issues/1525

From 2a6a106c426f477c657ee5e56d321ece3d1e2670 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:44:53 -0600
Subject: [PATCH 38/48] Revert "Remove redundant header"

This reverts commit ac9c6035659067f1a988d485c4603ecf7664bbe1.
---
 cpp/src/groupby/sort/group_scan_util.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index b8988c39077..f0d27830bfb 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <groupby/sort/group_util.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>

From 88ef471e4c9ab87ba60536eb2421d7c2f09c82c4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:45:57 -0600
Subject: [PATCH 39/48] Revert "Remove `struct_view` specialization"

This reverts commit f063e266988541a220b2d2e7615c4a7a30a13766.
---
 cpp/src/groupby/sort/group_scan_util.cuh      | 87 ++++++++++++++++++-
 .../sort/group_single_pass_reduction_util.cuh | 77 +++++++++++++++-
 2 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index f0d27830bfb..013ea924cce 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -74,7 +74,8 @@ static constexpr bool is_group_scan_supported()
   if (K == aggregation::SUM)
     return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
   else if (K == aggregation::MIN or K == aggregation::MAX)
-    return not cudf::is_dictionary<T>() and is_relationally_comparable<T, T>();
+    return not cudf::is_dictionary<T>() and
+           (is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>);
   else
     return false;
 }
@@ -179,6 +180,90 @@ struct group_scan_functor<K,
   }
 };
 
+template <aggregation::Kind K>
+struct group_scan_functor<K,
+                          cudf::struct_view,
+                          std::enable_if_t<is_group_scan_supported<K, cudf::struct_view>()>> {
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+  {
+    if (values.is_empty()) { return cudf::empty_like(values); }
+
+    // When finding MIN, we need to consider nulls as larger than non-null elements.
+    // Thing is opposite when finding MAX.
+    auto const null_precedence  = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE;
+    auto const flattened_values = structs::detail::flatten_nested_columns(
+      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
+    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+    auto const flattened_null_precedences =
+      (K == aggregation::MIN)
+        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+        : rmm::device_uvector<null_order>(0, stream);
+
+    // Create a gather map contaning indices of the prefix min/max elements.
+    auto gather_map      = rmm::device_uvector<size_type>(values.size(), stream);
+    auto const map_begin = gather_map.begin();
+
+    // Perform segmented scan.
+    auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    inp_iter,
+                                    out_iter,
+                                    thrust::equal_to<size_type>{},
+                                    binop);
+    };
+
+    // Find the indices of the prefix min/max elements within each group.
+    auto const count_iter = thrust::make_counting_iterator<size_type>(0);
+    if (values.has_nulls()) {
+      auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                                 *d_flattened_values_ptr,
+                                                 flattened_null_precedences.data(),
+                                                 K == aggregation::MIN);
+      do_scan(count_iter, map_begin, binop);
+    } else {
+      auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                  *d_flattened_values_ptr,
+                                                  flattened_null_precedences.data(),
+                                                  K == aggregation::MIN);
+      do_scan(count_iter, map_begin, binop);
+    }
+
+    auto gather_map_view =
+      column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
+
+    // Gather the children elements of the prefix min/max struct elements first.
+    auto scanned_children =
+      cudf::detail::gather(
+        table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
+        gather_map_view,
+        cudf::out_of_bounds_policy::DONT_CHECK,
+        cudf::detail::negative_index_policy::NOT_ALLOWED,
+        stream,
+        mr)
+        ->release();
+
+    // After gathering the children elements, we need to push down nulls from the root structs
+    // column to them.
+    if (values.has_nulls()) {
+      for (std::unique_ptr<column>& child : scanned_children) {
+        structs::detail::superimpose_parent_nulls(
+          values.null_mask(), values.null_count(), *child, stream, mr);
+      }
+    }
+
+    return make_structs_column(values.size(),
+                               std::move(scanned_children),
+                               values.null_count(),
+                               cudf::detail::copy_bitmask(values, stream, mr));
+  }
+};
+
 }  // namespace detail
 }  // namespace groupby
 }  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 00d3605ffdf..0c43acdeb4c 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -155,7 +155,8 @@ static constexpr bool is_group_reduction_supported()
     case aggregation::MIN:
     case aggregation::MAX: return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
     case aggregation::ARGMIN:
-    case aggregation::ARGMAX: return is_relationally_comparable<T, T>();
+    case aggregation::ARGMAX:
+      return is_relationally_comparable<T, T>() or std::is_same_v<T, cudf::struct_view>;
     default: return false;
   }
 }
@@ -224,6 +225,80 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_support
   }
 };
 
+template <aggregation::Kind K>
+struct group_reduction_functor<
+  K,
+  cudf::struct_view,
+  std::enable_if_t<is_group_redution_supported<K, cudf::struct_view>()>> {
+  static std::unique_ptr<column> invoke(column_view const& values,
+                                        size_type num_groups,
+                                        cudf::device_span<cudf::size_type const> group_labels,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+  {
+    // This is be expected to be size_type.
+    using ResultType = cudf::detail::target_type_t<cudf::struct_view, K>;
+
+    auto result = make_fixed_width_column(
+      data_type{type_to_id<ResultType>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
+
+    if (values.is_empty()) { return result; }
+
+    // When finding ARGMIN, we need to consider nulls as larger than non-null elements.
+    // Thing is opposite for ARGMAX.
+    auto const null_precedence =
+      (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE;
+    auto const flattened_values = structs::detail::flatten_nested_columns(
+      table_view{{values}}, {}, std::vector<null_order>{null_precedence});
+    auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
+    auto const flattened_null_precedences =
+      (K == aggregation::ARGMIN)
+        ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
+        : rmm::device_uvector<null_order>(0, stream);
+
+    // Perform segmented reduction to find ARGMIN/ARGMAX.
+    auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
+      thrust::reduce_by_key(rmm::exec_policy(stream),
+                            group_labels.data(),
+                            group_labels.data() + group_labels.size(),
+                            inp_iter,
+                            thrust::make_discard_iterator(),
+                            out_iter,
+                            thrust::equal_to<size_type>{},
+                            binop);
+    };
+
+    auto const count_iter   = thrust::make_counting_iterator<ResultType>(0);
+    auto const result_begin = result->mutable_view().template begin<ResultType>();
+    if (values.has_nulls()) {
+      auto const binop = row_arg_minmax_fn<true>(values.size(),
+                                                 *d_flattened_values_ptr,
+                                                 flattened_null_precedences.data(),
+                                                 K == aggregation::ARGMIN);
+      do_reduction(count_iter, result_begin, binop);
+
+      // Generate bitmask for the output by segmented reduction of the input bitmask.
+      auto const d_values_ptr = column_device_view::create(values, stream);
+      auto validity           = rmm::device_uvector<bool>(num_groups, stream);
+      do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
+                   validity.begin(),
+                   thrust::logical_or<bool>{});
+
+      auto [null_mask, null_count] = cudf::detail::valid_if(
+        validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
+      result->set_null_mask(std::move(null_mask), null_count);
+    } else {
+      auto const binop = row_arg_minmax_fn<false>(values.size(),
+                                                  *d_flattened_values_ptr,
+                                                  flattened_null_precedences.data(),
+                                                  K == aggregation::ARGMIN);
+      do_reduction(count_iter, result_begin, binop);
+    }
+
+    return result;
+  }
+};
+
 }  // namespace detail
 }  // namespace groupby
 }  // namespace cudf

From 494af0143eb907a22e110e45d7750db409675aee Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:48:00 -0600
Subject: [PATCH 40/48] Revert "Add new .cu files"

This reverts commit 0c2b0b4f03ad14e47e22dc2c0788e65918bd2c33.

# Conflicts:
#	cpp/CMakeLists.txt
---
 cpp/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f2bee0ee0f7..2182f4e1848 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -244,7 +244,6 @@ add_library(
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
   src/groupby/sort/group_argmin.cu
-  src/groupby/sort/group_argminmax_struct.cu
   src/groupby/sort/group_collect.cu
   src/groupby/sort/group_correlation.cu
   src/groupby/sort/group_count.cu
@@ -263,7 +262,6 @@ add_library(
   src/groupby/sort/group_count_scan.cu
   src/groupby/sort/group_max_scan.cu
   src/groupby/sort/group_min_scan.cu
-  src/groupby/sort/group_minmax_scan_struct.cu
   src/groupby/sort/group_rank_scan.cu
   src/groupby/sort/group_replace_nulls.cu
   src/groupby/sort/group_sum_scan.cu
@@ -437,7 +435,7 @@ add_library(
   src/strings/split/split.cu
   src/strings/split/split_record.cu
   src/strings/strings_column_factories.cu
-  src/strings/strings_column_view.cpp
+  src/strings/strings_column_view.cu
   src/strings/strings_scalar_factories.cpp
   src/strings/strip.cu
   src/strings/substring.cu

From 3833bbc3d98320dc95004b006b5aed4a239b89d1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:49:09 -0600
Subject: [PATCH 41/48] Revert "Call the specialized functions for struct type
 values"

This reverts commit ad7998fcdc4ee96d8ecc43cdb07f0e77071a4319.
---
 cpp/src/groupby/sort/aggregate.cpp | 48 ++++++++++--------------------
 cpp/src/groupby/sort/scan.cpp      | 34 +++++++--------------
 2 files changed, 26 insertions(+), 56 deletions(-)

diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 7bce5a3e0fa..83c6c1bca57 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -117,22 +117,14 @@ void aggregate_result_functor::operator()<aggregation::ARGMAX>(aggregation const
 {
   if (cache.has_result(values, agg)) return;
 
-  auto result = values.type().id() == type_id::STRUCT
-                  ? detail::group_argminmax_struct(aggregation::ARGMAX,
-                                                   get_grouped_values(),
-                                                   helper.num_groups(stream),
-                                                   helper.group_labels(stream),
-                                                   helper.key_sort_order(stream),
-                                                   stream,
-                                                   mr)
-                  : detail::group_argmax(get_grouped_values(),
-                                         helper.num_groups(stream),
-                                         helper.group_labels(stream),
-                                         helper.key_sort_order(stream),
-                                         stream,
-                                         mr);
-
-  cache.add_result(values, agg, std::move(result));
+  cache.add_result(values,
+                   agg,
+                   detail::group_argmax(get_grouped_values(),
+                                        helper.num_groups(stream),
+                                        helper.group_labels(stream),
+                                        helper.key_sort_order(stream),
+                                        stream,
+                                        mr));
 };
 
 template <>
@@ -140,22 +132,14 @@ void aggregate_result_functor::operator()<aggregation::ARGMIN>(aggregation const
 {
   if (cache.has_result(values, agg)) return;
 
-  auto result = values.type().id() == type_id::STRUCT
-                  ? detail::group_argminmax_struct(aggregation::ARGMIN,
-                                                   get_grouped_values(),
-                                                   helper.num_groups(stream),
-                                                   helper.group_labels(stream),
-                                                   helper.key_sort_order(stream),
-                                                   stream,
-                                                   mr)
-                  : detail::group_argmin(get_grouped_values(),
-                                         helper.num_groups(stream),
-                                         helper.group_labels(stream),
-                                         helper.key_sort_order(stream),
-                                         stream,
-                                         mr);
-
-  cache.add_result(values, agg, std::move(result));
+  cache.add_result(values,
+                   agg,
+                   detail::group_argmin(get_grouped_values(),
+                                        helper.num_groups(stream),
+                                        helper.group_labels(stream),
+                                        helper.key_sort_order(stream),
+                                        stream,
+                                        mr));
 };
 
 template <>
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index eed6bd52faf..b22f82ce7e4 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -81,18 +81,11 @@ void scan_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
 {
   if (cache.has_result(values, agg)) return;
 
-  auto result =
-    values.type().id() == type_id::STRUCT
-      ? detail::minmax_scan_struct(aggregation::MIN,
-                                   get_grouped_values(),
-                                   helper.num_groups(stream),
-                                   helper.group_labels(stream),
-                                   stream,
-                                   mr)
-      : detail::min_scan(
-          get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
-
-  cache.add_result(values, agg, std::move(result));
+  cache.add_result(
+    values,
+    agg,
+    detail::min_scan(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 }
 
 template <>
@@ -100,18 +93,11 @@ void scan_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
 {
   if (cache.has_result(values, agg)) return;
 
-  auto result =
-    values.type().id() == type_id::STRUCT
-      ? detail::minmax_scan_struct(aggregation::MAX,
-                                   get_grouped_values(),
-                                   helper.num_groups(stream),
-                                   helper.group_labels(stream),
-                                   stream,
-                                   mr)
-      : detail::max_scan(
-          get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
-
-  cache.add_result(values, agg, std::move(result));
+  cache.add_result(
+    values,
+    agg,
+    detail::max_scan(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 }
 
 template <>

From 61c774a50f1f95fda5c7cc8d196a0aeea5559054 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:49:17 -0600
Subject: [PATCH 42/48] Revert "Add declaration for new internal APIs"

This reverts commit d4d46447583b860a9f181d850240bfea7f1bece4.
---
 cpp/src/groupby/sort/group_reductions.hpp | 20 --------------------
 cpp/src/groupby/sort/group_scan.hpp       | 18 ------------------
 2 files changed, 38 deletions(-)

diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 268c088fe87..75708c7b01c 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -173,26 +173,6 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr);
 
-/**
- * @brief Internal API to calculate group-wise indices of minimum/maximum values, specialized for
- * STRUCT type.
- *
- * @param K The aggregation kind, must be `aggregation::ARGMIN` or `aggregation::ARGMAX`
- * @param values Grouped values to get minimum value's index from
- * @param num_groups Number of groups
- * @param group_labels ID of group that the corresponding value belongs to
- * @param key_sort_order Indices indicating sort order of groupby keys
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory
- */
-std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
-                                               column_view const& values,
-                                               size_type num_groups,
-                                               cudf::device_span<size_type const> group_labels,
-                                               column_view const& key_sort_order,
-                                               rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr);
-
 /**
  * @brief Internal API to calculate number of non-null values in each group of
  *  @p values
diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp
index 2be779b6f3b..82ef0e25380 100644
--- a/cpp/src/groupby/sort/group_scan.hpp
+++ b/cpp/src/groupby/sort/group_scan.hpp
@@ -72,24 +72,6 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr);
 
-/**
- * @brief Internal API to calculate groupwise cumulative minimum/maximum value, specialized for
- * STRUCT type.
- *
- * @param K The aggregation kind, must be `aggregation::MIN` or `aggregation::MAX`
- * @param values Grouped values to get maximum from
- * @param num_groups Number of groups
- * @param group_labels ID of group that the corresponding value belongs to
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- */
-std::unique_ptr<column> minmax_scan_struct(aggregation::Kind K,
-                                           column_view const& values,
-                                           size_type num_groups,
-                                           device_span<size_type const> group_labels,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
-
 /**
  * @brief Internal API to calculate cumulative number of values in each group
  *

From a5ab52dec7057e165e3d45cf2b1c33aec7602c71 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:50:53 -0600
Subject: [PATCH 43/48] Fix function name

---
 cpp/src/groupby/sort/group_single_pass_reduction_util.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 0c43acdeb4c..4e0820af236 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -229,7 +229,7 @@ template <aggregation::Kind K>
 struct group_reduction_functor<
   K,
   cudf::struct_view,
-  std::enable_if_t<is_group_redution_supported<K, cudf::struct_view>()>> {
+  std::enable_if_t<is_group_reduction_supported<K, cudf::struct_view>()>> {
   static std::unique_ptr<column> invoke(column_view const& values,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,

From 731426ac6a93a6017bfdf73a05a22f434fcd7247 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:51:42 -0600
Subject: [PATCH 44/48] Fix CMakeList.txt

---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2182f4e1848..1a0c853ef48 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -435,7 +435,7 @@ add_library(
   src/strings/split/split.cu
   src/strings/split/split_record.cu
   src/strings/strings_column_factories.cu
-  src/strings/strings_column_view.cu
+  src/strings/strings_column_view.cpp
   src/strings/strings_scalar_factories.cpp
   src/strings/strip.cu
   src/strings/substring.cu

From 49259f3e0597540b77f28cf7472de6ba3a19ec66 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 5 Nov 2021 15:55:12 -0600
Subject: [PATCH 45/48] Remove files

---
 .../groupby/sort/group_argminmax_struct.cu    | 119 -----------------
 .../groupby/sort/group_minmax_scan_struct.cu  | 125 ------------------
 2 files changed, 244 deletions(-)
 delete mode 100644 cpp/src/groupby/sort/group_argminmax_struct.cu
 delete mode 100644 cpp/src/groupby/sort/group_minmax_scan_struct.cu

diff --git a/cpp/src/groupby/sort/group_argminmax_struct.cu b/cpp/src/groupby/sort/group_argminmax_struct.cu
deleted file mode 100644
index 96106770544..00000000000
--- a/cpp/src/groupby/sort/group_argminmax_struct.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <groupby/sort/group_util.cuh>
-
-#include <cudf/aggregation.hpp>
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
-#include <cudf/detail/structs/utilities.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/detail/valid_if.cuh>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/functional.h>
-#include <thrust/gather.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/reduce.h>
-
-namespace cudf {
-namespace groupby {
-namespace detail {
-std::unique_ptr<column> group_argminmax_struct(aggregation::Kind K,
-                                               column_view const& values,
-                                               size_type num_groups,
-                                               cudf::device_span<size_type const> group_labels,
-                                               column_view const& key_sort_order,
-                                               rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(K == aggregation::ARGMIN || K == aggregation::ARGMAX,
-               "Only groupby ARGMIN/ARGMAX are supported for STRUCT type.");
-
-  auto result = make_fixed_width_column(
-    data_type{type_to_id<size_type>()}, num_groups, mask_state::UNALLOCATED, stream, mr);
-
-  if (values.is_empty()) { return result; }
-
-  // When finding ARGMIN, we need to consider nulls as larger than non-null elements.
-  // Thing is opposite for ARGMAX.
-  auto const null_precedence  = (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE;
-  auto const flattened_values = structs::detail::flatten_nested_columns(
-    table_view{{values}}, {}, std::vector<null_order>{null_precedence});
-  auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
-  auto const flattened_null_precedences =
-    (K == aggregation::ARGMIN)
-      ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
-      : rmm::device_uvector<null_order>(0, stream);
-
-  // Perform segmented reduction to find ARGMIN/ARGMAX.
-  auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.data(),
-                          group_labels.data() + group_labels.size(),
-                          inp_iter,
-                          thrust::make_discard_iterator(),
-                          out_iter,
-                          thrust::equal_to<size_type>{},
-                          binop);
-  };
-
-  auto const count_iter   = thrust::make_counting_iterator<size_type>(0);
-  auto const result_begin = result->mutable_view().template begin<size_type>();
-  if (values.has_nulls()) {
-    auto const binop = row_arg_minmax_fn<true>(values.size(),
-                                               *d_flattened_values_ptr,
-                                               flattened_null_precedences.data(),
-                                               K == aggregation::ARGMIN);
-    do_reduction(count_iter, result_begin, binop);
-
-    // Generate bitmask for the output by segmented reduction of the input bitmask.
-    auto const d_values_ptr = column_device_view::create(values, stream);
-    auto validity           = rmm::device_uvector<bool>(num_groups, stream);
-    do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
-                 validity.begin(),
-                 thrust::logical_or<bool>{});
-
-    auto [null_mask, null_count] = cudf::detail::valid_if(
-      validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
-    result->set_null_mask(std::move(null_mask), null_count);
-  } else {
-    auto const binop = row_arg_minmax_fn<false>(values.size(),
-                                                *d_flattened_values_ptr,
-                                                flattened_null_precedences.data(),
-                                                K == aggregation::ARGMIN);
-    do_reduction(count_iter, result_begin, binop);
-  }
-
-  // result now stores the indices of minimum elements in the sorted values.
-  // We need the indices of minimum elements in the original unsorted values.
-  thrust::gather(rmm::exec_policy(stream),
-                 result_begin,
-                 result_begin + num_groups,
-                 key_sort_order.template begin<size_type>(),
-                 result_begin);
-
-  return result;
-}
-
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_minmax_scan_struct.cu b/cpp/src/groupby/sort/group_minmax_scan_struct.cu
deleted file mode 100644
index 2d3c26d19f7..00000000000
--- a/cpp/src/groupby/sort/group_minmax_scan_struct.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <groupby/sort/group_util.cuh>
-
-#include <cudf/aggregation.hpp>
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/gather.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/structs/utilities.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/table/table_device_view.cuh>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/functional.h>
-#include <thrust/scan.h>
-
-namespace cudf {
-namespace groupby {
-namespace detail {
-std::unique_ptr<column> minmax_scan_struct(aggregation::Kind K,
-                                           column_view const& values,
-                                           size_type num_groups,
-                                           device_span<size_type const> group_labels,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(K == aggregation::MIN || K == aggregation::MAX,
-               "Only groupby MIN/MAX scan are supported for STRUCT type.");
-
-  if (values.is_empty()) { return cudf::empty_like(values); }
-
-  // When finding MIN, we need to consider nulls as larger than non-null elements.
-  // Thing is opposite when finding MAX.
-  auto const null_precedence  = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE;
-  auto const flattened_values = structs::detail::flatten_nested_columns(
-    table_view{{values}}, {}, std::vector<null_order>{null_precedence});
-  auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream);
-  auto const flattened_null_precedences =
-    (K == aggregation::MIN)
-      ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream)
-      : rmm::device_uvector<null_order>(0, stream);
-
-  // Create a gather map contaning indices of the prefix min/max elements.
-  auto gather_map      = rmm::device_uvector<size_type>(values.size(), stream);
-  auto const map_begin = gather_map.begin();
-
-  // Perform segmented scan.
-  auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) {
-    thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                  group_labels.begin(),
-                                  group_labels.end(),
-                                  inp_iter,
-                                  out_iter,
-                                  thrust::equal_to<size_type>{},
-                                  binop);
-  };
-
-  // Find the indices of the prefix min/max elements within each group.
-  auto const count_iter = thrust::make_counting_iterator<size_type>(0);
-  if (values.has_nulls()) {
-    auto const binop = row_arg_minmax_fn<true>(values.size(),
-                                               *d_flattened_values_ptr,
-                                               flattened_null_precedences.data(),
-                                               K == aggregation::MIN);
-    do_scan(count_iter, map_begin, binop);
-  } else {
-    auto const binop = row_arg_minmax_fn<false>(values.size(),
-                                                *d_flattened_values_ptr,
-                                                flattened_null_precedences.data(),
-                                                K == aggregation::MIN);
-    do_scan(count_iter, map_begin, binop);
-  }
-
-  auto gather_map_view =
-    column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
-
-  // Gather the children elements of the prefix min/max struct elements first.
-  auto scanned_children =
-    cudf::detail::gather(
-      table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
-      gather_map_view,
-      cudf::out_of_bounds_policy::DONT_CHECK,
-      cudf::detail::negative_index_policy::NOT_ALLOWED,
-      stream,
-      mr)
-      ->release();
-
-  // After gathering the children elements, we need to push down nulls from the root structs
-  // column to them.
-  if (values.has_nulls()) {
-    for (std::unique_ptr<column>& child : scanned_children) {
-      structs::detail::superimpose_parent_nulls(
-        values.null_mask(), values.null_count(), *child, stream, mr);
-    }
-  }
-
-  return make_structs_column(values.size(),
-                             std::move(scanned_children),
-                             values.null_count(),
-                             cudf::detail::copy_bitmask(values, stream, mr));
-}
-
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf

From cbf386fbe69b430906fffd3716769ed1d7dba3c7 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 8 Nov 2021 08:01:11 -0700
Subject: [PATCH 46/48] Add groupby struct benchmark

---
 cpp/benchmarks/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 193ac1f006c..fa1e61e26fd 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -157,7 +157,7 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp)
 # * groupby benchmark -----------------------------------------------------------------------------
 ConfigureBench(
   GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu
-  groupby/group_shift_benchmark.cu
+  groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu
 )
 
 # ##################################################################################################

From 8b3f72ed584c768d41ad193e351e8ed52bac4d09 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 8 Nov 2021 09:38:38 -0700
Subject: [PATCH 47/48] Implement benchmark

---
 .../groupby/group_struct_benchmark.cu         | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 cpp/benchmarks/groupby/group_struct_benchmark.cu

diff --git a/cpp/benchmarks/groupby/group_struct_benchmark.cu b/cpp/benchmarks/groupby/group_struct_benchmark.cu
new file mode 100644
index 00000000000..8192b23fd78
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_struct_benchmark.cu
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table.hpp>
+
+#include <thrust/iterator/transform_iterator.h>
+
+static constexpr cudf::size_type num_struct_members = 8;
+static constexpr cudf::size_type max_int            = 100;
+static constexpr cudf::size_type max_str_length     = 32;
+
+static auto create_data_table(cudf::size_type n_rows)
+{
+  data_profile table_profile;
+  table_profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, max_int);
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+
+  // The first two struct members are int32 and string.
+  // The first column is also used as keys in groupby.
+  auto col_ids = std::vector<cudf::type_id>{cudf::type_id::INT32, cudf::type_id::STRING};
+
+  // The subsequent struct members are int32 and string again.
+  for (cudf::size_type i = 3; i <= num_struct_members; ++i) {
+    if (i % 2) {
+      col_ids.push_back(cudf::type_id::INT32);
+    } else {
+      col_ids.push_back(cudf::type_id::STRING);
+    }
+  }
+
+  return create_random_table(col_ids, num_struct_members, row_count{n_rows}, table_profile);
+}
+
+// Max aggregation technically has the same performance as min.
+void BM_min_aggregation(benchmark::State& state)
+{
+  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
+  auto data_cols    = create_data_table(n_rows)->release();
+
+  // Extract keys (integers column) and values (structs column).
+  auto const keys_view = data_cols.front()->view();
+  auto const values =
+    cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer());
+
+  auto gb_obj   = cudf::groupby::groupby(cudf::table_view({keys_view}));
+  auto requests = std::vector<cudf::groupby::aggregation_request>();
+  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.front().values = values->view();
+  requests.front().aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+
+  for (auto _ : state) {
+    [[maybe_unused]] auto const timer  = cuda_event_timer(state, true);
+    [[maybe_unused]] auto const result = gb_obj.aggregate(requests);
+  }
+}
+
+// Max aggregation technically has the same performance as min.
+void BM_min_scan(benchmark::State& state)
+{
+  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
+  auto data_cols    = create_data_table(n_rows)->release();
+
+  // Extract keys (integers column) and values (structs column).
+  auto const keys_view = data_cols.front()->view();
+  auto const values =
+    cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer());
+
+  auto gb_obj   = cudf::groupby::groupby(cudf::table_view({keys_view}));
+  auto requests = std::vector<cudf::groupby::scan_request>();
+  requests.emplace_back(cudf::groupby::scan_request());
+  requests.front().values = values->view();
+  requests.front().aggregations.push_back(
+    cudf::make_min_aggregation<cudf::groupby_scan_aggregation>());
+
+  for (auto _ : state) {
+    [[maybe_unused]] auto const timer  = cuda_event_timer(state, true);
+    [[maybe_unused]] auto const result = gb_obj.scan(requests);
+  }
+}
+
+class Groupby : public cudf::benchmark {
+};
+
+#define MIN_RANGE 10'000
+#define MAX_RANGE 10'000'000
+
+#define REGISTER_BENCHMARK(name, func)                                           \
+  BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) { func(state); } \
+  BENCHMARK_REGISTER_F(Groupby, name)                                            \
+    ->UseManualTime()                                                            \
+    ->Unit(benchmark::kMillisecond)                                              \
+    ->RangeMultiplier(4)                                                         \
+    ->Ranges({{MIN_RANGE, MAX_RANGE}});
+
+REGISTER_BENCHMARK(Aggregation, BM_min_aggregation)
+REGISTER_BENCHMARK(Scan, BM_min_scan)

From cdfc6026ae93db2ea677c3b24a54953c31a5df62 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 8 Nov 2021 10:34:53 -0700
Subject: [PATCH 48/48] Unify 2 functions into a template function

---
 .../groupby/group_struct_benchmark.cu         | 65 ++++++++-----------
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_struct_benchmark.cu b/cpp/benchmarks/groupby/group_struct_benchmark.cu
index 8192b23fd78..702983a63bf 100644
--- a/cpp/benchmarks/groupby/group_struct_benchmark.cu
+++ b/cpp/benchmarks/groupby/group_struct_benchmark.cu
@@ -55,50 +55,34 @@ static auto create_data_table(cudf::size_type n_rows)
   return create_random_table(col_ids, num_struct_members, row_count{n_rows}, table_profile);
 }
 
-// Max aggregation technically has the same performance as min.
-void BM_min_aggregation(benchmark::State& state)
+// Max aggregation/scan technically has the same performance as min.
+template <typename OpType>
+void BM_groupby_min_struct(benchmark::State& state)
 {
   auto const n_rows = static_cast<cudf::size_type>(state.range(0));
   auto data_cols    = create_data_table(n_rows)->release();
 
-  // Extract keys (integers column) and values (structs column).
   auto const keys_view = data_cols.front()->view();
   auto const values =
     cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer());
 
-  auto gb_obj   = cudf::groupby::groupby(cudf::table_view({keys_view}));
-  auto requests = std::vector<cudf::groupby::aggregation_request>();
-  requests.emplace_back(cudf::groupby::aggregation_request());
-  requests.front().values = values->view();
-  requests.front().aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
-
-  for (auto _ : state) {
-    [[maybe_unused]] auto const timer  = cuda_event_timer(state, true);
-    [[maybe_unused]] auto const result = gb_obj.aggregate(requests);
-  }
-}
-
-// Max aggregation technically has the same performance as min.
-void BM_min_scan(benchmark::State& state)
-{
-  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
-  auto data_cols    = create_data_table(n_rows)->release();
-
-  // Extract keys (integers column) and values (structs column).
-  auto const keys_view = data_cols.front()->view();
-  auto const values =
-    cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer());
+  using RequestType = std::conditional_t<std::is_same_v<OpType, cudf::groupby_aggregation>,
+                                         cudf::groupby::aggregation_request,
+                                         cudf::groupby::scan_request>;
 
   auto gb_obj   = cudf::groupby::groupby(cudf::table_view({keys_view}));
-  auto requests = std::vector<cudf::groupby::scan_request>();
-  requests.emplace_back(cudf::groupby::scan_request());
+  auto requests = std::vector<RequestType>();
+  requests.emplace_back(RequestType());
   requests.front().values = values->view();
-  requests.front().aggregations.push_back(
-    cudf::make_min_aggregation<cudf::groupby_scan_aggregation>());
+  requests.front().aggregations.push_back(cudf::make_min_aggregation<OpType>());
 
   for (auto _ : state) {
-    [[maybe_unused]] auto const timer  = cuda_event_timer(state, true);
-    [[maybe_unused]] auto const result = gb_obj.scan(requests);
+    [[maybe_unused]] auto const timer = cuda_event_timer(state, true);
+    if constexpr (std::is_same_v<OpType, cudf::groupby_aggregation>) {
+      [[maybe_unused]] auto const result = gb_obj.aggregate(requests);
+    } else {
+      [[maybe_unused]] auto const result = gb_obj.scan(requests);
+    }
   }
 }
 
@@ -108,13 +92,16 @@ class Groupby : public cudf::benchmark {
 #define MIN_RANGE 10'000
 #define MAX_RANGE 10'000'000
 
-#define REGISTER_BENCHMARK(name, func)                                           \
-  BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) { func(state); } \
-  BENCHMARK_REGISTER_F(Groupby, name)                                            \
-    ->UseManualTime()                                                            \
-    ->Unit(benchmark::kMillisecond)                                              \
-    ->RangeMultiplier(4)                                                         \
+#define REGISTER_BENCHMARK(name, op_type)                       \
+  BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \
+  {                                                             \
+    BM_groupby_min_struct<op_type>(state);                      \
+  }                                                             \
+  BENCHMARK_REGISTER_F(Groupby, name)                           \
+    ->UseManualTime()                                           \
+    ->Unit(benchmark::kMillisecond)                             \
+    ->RangeMultiplier(4)                                        \
     ->Ranges({{MIN_RANGE, MAX_RANGE}});
 
-REGISTER_BENCHMARK(Aggregation, BM_min_aggregation)
-REGISTER_BENCHMARK(Scan, BM_min_scan)
+REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation)
+REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation)