From fc5661fd1336d9809a8d396cd651fc3561bb5f3e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 16 Dec 2021 12:44:38 -0600 Subject: [PATCH] Fix fallback to sort aggregation for grouping only hash aggregate (#9891) (#9898) The following fixes what looks like an unintended fallback to sort aggregate introduced in https://github.com/rapidsai/cudf/pull/9545 for a grouping only (no aggregation request) case. In the PR, the `std::all_of` function is used to determine whether the aggregation requests would be for struct types. That said, when there are no aggregation requests the `std::all_of` function will return true, causing a fallback to the sort aggregation (relevant code: https://github.com/rapidsai/cudf/pull/9545/files#diff-e409f72ddc11ad10fa0099e21b409b92f12bfac8ba1817266696c34a620aa081R645-R650). I added a benchmark `group_no_requests_benchmark.cu` by mostly copying `group_sum_benchmark.cu` but I changed one critical part. I am re-creating the `groupby` object for each `state`: ``` for (auto _ : state) { cuda_event_timer timer(state, true); cudf::groupby::groupby gb_obj(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); } ``` This shows what would happen in the scenario where the `groupby` instance is created each time an aggregate is issued, which would re-create the `helper` each time for the sorted case. If the `groupby` object is not recreated each time, the difference in performance between the before/after cases is negligible. We never recycle a `groupby` instance when using the groupby API from Spark. Posting this as draft for feedback as I am not sure if I handled the benchmark part correctly. This was executed on a T4 GPU. 
Before the patch: ``` Groupby/BasicNoRequest/10000/manual_time 0.158 ms 0.184 ms 4420 Groupby/BasicNoRequest/1000000/manual_time 1.72 ms 1.74 ms 408 Groupby/BasicNoRequest/10000000/manual_time 18.9 ms 18.9 ms 37 Groupby/BasicNoRequest/100000000/manual_time 198 ms 198 ms 3 ``` Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9891 --- cpp/benchmarks/CMakeLists.txt | 8 +- .../groupby/group_no_requests_benchmark.cu | 115 ++++++++++++++++++ cpp/src/groupby/hash/groupby.cu | 27 ++-- 3 files changed, 131 insertions(+), 19 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_no_requests_benchmark.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 72b247ae748..34a5aebff6f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -156,8 +156,12 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu - groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu + GROUPBY_BENCH + groupby/group_sum_benchmark.cu + groupby/group_nth_benchmark.cu + groupby/group_shift_benchmark.cu + groupby/group_struct_benchmark.cu + groupby/group_no_requests_benchmark.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu new file mode 100644 index 00000000000..7dbe1888cee --- /dev/null +++ 
b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +class Groupby : public cudf::benchmark { +}; + +// TODO: put it in a struct so `uniform` can be remade with different min, max +template +T random_int(T min, T max) +{ + static unsigned seed = 13377331; + static std::mt19937 engine{seed}; + static std::uniform_int_distribution uniform{min, max}; + + return uniform(engine); +} + +void BM_basic_no_requests(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size); + + std::vector requests; + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + auto result = gb_obj.aggregate(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state) +{ + BM_basic_no_requests(state); +} + +BENCHMARK_REGISTER_F(Groupby, BasicNoRequest) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(10000) + ->Arg(1000000) + 
->Arg(10000000) + ->Arg(100000000); + +void BM_pre_sorted_no_requests(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + auto valid_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100) < 90; }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size, valid_it); + + auto keys_table = cudf::table_view({keys}); + auto sort_order = cudf::sorted_order(keys_table); + auto sorted_keys = cudf::gather(keys_table, *sort_order); + // No need to sort values using sort_order because they were generated randomly + + std::vector requests; + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES); + auto result = gb_obj.aggregate(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state) +{ + BM_pre_sorted_no_requests(state); +} + +BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e35fa36a289..950cc9727fd 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -635,23 +635,16 @@ std::unique_ptr groupby_null_templated(table_view const& keys, */ bool can_use_hash_groupby(table_view const& keys, host_span requests) { - auto const all_hash_aggregations = - std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { - return cudf::has_atomic_support(r.values.type()) and - std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { - return 
is_hash_aggregation(a->kind); - }); - }); - - // Currently, structs are not supported in any of hash-based aggregations. - // Therefore, if any request contains structs then we must fallback to sort-based aggregations. - // TODO: Support structs in hash-based aggregations. - auto const has_struct = - std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { - return r.values.type().id() == type_id::STRUCT; - }); - - return all_hash_aggregations && !has_struct; + return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { + // Currently, structs are not supported in any of hash-based aggregations. + // Therefore, if any request contains structs then we must fallback to sort-based aggregations. + // TODO: Support structs in hash-based aggregations. + return not(r.values.type().id() == type_id::STRUCT) and + cudf::has_atomic_support(r.values.type()) and + std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { + return is_hash_aggregation(a->kind); + }); + }); } // Hash-based groupby