Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize groupby::scan #9754

Merged
merged 8 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ ConfigureBench(
groupby/group_shift_benchmark.cu
groupby/group_struct_benchmark.cu
groupby/group_no_requests_benchmark.cu
groupby/group_scan_benchmark.cu
)

# ##################################################################################################
Expand Down
119 changes: 119 additions & 0 deletions cpp/benchmarks/groupby/group_scan_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/copying.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <random>

// Benchmark fixture deriving from cudf::benchmark; groups the groupby::scan
// benchmarks under one googlebench fixture name.
class Groupby : public cudf::benchmark {
};

/**
 * @brief Returns a uniformly distributed random integer in `[min, max]`.
 *
 * The engine is seeded with a fixed value and kept static so benchmark input
 * data is reproducible across runs. The distribution, however, is constructed
 * per call: making it `static` (as before) would freeze the `[min, max]` range
 * of the first call and silently ignore different ranges on later calls.
 *
 * @tparam T Integer type of the result
 * @param min Inclusive lower bound
 * @param max Inclusive upper bound
 * @return A pseudo-random value in `[min, max]`
 */
template <typename T>
T random_int(T min, T max)
{
  static constexpr unsigned seed = 13377331;
  static std::mt19937 engine{seed};
  std::uniform_int_distribution<T> uniform{min, max};

  return uniform(engine);
}

/**
 * @brief Benchmarks groupby::scan with a SUM aggregation on unsorted keys.
 *
 * Builds a three-column key table (three copies of the same int64 column) and
 * one value column, each with `state.range(0)` rows drawn uniformly from
 * [0, 100], then times `groupby::scan` with CUDA event timing.
 */
void BM_basic_sum_scan(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  auto const column_size = static_cast<cudf::size_type>(state.range(0));

  // Row index is irrelevant; every row gets an independent random value.
  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [](cudf::size_type) { return random_int(0, 100); });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size);

  cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));

  std::vector<cudf::groupby::scan_request> requests;
  requests.emplace_back(cudf::groupby::scan_request());
  requests[0].values = vals;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

  for (auto _ : state) {
    // Manual timing with CUDA events (flush_l2_cache = true).
    cuda_event_timer timer(state, true);

    auto result = gb_obj.scan(requests);
  }
}

// Bind the fixture entry point for the unsorted-keys sum scan benchmark.
BENCHMARK_DEFINE_F(Groupby, BasicSumScan)(::benchmark::State& state) { BM_basic_sum_scan(state); }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using nvbench is preferred over googlebench for any new benchmarks.
We want to move our all benchmarks to nvbench.
If it's not much effort, please use nvbench here.

Copy link
Member Author

@PointKernel PointKernel Jan 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvbench requires target kernels to take an explicit stream (see here) but CUDA stream is not exposed to the public groupby::scan. What I did in JOIN_NVBENCH is to expose stream arguments to hash join APIs. Do we want to do the same for groupby::scan? @jrhemstad Any comments?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather do this: NVIDIA/nvbench#13

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I will leave the new scan benchmark as gbench for this PR and look into NVIDIA/nvbench#13.


// Register the unsorted-keys benchmark at 1M, 10M, and 100M rows; timing is
// manual (cuda_event_timer) and reported in milliseconds.
BENCHMARK_REGISTER_F(Groupby, BasicSumScan)
->UseManualTime()
->Unit(benchmark::kMillisecond)
->Arg(1000000)
->Arg(10000000)
->Arg(100000000);

/**
 * @brief Benchmarks groupby::scan with a SUM aggregation on presorted keys.
 *
 * Keys are sorted up front and the groupby object is constructed with
 * `cudf::sorted::YES`, exercising the presorted fast path that skips the
 * internal sort. Values carry roughly 10% nulls and nulls are excluded.
 */
void BM_pre_sorted_sum_scan(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  auto const column_size = static_cast<cudf::size_type>(state.range(0));

  // Row index is irrelevant; every row gets an independent random value.
  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [](cudf::size_type) { return random_int(0, 100); });
  // ~90% of rows are valid.
  auto valid_it = cudf::detail::make_counting_transform_iterator(
    0, [](cudf::size_type) { return random_int(0, 100) < 90; });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size, valid_it);

  auto keys_table  = cudf::table_view({keys});
  auto sort_order  = cudf::sorted_order(keys_table);
  auto sorted_keys = cudf::gather(keys_table, *sort_order);
  // No need to gather the values with sort_order: they were generated randomly,
  // so any permutation of them is an equally valid random input.

  cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);

  std::vector<cudf::groupby::scan_request> requests;
  requests.emplace_back(cudf::groupby::scan_request());
  requests[0].values = vals;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

  for (auto _ : state) {
    // Manual timing with CUDA events (flush_l2_cache = true).
    cuda_event_timer timer(state, true);

    auto result = gb_obj.scan(requests);
  }
}

// Bind the fixture entry point for the presorted-keys sum scan benchmark.
BENCHMARK_DEFINE_F(Groupby, PreSortedSumScan)(::benchmark::State& state)
{
BM_pre_sorted_sum_scan(state);
}

// Register the presorted-keys benchmark at the same sizes as the basic one so
// the two paths are directly comparable; manual timing, milliseconds.
BENCHMARK_REGISTER_F(Groupby, PreSortedSumScan)
->UseManualTime()
->Unit(benchmark::kMillisecond)
->Arg(1000000)
->Arg(10000000)
->Arg(100000000);
19 changes: 17 additions & 2 deletions cpp/src/groupby/sort/functors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,23 @@ struct store_result_functor {
sort::sort_groupby_helper& helper,
cudf::detail::result_cache& cache,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
: helper(helper), cache(cache), values(values), stream(stream), mr(mr)
rmm::mr::device_memory_resource* mr,
sorted keys_are_sorted = sorted::NO)
vuule marked this conversation as resolved.
Show resolved Hide resolved
: helper(helper),
cache(cache),
values(values),
stream(stream),
mr(mr),
keys_are_sorted(keys_are_sorted)
{
}

protected:
/**
* @brief Check if the groupby keys are presorted
*
* @return true when this functor was constructed with `sorted::YES`, in which
* case requests for grouped/sorted values can return `values` unchanged.
*/
bool is_presorted() const { return keys_are_sorted == sorted::YES; }

/**
* @brief Get the grouped values
*
Expand All @@ -54,6 +65,8 @@ struct store_result_functor {
*/
column_view get_grouped_values()
{
if (is_presorted()) { return values; }

// TODO (dm): After implementing single pass multi-agg, explore making a
// cache of all grouped value columns rather than one at a time
if (grouped_values)
Expand All @@ -74,6 +87,7 @@ struct store_result_functor {
*/
column_view get_sorted_values()
{
// Presorted keys: the values already line up with the sorted key order, so
// skip the (memoized) group-sort entirely.
if (is_presorted()) { return values; }
// Memoize: sort the values once via the helper, then reuse the cached column.
return sorted_values ? sorted_values->view()
: (sorted_values = helper.sorted_values(values, stream))->view();
};
Expand All @@ -86,6 +100,7 @@ struct store_result_functor {
rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels
rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results

sorted keys_are_sorted; ///< Whether the keys are sorted
std::unique_ptr<column> sorted_values; ///< Memoised grouped and sorted values
std::unique_ptr<column> grouped_values; ///< Memoised grouped values
};
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/groupby/sort/scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ struct scan_result_functor final : store_result_functor {
private:
column_view get_grouped_values()
{
// early exit if presorted
if (is_presorted()) { return values; }

// TODO (dm): After implementing single pass multi-agg, explore making a
// cache of all grouped value columns rather than one at a time
if (grouped_values)
Expand Down Expand Up @@ -155,7 +158,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
cudf::detail::result_cache cache(requests.size());

for (auto const& request : requests) {
auto store_functor = detail::scan_result_functor(request.values, helper(), cache, stream, mr);
auto store_functor =
detail::scan_result_functor(request.values, helper(), cache, stream, mr, _keys_are_sorted);
for (auto const& aggregation : request.aggregations) {
// TODO (dm): single pass compute all supported reductions
cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation);
Expand Down
18 changes: 18 additions & 0 deletions cpp/tests/groupby/max_scan_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,24 @@ TYPED_TEST(groupby_max_scan_test, basic)
test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
}

// Verifies the max scan on keys declared presorted (sorted::YES): each output
// row is the running maximum of its group's values so far.
TYPED_TEST(groupby_max_scan_test, pre_sorted)
{
using value_wrapper = typename TestFixture::value_wrapper;
using result_wrapper = typename TestFixture::result_wrapper;

// clang-format off
key_wrapper keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
value_wrapper vals({5, 8, 1, 6, 9, 0, 4, 7, 2, 3});

key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7});
// clang-format on

auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
test_single_scan(
keys, vals, expect_keys, expect_vals, std::move(agg), null_policy::EXCLUDE, sorted::YES);
}

TYPED_TEST(groupby_max_scan_test, empty_cols)
{
using value_wrapper = typename TestFixture::value_wrapper;
Expand Down
18 changes: 18 additions & 0 deletions cpp/tests/groupby/min_scan_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,24 @@ TYPED_TEST(groupby_min_scan_test, basic)
test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
}

// Verifies the min scan on keys declared presorted (sorted::YES): each output
// row is the running minimum of its group's values so far.
TYPED_TEST(groupby_min_scan_test, pre_sorted)
{
using value_wrapper = typename TestFixture::value_wrapper;
using result_wrapper = typename TestFixture::result_wrapper;

// clang-format off
key_wrapper keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
value_wrapper vals({5, 8, 1, 6, 9, 0, 4, 7, 2, 3});

key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2});
// clang-format on

auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
test_single_scan(
keys, vals, expect_keys, expect_vals, std::move(agg), null_policy::EXCLUDE, sorted::YES);
}

TYPED_TEST(groupby_min_scan_test, empty_cols)
{
using value_wrapper = typename TestFixture::value_wrapper;
Expand Down
18 changes: 18 additions & 0 deletions cpp/tests/groupby/sum_scan_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,24 @@ TYPED_TEST(groupby_sum_scan_test, basic)
test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
}

// Verifies the sum scan on keys declared presorted (sorted::YES): each output
// row is the prefix sum of its group's values so far.
TYPED_TEST(groupby_sum_scan_test, pre_sorted)
{
using value_wrapper = typename TestFixture::value_wrapper;
using result_wrapper = typename TestFixture::result_wrapper;

// clang-format off
key_wrapper keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
value_wrapper vals{0, 3, 6, 1, 4, 5, 9, 2, 7, 8};

key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17};
// clang-format on

auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
test_single_scan(
keys, vals, expect_keys, expect_vals, std::move(agg), null_policy::EXCLUDE, sorted::YES);
}

TYPED_TEST(groupby_sum_scan_test, empty_cols)
{
using value_wrapper = typename TestFixture::value_wrapper;
Expand Down