rapidsai · rapids-bot · Jan 10, 2022 · Nov 22, 2021 · Jan 6, 2022 · Jan 6, 2022
@@ -177,6 +177,7 @@ ConfigureBench(
   groupby/group_shift_benchmark.cu
   groupby/group_struct_benchmark.cu
   groupby/group_no_requests_benchmark.cu
+  groupby/group_scan_benchmark.cu
 )
 
 # ##################################################################################################

@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <random>
+
+template <typename T>
+T random_int(T min, T max)
+{
+  static unsigned seed = 13377331;
+  static std::mt19937 engine{seed};
+  static std::uniform_int_distribution<T> uniform{min, max};
+
+  return uniform(engine);
+}
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,32 +14,23 @@
  * limitations under the License.
  */
 
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/groupby/group_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
+
 #include <cudf_test/column_wrapper.hpp>
-#include <fixture/benchmark_fixture.hpp>
-#include <synchronization/synchronization.hpp>
 
 #include <memory>
-#include <random>
 
 class Groupby : public cudf::benchmark {
 };
 
-// TODO: put it in a struct so `uniform` can be remade with different min, max
-template <typename T>
-T random_int(T min, T max)
-{
-  static unsigned seed = 13377331;
-  static std::mt19937 engine{seed};
-  static std::uniform_int_distribution<T> uniform{min, max};
-
-  return uniform(engine);
-}
-
 void BM_basic_no_requests(benchmark::State& state)
 {
   using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,32 +14,23 @@
  * limitations under the License.
  */
 
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/groupby/group_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
+
 #include <cudf_test/column_wrapper.hpp>
-#include <fixture/benchmark_fixture.hpp>
-#include <synchronization/synchronization.hpp>
 
 #include <memory>
-#include <random>
 
 class Groupby : public cudf::benchmark {
 };
 
-// TODO: put it in a struct so `uniform` can be remade with different min, max
-template <typename T>
-T random_int(T min, T max)
-{
-  static unsigned seed = 13377331;
-  static std::mt19937 engine{seed};
-  static std::uniform_int_distribution<T> uniform{min, max};
-
-  return uniform(engine);
-}
-
 void BM_pre_sorted_nth(benchmark::State& state)
 {
   using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/groupby/group_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+class Groupby : public cudf::benchmark {
+};
+
+void BM_basic_sum_scan(benchmark::State& state)
+{
+  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;
+
+  const cudf::size_type column_size{(cudf::size_type)state.range(0)};
+
+  auto data_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100); });
+
+  wrapper keys(data_it, data_it + column_size);
+  wrapper vals(data_it, data_it + column_size);
+
+  cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));
+
+  std::vector<cudf::groupby::scan_request> requests;
+  requests.emplace_back(cudf::groupby::scan_request());
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+
+    auto result = gb_obj.scan(requests);
+  }
+}
+
+BENCHMARK_DEFINE_F(Groupby, BasicSumScan)(::benchmark::State& state) { BM_basic_sum_scan(state); }
+
+BENCHMARK_REGISTER_F(Groupby, BasicSumScan)
+  ->UseManualTime()
+  ->Unit(benchmark::kMillisecond)
+  ->Arg(1000000)
+  ->Arg(10000000)
+  ->Arg(100000000);
+
+void BM_pre_sorted_sum_scan(benchmark::State& state)
+{
+  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;
+
+  const cudf::size_type column_size{(cudf::size_type)state.range(0)};
+
+  auto data_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100); });
+  auto valid_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100) < 90; });
+
+  wrapper keys(data_it, data_it + column_size);
+  wrapper vals(data_it, data_it + column_size, valid_it);
+
+  auto keys_table  = cudf::table_view({keys});
+  auto sort_order  = cudf::sorted_order(keys_table);
+  auto sorted_keys = cudf::gather(keys_table, *sort_order);
+  // No need to sort values using sort_order because they were generated randomly
+
+  cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);
+
+  std::vector<cudf::groupby::scan_request> requests;
+  requests.emplace_back(cudf::groupby::scan_request());
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+
+    auto result = gb_obj.scan(requests);
+  }
+}
+
+BENCHMARK_DEFINE_F(Groupby, PreSortedSumScan)(::benchmark::State& state)
+{
+  BM_pre_sorted_sum_scan(state);
+}
+
+BENCHMARK_REGISTER_F(Groupby, PreSortedSumScan)
+  ->UseManualTime()
+  ->Unit(benchmark::kMillisecond)
+  ->Arg(1000000)
+  ->Arg(10000000)
+  ->Arg(100000000);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/groupby/group_benchmark_common.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/detail/iterator.cuh>
@@ -24,24 +26,9 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <benchmark/benchmark.h>
-
-#include <random>
-
 class Groupby : public cudf::benchmark {
 };
 
-// TODO: put it in a struct so `uniform` can be remade with different min, max
-template <typename T>
-T random_int(T min, T max)
-{
-  static unsigned seed = 13377331;
-  static std::mt19937 engine{seed};
-  static std::uniform_int_distribution<T> uniform{min, max};
-
-  return uniform(engine);
-}
-
 void BM_group_shift(benchmark::State& state)
 {
   using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,32 +14,23 @@
  * limitations under the License.
  */
 
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/groupby/group_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
+
 #include <cudf_test/column_wrapper.hpp>
-#include <fixture/benchmark_fixture.hpp>
-#include <synchronization/synchronization.hpp>
 
 #include <memory>
-#include <random>
 
 class Groupby : public cudf::benchmark {
 };
 
-// TODO: put it in a struct so `uniform` can be remade with different min, max
-template <typename T>
-T random_int(T min, T max)
-{
-  static unsigned seed = 13377331;
-  static std::mt19937 engine{seed};
-  static std::uniform_int_distribution<T> uniform{min, max};
-
-  return uniform(engine);
-}
-
 void BM_basic_sum(benchmark::State& state)
 {
   using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

@@ -40,12 +40,23 @@ struct store_result_functor {
                        sort::sort_groupby_helper& helper,
                        cudf::detail::result_cache& cache,
                        rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr)
-    : helper(helper), cache(cache), values(values), stream(stream), mr(mr)
+                       rmm::mr::device_memory_resource* mr,
+                       sorted keys_are_sorted = sorted::NO)
+    : helper(helper),
+      cache(cache),
+      values(values),
+      stream(stream),
+      mr(mr),
+      keys_are_sorted(keys_are_sorted)
   {
   }
 
  protected:
+  /**
+   * @brief Check if the groupby keys are presorted
+   */
+  bool is_presorted() const { return keys_are_sorted == sorted::YES; }
+
   /**
    * @brief Get the grouped values
    *
@@ -54,6 +65,8 @@ struct store_result_functor {
    */
   column_view get_grouped_values()
   {
+    if (is_presorted()) { return values; }
+
     // TODO (dm): After implementing single pass multi-agg, explore making a
     //            cache of all grouped value columns rather than one at a time
     if (grouped_values)
@@ -74,6 +87,7 @@ struct store_result_functor {
    */
   column_view get_sorted_values()
   {
+    if (is_presorted()) { return values; }
     return sorted_values ? sorted_values->view()
                          : (sorted_values = helper.sorted_values(values, stream))->view();
   };
@@ -86,6 +100,7 @@ struct store_result_functor {
   rmm::cuda_stream_view stream;         ///< CUDA stream on which to execute kernels
   rmm::mr::device_memory_resource* mr;  ///< Memory resource to allocate space for results
 
+  sorted keys_are_sorted;                  ///< Whether the keys are sorted
   std::unique_ptr<column> sorted_values;   ///< Memoised grouped and sorted values
   std::unique_ptr<column> grouped_values;  ///< Memoised grouped values
 };

@@ -55,6 +55,9 @@ struct scan_result_functor final : store_result_functor {
  private:
   column_view get_grouped_values()
   {
+    // early exit if presorted
+    if (is_presorted()) { return values; }
+
     // TODO (dm): After implementing single pass multi-agg, explore making a
     //            cache of all grouped value columns rather than one at a time
     if (grouped_values)
@@ -155,7 +158,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
   cudf::detail::result_cache cache(requests.size());
 
   for (auto const& request : requests) {
-    auto store_functor = detail::scan_result_functor(request.values, helper(), cache, stream, mr);
+    auto store_functor =
+      detail::scan_result_functor(request.values, helper(), cache, stream, mr, _keys_are_sorted);
     for (auto const& aggregation : request.aggregations) {
       // TODO (dm): single pass compute all supported reductions
       cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation);