From 8fbc469032bc5df8cbc63712c779012a45beb44f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 13 Dec 2021 15:08:39 -0600 Subject: [PATCH] Fix fallback to sort aggregation for grouping only hash aggregate (#9891) The following fixes what looks like an unintended fallback to sort aggregate introduced in https://github.com/rapidsai/cudf/pull/9545 for a grouping only (no aggregation request) case. In the PR, the `std::all_of` function is used to determine whether the aggregation requests would be for struct types. That said, when there are no aggregation requests the `std::all_of` function will return true, causing a fallback to the sort aggregation (relevant code: https://github.com/rapidsai/cudf/pull/9545/files#diff-e409f72ddc11ad10fa0099e21b409b92f12bfac8ba1817266696c34a620aa081R645-R650). I added a benchmark `group_no_requests_benchmark.cu` by mostly copying `group_sum_benchmark.cu` but I changed one critical part. I am re-creating the `groupby` object for each `state`: ``` for (auto _ : state) { cuda_event_timer timer(state, true); cudf::groupby::groupby gb_obj(cudf::table_view({keys}));e auto result = gb_obj.aggregate(requests); } ``` This shows what would happen in the scenario where the `groupby` instance is created each time an aggregate is issued, which would re-create the `helper` each time for the sorted case. If the `groupby` object is not recreated each time, the difference in performance between the before/after cases is negligible. We never recycle a `groupby` instance when using the groupby API from Spark. Posting this as draft for feedback as I am not sure if I handled the benchmark part correctly. This was executed on a T4 GPU. Before the patch: ``` Groupby/BasicNoRequest/10000/manual_time 0.158 ms 0.184 ms 4420 Groupby/BasicNoRequest/1000000/manual_time 1.72 ms 1.74 ms 408 Groupby/BasicNoRequest/10000000/manual_time 18.9 ms 18.9 ms 37 Groupby/BasicNoRequest/100000000/manual_time 198 ms 198 ms 3 ```
Full output

``` 2021-12-12T13:41:08+00:00 Running ./GROUPBY_BENCH Run on (64 X 2801.89 MHz CPU s) CPU Caches: L1 Data 32 KiB (x32) L1 Instruction 32 KiB (x32) L2 Unified 1024 KiB (x32) L3 Unified 22528 KiB (x2) Load Average: 1.01, 0.78, 0.42 -------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------------------------------------- Groupby/Basic/10000/manual_time 0.117 ms 0.143 ms 5906 Groupby/Basic/1000000/manual_time 0.524 ms 0.542 ms 1352 Groupby/Basic/10000000/manual_time 4.41 ms 4.43 ms 159 Groupby/Basic/100000000/manual_time 50.1 ms 50.1 ms 13 Groupby/PreSorted/1000000/manual_time 0.332 ms 0.350 ms 2118 Groupby/PreSorted/10000000/manual_time 2.22 ms 2.23 ms 315 Groupby/PreSorted/100000000/manual_time 22.2 ms 22.2 ms 30 Groupby/PreSortedNth/1000000/manual_time 0.160 ms 0.188 ms 4381 Groupby/PreSortedNth/10000000/manual_time 0.890 ms 0.917 ms 774 Groupby/PreSortedNth/100000000/manual_time 8.43 ms 8.46 ms 68 Groupby/Shift/1000000/manual_time 0.764 ms 0.785 ms 902 Groupby/Shift/10000000/manual_time 9.51 ms 9.53 ms 63 Groupby/Shift/100000000/manual_time 145 ms 145 ms 4 Groupby/Aggregation/10000/manual_time 1.56 ms 1.58 ms 442 Groupby/Aggregation/16384/manual_time 1.59 ms 1.62 ms 435 Groupby/Aggregation/65536/manual_time 1.73 ms 1.76 ms 404 Groupby/Aggregation/262144/manual_time 2.95 ms 2.98 ms 237 Groupby/Aggregation/1048576/manual_time 9.20 ms 9.23 ms 73 Groupby/Aggregation/4194304/manual_time 36.3 ms 36.3 ms 19 Groupby/Aggregation/10000000/manual_time 92.0 ms 92.1 ms 7 Groupby/Scan/10000/manual_time 1.56 ms 1.58 ms 447 Groupby/Scan/16384/manual_time 1.62 ms 1.65 ms 429 Groupby/Scan/65536/manual_time 1.85 ms 1.88 ms 378 Groupby/Scan/262144/manual_time 3.54 ms 3.56 ms 197 Groupby/Scan/1048576/manual_time 12.0 ms 12.0 ms 57 Groupby/Scan/4194304/manual_time 48.6 ms 48.6 ms 14 Groupby/Scan/10000000/manual_time 126 ms 126 ms 4 Groupby/BasicNoRequest/10000/manual_time 0.158 ms 0.184 ms 4420 Groupby/BasicNoRequest/1000000/manual_time 1.72 ms 1.74 ms 408 Groupby/BasicNoRequest/10000000/manual_time 18.9 ms 18.9 ms 37 Groupby/BasicNoRequest/100000000/manual_time 198 ms 198 ms 3 Groupby/PreSortedNoRequests/1000000/manual_time 0.194 ms 0.214 ms 3624 Groupby/PreSortedNoRequests/10000000/manual_time 1.25 ms 1.27 ms 571 Groupby/PreSortedNoRequests/100000000/manual_time 12.6 ms 12.7 ms 50 ```

After the patch: ``` Groupby/BasicNoRequest/10000/manual_time 0.058 ms 0.085 ms 11991 Groupby/BasicNoRequest/1000000/manual_time 0.282 ms 0.301 ms 2478 Groupby/BasicNoRequest/10000000/manual_time 2.42 ms 2.44 ms 291 Groupby/BasicNoRequest/100000000/manual_time 29.2 ms 29.2 ms 21 ```
Full output

``` 2021-12-12T13:37:50+00:00 Running ./GROUPBY_BENCH Run on (64 X 2654.22 MHz CPU s) CPU Caches: L1 Data 32 KiB (x32) L1 Instruction 32 KiB (x32) L2 Unified 1024 KiB (x32) L3 Unified 22528 KiB (x2) Load Average: 0.64, 0.50, 0.26 -------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------------------------------------- Groupby/Basic/10000/manual_time 0.116 ms 0.142 ms 5918 Groupby/Basic/1000000/manual_time 0.523 ms 0.542 ms 1374 Groupby/Basic/10000000/manual_time 4.37 ms 4.39 ms 162 Groupby/Basic/100000000/manual_time 51.4 ms 51.5 ms 10 Groupby/PreSorted/1000000/manual_time 0.331 ms 0.350 ms 2121 Groupby/PreSorted/10000000/manual_time 2.21 ms 2.23 ms 316 Groupby/PreSorted/100000000/manual_time 22.2 ms 22.2 ms 27 Groupby/PreSortedNth/1000000/manual_time 0.160 ms 0.188 ms 4384 Groupby/PreSortedNth/10000000/manual_time 0.888 ms 0.915 ms 775 Groupby/PreSortedNth/100000000/manual_time 8.36 ms 8.39 ms 70 Groupby/Shift/1000000/manual_time 0.764 ms 0.785 ms 904 Groupby/Shift/10000000/manual_time 9.50 ms 9.52 ms 63 Groupby/Shift/100000000/manual_time 146 ms 146 ms 4 Groupby/Aggregation/10000/manual_time 1.53 ms 1.55 ms 446 Groupby/Aggregation/16384/manual_time 1.58 ms 1.61 ms 437 Groupby/Aggregation/65536/manual_time 1.72 ms 1.75 ms 405 Groupby/Aggregation/262144/manual_time 2.93 ms 2.96 ms 236 Groupby/Aggregation/1048576/manual_time 9.18 ms 9.21 ms 74 Groupby/Aggregation/4194304/manual_time 36.2 ms 36.3 ms 19 Groupby/Aggregation/10000000/manual_time 91.5 ms 91.6 ms 7 Groupby/Scan/10000/manual_time 1.55 ms 1.57 ms 452 Groupby/Scan/16384/manual_time 1.60 ms 1.62 ms 434 Groupby/Scan/65536/manual_time 1.84 ms 1.87 ms 379 Groupby/Scan/262144/manual_time 3.54 ms 3.56 ms 198 Groupby/Scan/1048576/manual_time 12.0 ms 12.0 ms 57 Groupby/Scan/4194304/manual_time 48.4 ms 48.4 ms 14 Groupby/Scan/10000000/manual_time 125 ms 125 ms 4 Groupby/BasicNoRequest/10000/manual_time 0.058 ms 0.085 ms 11991 Groupby/BasicNoRequest/1000000/manual_time 0.282 ms 0.301 ms 2478 Groupby/BasicNoRequest/10000000/manual_time 2.42 ms 2.44 ms 291 Groupby/BasicNoRequest/100000000/manual_time 29.2 ms 29.2 ms 21 Groupby/PreSortedNoRequests/1000000/manual_time 0.195 ms 0.215 ms 3604 Groupby/PreSortedNoRequests/10000000/manual_time 1.25 ms 1.27 ms 575 Groupby/PreSortedNoRequests/100000000/manual_time 12.7 ms 12.8 ms 50 ```

Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9891 --- cpp/benchmarks/CMakeLists.txt | 8 +- .../groupby/group_no_requests_benchmark.cu | 115 ++++++++++++++++++ cpp/src/groupby/hash/groupby.cu | 27 ++-- 3 files changed, 131 insertions(+), 19 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_no_requests_benchmark.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 72b247ae748..34a5aebff6f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -156,8 +156,12 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu - groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu + GROUPBY_BENCH + groupby/group_sum_benchmark.cu + groupby/group_nth_benchmark.cu + groupby/group_shift_benchmark.cu + groupby/group_struct_benchmark.cu + groupby/group_no_requests_benchmark.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu new file mode 100644 index 00000000000..7dbe1888cee --- /dev/null +++ b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +class Groupby : public cudf::benchmark { +}; + +// TODO: put it in a struct so `uniform` can be remade with different min, max +template +T random_int(T min, T max) +{ + static unsigned seed = 13377331; + static std::mt19937 engine{seed}; + static std::uniform_int_distribution uniform{min, max}; + + return uniform(engine); +} + +void BM_basic_no_requests(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size); + + std::vector requests; + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + auto result = gb_obj.aggregate(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state) +{ + BM_basic_no_requests(state); +} + +BENCHMARK_REGISTER_F(Groupby, BasicNoRequest) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(10000) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); + +void BM_pre_sorted_no_requests(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + auto valid_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100) < 90; }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size, valid_it); + + auto keys_table = cudf::table_view({keys}); + auto sort_order = cudf::sorted_order(keys_table); + auto sorted_keys = cudf::gather(keys_table, *sort_order); + // No need to sort values using sort_order because they were generated randomly + + std::vector requests; + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES); + auto result = gb_obj.aggregate(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state) +{ + BM_pre_sorted_no_requests(state); +} + +BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e35fa36a289..950cc9727fd 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -635,23 +635,16 @@ std::unique_ptr groupby_null_templated(table_view const& keys, */ bool can_use_hash_groupby(table_view const& keys, host_span requests) { - auto const all_hash_aggregations = - std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { - return cudf::has_atomic_support(r.values.type()) and - std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { - return is_hash_aggregation(a->kind); - }); - }); - - // Currently, structs are not supported in any of hash-based aggregations. - // Therefore, if any request contains structs then we must fallback to sort-based aggregations. - // TODO: Support structs in hash-based aggregations. - auto const has_struct = - std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { - return r.values.type().id() == type_id::STRUCT; - }); - - return all_hash_aggregations && !has_struct; + return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { + // Currently, structs are not supported in any of hash-based aggregations. + // Therefore, if any request contains structs then we must fallback to sort-based aggregations. + // TODO: Support structs in hash-based aggregations. + return not(r.values.type().id() == type_id::STRUCT) and + cudf::has_atomic_support(r.values.type()) and + std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { + return is_hash_aggregation(a->kind); + }); + }); } // Hash-based groupby