From 335862bf5bc8433d5d0896adf8210a13c5a21ab3 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina
Date: Mon, 13 Dec 2021 15:08:39 -0600
Subject: [PATCH 1/2] Fix fallback to sort aggregation for grouping only hash aggregate (#9891)

This fixes what looks like an unintended fallback to sort aggregation, introduced in
https://github.com/rapidsai/cudf/pull/9545, for the grouping-only (no aggregation
request) case. In that PR, `std::all_of` is used to determine whether the aggregation
requests are for struct types. However, when there are no aggregation requests,
`std::all_of` returns true, causing a fallback to sort aggregation (relevant code:
https://github.com/rapidsai/cudf/pull/9545/files#diff-e409f72ddc11ad10fa0099e21b409b92f12bfac8ba1817266696c34a620aa081R645-R650).

I added a benchmark, `group_no_requests_benchmark.cu`, mostly by copying
`group_sum_benchmark.cu`, but with one critical change: the `groupby` object is
re-created for each `state`:

```
for (auto _ : state) {
  cuda_event_timer timer(state, true);
  cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
  auto result = gb_obj.aggregate(requests);
}
```

This shows what happens when a `groupby` instance is created each time an aggregate is
issued, which re-creates the `helper` each time in the sorted case. If the `groupby`
object is not re-created each time, the difference in performance between the
before/after cases is negligible. We never recycle a `groupby` instance when using the
groupby API from Spark.

Posting this as a draft for feedback, as I am not sure whether I handled the benchmark
part correctly.

This was executed on a T4 GPU. Before the patch:

```
Groupby/BasicNoRequest/10000/manual_time                0.158 ms        0.184 ms         4420
Groupby/BasicNoRequest/1000000/manual_time               1.72 ms         1.74 ms          408
Groupby/BasicNoRequest/10000000/manual_time              18.9 ms         18.9 ms           37
Groupby/BasicNoRequest/100000000/manual_time              198 ms          198 ms            3
```
Full output:

```
2021-12-12T13:41:08+00:00
Running ./GROUPBY_BENCH
Run on (64 X 2801.89 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x32)
  L1 Instruction 32 KiB (x32)
  L2 Unified 1024 KiB (x32)
  L3 Unified 22528 KiB (x2)
Load Average: 1.01, 0.78, 0.42
--------------------------------------------------------------------------------------------
Benchmark                                                   Time             CPU   Iterations
--------------------------------------------------------------------------------------------
Groupby/Basic/10000/manual_time                         0.117 ms        0.143 ms         5906
Groupby/Basic/1000000/manual_time                       0.524 ms        0.542 ms         1352
Groupby/Basic/10000000/manual_time                       4.41 ms         4.43 ms          159
Groupby/Basic/100000000/manual_time                      50.1 ms         50.1 ms           13
Groupby/PreSorted/1000000/manual_time                   0.332 ms        0.350 ms         2118
Groupby/PreSorted/10000000/manual_time                   2.22 ms         2.23 ms          315
Groupby/PreSorted/100000000/manual_time                  22.2 ms         22.2 ms           30
Groupby/PreSortedNth/1000000/manual_time                0.160 ms        0.188 ms         4381
Groupby/PreSortedNth/10000000/manual_time               0.890 ms        0.917 ms          774
Groupby/PreSortedNth/100000000/manual_time               8.43 ms         8.46 ms           68
Groupby/Shift/1000000/manual_time                       0.764 ms        0.785 ms          902
Groupby/Shift/10000000/manual_time                       9.51 ms         9.53 ms           63
Groupby/Shift/100000000/manual_time                       145 ms          145 ms            4
Groupby/Aggregation/10000/manual_time                    1.56 ms         1.58 ms          442
Groupby/Aggregation/16384/manual_time                    1.59 ms         1.62 ms          435
Groupby/Aggregation/65536/manual_time                    1.73 ms         1.76 ms          404
Groupby/Aggregation/262144/manual_time                   2.95 ms         2.98 ms          237
Groupby/Aggregation/1048576/manual_time                  9.20 ms         9.23 ms           73
Groupby/Aggregation/4194304/manual_time                  36.3 ms         36.3 ms           19
Groupby/Aggregation/10000000/manual_time                 92.0 ms         92.1 ms            7
Groupby/Scan/10000/manual_time                           1.56 ms         1.58 ms          447
Groupby/Scan/16384/manual_time                           1.62 ms         1.65 ms          429
Groupby/Scan/65536/manual_time                           1.85 ms         1.88 ms          378
Groupby/Scan/262144/manual_time                          3.54 ms         3.56 ms          197
Groupby/Scan/1048576/manual_time                         12.0 ms         12.0 ms           57
Groupby/Scan/4194304/manual_time                         48.6 ms         48.6 ms           14
Groupby/Scan/10000000/manual_time                         126 ms          126 ms            4
Groupby/BasicNoRequest/10000/manual_time                0.158 ms        0.184 ms         4420
Groupby/BasicNoRequest/1000000/manual_time               1.72 ms         1.74 ms          408
Groupby/BasicNoRequest/10000000/manual_time              18.9 ms         18.9 ms           37
Groupby/BasicNoRequest/100000000/manual_time              198 ms          198 ms            3
Groupby/PreSortedNoRequests/1000000/manual_time         0.194 ms        0.214 ms         3624
Groupby/PreSortedNoRequests/10000000/manual_time         1.25 ms         1.27 ms          571
Groupby/PreSortedNoRequests/100000000/manual_time        12.6 ms         12.7 ms           50
```

After the patch:

```
Groupby/BasicNoRequest/10000/manual_time                0.058 ms        0.085 ms        11991
Groupby/BasicNoRequest/1000000/manual_time              0.282 ms        0.301 ms         2478
Groupby/BasicNoRequest/10000000/manual_time              2.42 ms         2.44 ms          291
Groupby/BasicNoRequest/100000000/manual_time             29.2 ms         29.2 ms           21
```
Full output:

```
2021-12-12T13:37:50+00:00
Running ./GROUPBY_BENCH
Run on (64 X 2654.22 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x32)
  L1 Instruction 32 KiB (x32)
  L2 Unified 1024 KiB (x32)
  L3 Unified 22528 KiB (x2)
Load Average: 0.64, 0.50, 0.26
--------------------------------------------------------------------------------------------
Benchmark                                                   Time             CPU   Iterations
--------------------------------------------------------------------------------------------
Groupby/Basic/10000/manual_time                         0.116 ms        0.142 ms         5918
Groupby/Basic/1000000/manual_time                       0.523 ms        0.542 ms         1374
Groupby/Basic/10000000/manual_time                       4.37 ms         4.39 ms          162
Groupby/Basic/100000000/manual_time                      51.4 ms         51.5 ms           10
Groupby/PreSorted/1000000/manual_time                   0.331 ms        0.350 ms         2121
Groupby/PreSorted/10000000/manual_time                   2.21 ms         2.23 ms          316
Groupby/PreSorted/100000000/manual_time                  22.2 ms         22.2 ms           27
Groupby/PreSortedNth/1000000/manual_time                0.160 ms        0.188 ms         4384
Groupby/PreSortedNth/10000000/manual_time               0.888 ms        0.915 ms          775
Groupby/PreSortedNth/100000000/manual_time               8.36 ms         8.39 ms           70
Groupby/Shift/1000000/manual_time                       0.764 ms        0.785 ms          904
Groupby/Shift/10000000/manual_time                       9.50 ms         9.52 ms           63
Groupby/Shift/100000000/manual_time                       146 ms          146 ms            4
Groupby/Aggregation/10000/manual_time                    1.53 ms         1.55 ms          446
Groupby/Aggregation/16384/manual_time                    1.58 ms         1.61 ms          437
Groupby/Aggregation/65536/manual_time                    1.72 ms         1.75 ms          405
Groupby/Aggregation/262144/manual_time                   2.93 ms         2.96 ms          236
Groupby/Aggregation/1048576/manual_time                  9.18 ms         9.21 ms           74
Groupby/Aggregation/4194304/manual_time                  36.2 ms         36.3 ms           19
Groupby/Aggregation/10000000/manual_time                 91.5 ms         91.6 ms            7
Groupby/Scan/10000/manual_time                           1.55 ms         1.57 ms          452
Groupby/Scan/16384/manual_time                           1.60 ms         1.62 ms          434
Groupby/Scan/65536/manual_time                           1.84 ms         1.87 ms          379
Groupby/Scan/262144/manual_time                          3.54 ms         3.56 ms          198
Groupby/Scan/1048576/manual_time                         12.0 ms         12.0 ms           57
Groupby/Scan/4194304/manual_time                         48.4 ms         48.4 ms           14
Groupby/Scan/10000000/manual_time                         125 ms          125 ms            4
Groupby/BasicNoRequest/10000/manual_time                0.058 ms        0.085 ms        11991
Groupby/BasicNoRequest/1000000/manual_time              0.282 ms        0.301 ms         2478
Groupby/BasicNoRequest/10000000/manual_time              2.42 ms         2.44 ms          291
Groupby/BasicNoRequest/100000000/manual_time             29.2 ms         29.2 ms           21
Groupby/PreSortedNoRequests/1000000/manual_time         0.195 ms        0.215 ms         3604
Groupby/PreSortedNoRequests/10000000/manual_time         1.25 ms         1.27 ms          575
Groupby/PreSortedNoRequests/100000000/manual_time        12.7 ms         12.8 ms           50
```
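
As context for the fix: `std::all_of` over an empty range is vacuously true, so with zero aggregation requests the old "all requests are structs" check reported true and forced the sort-based path. Below is a minimal standalone sketch of that behavior and of the folded predicate shape used by the fix; the `request` struct and its `is_struct` flag are simplified stand-ins for cudf's `aggregation_request` and the `type_id::STRUCT` check, not the library's actual types.

```
#include <algorithm>
#include <cassert>
#include <vector>

// Simplified stand-in for an aggregation request (assumption, not the real
// cudf type): only the property relevant to the bug is modeled.
struct request {
  bool is_struct;  // stand-in for r.values.type().id() == type_id::STRUCT
};

int main()
{
  std::vector<request> requests;  // grouping only: no aggregation requests

  // Old check: "are all requests struct-typed?"  std::all_of over an empty
  // range is vacuously true, so this reported true and forced the
  // sort-based fallback even though no structs were involved.
  bool const has_struct =
    std::all_of(requests.begin(), requests.end(),
                [](request const& r) { return r.is_struct; });
  assert(has_struct);

  // Fixed shape: the struct check is part of the per-request predicate, so
  // an empty request list yields true, i.e. "hash groupby is usable".
  bool const can_use_hash =
    std::all_of(requests.begin(), requests.end(),
                [](request const& r) { return !r.is_struct; });
  assert(can_use_hash);
  return 0;
}
```

With the struct check folded into the same predicate, an empty request list makes the combined `std::all_of` vacuously true as well, which now means "hash groupby is usable" rather than triggering the fallback.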

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Nghia Truong (https://github.com/ttnghia)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/cudf/pull/9891

---
 cpp/benchmarks/CMakeLists.txt                 |   8 +-
 .../groupby/group_no_requests_benchmark.cu    | 115 ++++++++++++++++++
 cpp/src/groupby/hash/groupby.cu               |  27 ++--
 3 files changed, 131 insertions(+), 19 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_no_requests_benchmark.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 71b452a1004..63f6857ee08 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -171,8 +171,12 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp)
 # ##################################################################################################
 # * groupby benchmark -----------------------------------------------------------------------------
 ConfigureBench(
-  GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu
-  groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu
+  GROUPBY_BENCH
+  groupby/group_sum_benchmark.cu
+  groupby/group_nth_benchmark.cu
+  groupby/group_shift_benchmark.cu
+  groupby/group_struct_benchmark.cu
+  groupby/group_no_requests_benchmark.cu
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu
new file mode 100644
index 00000000000..7dbe1888cee
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+class Groupby : public cudf::benchmark {
+};
+
+// TODO: put it in a struct so `uniform` can be remade with different min, max
+template <typename T>
+T random_int(T min, T max)
+{
+  static unsigned seed = 13377331;
+  static std::mt19937 engine{seed};
+  static std::uniform_int_distribution<T> uniform{min, max};
+
+  return uniform(engine);
+}
+
+void BM_basic_no_requests(benchmark::State& state)
+{
+  using wrapper = cudf::test::fixed_width_column_wrapper;
+
+  const cudf::size_type column_size{(cudf::size_type)state.range(0)};
+
+  auto data_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100); });
+
+  wrapper keys(data_it, data_it + column_size);
+  wrapper vals(data_it, data_it + column_size);
+
+  std::vector requests;
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
+    auto result = gb_obj.aggregate(requests);
+  }
+}
+
+BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state)
+{
+  BM_basic_no_requests(state);
+}
+
+BENCHMARK_REGISTER_F(Groupby, BasicNoRequest)
+  ->UseManualTime()
+  ->Unit(benchmark::kMillisecond)
+  ->Arg(10000)
+  ->Arg(1000000)
+  ->Arg(10000000)
+  ->Arg(100000000);
+
+void BM_pre_sorted_no_requests(benchmark::State& state)
+{
+  using wrapper = cudf::test::fixed_width_column_wrapper;
+
+  const cudf::size_type column_size{(cudf::size_type)state.range(0)};
+
+  auto data_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100); });
+  auto valid_it = cudf::detail::make_counting_transform_iterator(
+    0, [=](cudf::size_type row) { return random_int(0, 100) < 90; });
+
+  wrapper keys(data_it, data_it + column_size);
+  wrapper vals(data_it, data_it + column_size, valid_it);
+
+  auto keys_table  = cudf::table_view({keys});
+  auto sort_order  = cudf::sorted_order(keys_table);
+  auto sorted_keys = cudf::gather(keys_table, *sort_order);
+  // No need to sort values using sort_order because they were generated randomly
+
+  std::vector requests;
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);
+    auto result = gb_obj.aggregate(requests);
+  }
+}
+
+BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state)
+{
+  BM_pre_sorted_no_requests(state);
+}
+
+BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests)
+  ->UseManualTime()
+  ->Unit(benchmark::kMillisecond)
+  ->Arg(1000000)
+  ->Arg(10000000)
+  ->Arg(100000000);
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 1474cfd8a19..4f2cb4de14b 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -636,23 +636,16 @@ std::unique_ptr groupby(table_view const& keys,
  */
 bool can_use_hash_groupby(table_view const& keys, host_span requests)
 {
-  auto const all_hash_aggregations =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return cudf::has_atomic_support(r.values.type()) and
-             std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-               return is_hash_aggregation(a->kind);
-             });
-    });
-
-  // Currently, structs are not supported in any of hash-based aggregations.
-  // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
-  // TODO: Support structs in hash-based aggregations.
-  auto const has_struct =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return r.values.type().id() == type_id::STRUCT;
-    });
-
-  return all_hash_aggregations && !has_struct;
+  return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+    // Currently, structs are not supported in any of hash-based aggregations.
+    // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
+    // TODO: Support structs in hash-based aggregations.
+    return not(r.values.type().id() == type_id::STRUCT) and
+           cudf::has_atomic_support(r.values.type()) and
+           std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
+             return is_hash_aggregation(a->kind);
+           });
+  });
 }
 
 // Hash-based groupby

From b3b299ae22f31adb8f380d7add1ce2bdb726ab26 Mon Sep 17 00:00:00 2001
From: Michael Wang
Date: Mon, 13 Dec 2021 20:00:37 -0800
Subject: [PATCH 2/2] Break tie for `top` categorical columns in `Series.describe` (#9867)

Closes #9825

This PR fixes a bug where, when there are ties for the most frequent category in a
categorical series, the order returned is undefined. We break the tie by sorting the
tied categories and taking the first one.

Also included in this PR: `describe` is refactored to build its result from a
dictionary of items, for better readability. It also enables
https://github.com/rapidsai/cudf/blob/024003ca444f9d1a8374a1133337419f22cc880a/python/cudf/cudf/tests/test_series.py#L405

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9867

---
 python/cudf/cudf/core/series.py       | 144 ++++++++++++++------------
 python/cudf/cudf/tests/test_series.py |  37 +++----
 2 files changed, 93 insertions(+), 88 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 3ca77105d1b..bbeae1adc5e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles):
             return ["{0}%".format(int(x * 100)) for x in percentiles]
 
         def _format_stats_values(stats_data):
-            return list(map(lambda x: round(x, 6), stats_data))
+            return map(lambda x: round(x, 6), stats_data)
 
         def _describe_numeric(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-            data = (
-                [self.count(), self.mean(), self.std(), self.min()]
-                + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist()
-                + [self.max()]
-            )
-            data = _format_stats_values(data)
+            data = {
+                "count": self.count(),
+                "mean": self.mean(),
+                "std": self.std(),
+                "min": self.min(),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": self.max(),
+            }
 
             return Series(
-                data=data, index=index, nan_as_null=False, name=self.name,
+                data=_format_stats_values(data.values()),
+                index=data.keys(),
+                nan_as_null=False,
+                name=self.name,
             )
 
         def _describe_timedelta(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean()),
-                    str(self.std()),
-                    str(pd.Timedelta(self.min())),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timedelta(self.max()))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(self.mean()),
+                "std": str(self.std()),
+                "min": str(pd.Timedelta(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": str(pd.Timedelta(self.max())),
+            }
 
             return Series(
-                data=data,
-                index=index,
+                data=data.values(),
+                index=data.keys(),
                 dtype="str",
                 nan_as_null=False,
                 name=self.name,
@@ -3365,51 +3369,55 @@ def _describe_timedelta(self):
 
         def _describe_categorical(self):
             # blocked by StringColumn/DatetimeColumn support for
             # value_counts/unique
-            index = ["count", "unique", "top", "freq"]
-            val_counts = self.value_counts(ascending=False)
-            data = [self.count(), self.unique().size]
-
-            if data[1] > 0:
-                top, freq = val_counts.index[0], val_counts.iloc[0]
-                data += [str(top), freq]
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                data += [None, None]
+            data = {
+                "count": self.count(),
+                "unique": len(self.unique()),
+                "top": None,
+                "freq": None,
+            }
+            if data["count"] > 0:
+                # In case there's a tie, break the tie by sorting the index
+                # and take the top.
+                val_counts = self.value_counts(ascending=False)
+                tied_val_counts = val_counts[
+                    val_counts == val_counts.iloc[0]
+                ].sort_index()
+                data.update(
+                    {
+                        "top": tied_val_counts.index[0],
+                        "freq": tied_val_counts.iloc[0],
+                    }
+                )
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
 
         def _describe_timestamp(self):
-
-            index = (
-                ["count", "mean", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean().to_numpy().astype("datetime64[ns]")),
-                    str(pd.Timestamp(self.min().astype("datetime64[ns]"))),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(pd.Timestamp(self.mean())),
+                "min": str(pd.Timestamp(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype(self.dtype)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan),
+                    )
+                ),
+                "max": str(pd.Timestamp((self.max()))),
+            }
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f214e54c57e..1e11e862329 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -12,7 +12,6 @@
 import cudf
 from cudf.testing._utils import (
-    DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
     assert_eq,
@@ -402,30 +401,21 @@ def test_series_describe_numeric(dtype):
     assert_eq(expected, actual)
 
 
-@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/6219")
-@pytest.mark.parametrize("dtype", DATETIME_TYPES)
+@pytest.mark.parametrize("dtype", ["datetime64[ns]"])
 def test_series_describe_datetime(dtype):
+    # Note that other datetime units are not tested because pandas does not
+    # support them. When coarser units are specified, cuDF datetime columns
+    # cannot represent fractional time for quantiles, which may require
+    # interpolation; this differs from pandas, which always stays in [ns].
     gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype)
     ps = gs.to_pandas()
 
-    pdf_results = ps.describe(datetime_is_numeric=True)
-    gdf_results = gs.describe()
-
-    # Assert count
-    p_count = pdf_results["count"]
-    g_count = gdf_results["count"]
-
-    assert_eq(int(g_count), p_count)
-
-    # Assert Index
-    assert_eq(gdf_results.index, pdf_results.index)
+    # Treating datetimes as categoricals is deprecated in pandas and will
+    # be removed in future. Future behavior is treating datetime as numeric.
+    expected = ps.describe(datetime_is_numeric=True)
+    actual = gs.describe()
 
-    # Assert rest of the element apart from
-    # the first index('count')
-    actual = gdf_results.tail(-1).astype("datetime64[ns]")
-    expected = pdf_results.tail(-1).astype("str").astype("datetime64[ns]")
-
-    assert_eq(expected, actual)
+    assert_eq(expected.astype("str"), actual)
 
 
 @pytest.mark.parametrize("dtype", TIMEDELTA_TYPES)
@@ -446,6 +436,13 @@ def test_series_describe_timedelta(dtype):
         pd.Series([True, False, True, True, False]),
         pd.Series([], dtype="str"),
         pd.Series(["a", "b", "c", "a"], dtype="category"),
+        pd.Series(["d", "e", "f"], dtype="category"),
+        pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])),
+        pd.Series(
+            pd.Categorical(
+                ["d", "e", "f"], categories=["f", "e", "d"], ordered=True
+            )
+        ),
     ],
 )
 def test_series_describe_other_types(ps):