From 500f42c16f19ed475569b9bf2d7055a3abfc25ed Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 23 Mar 2021 07:01:37 +0530 Subject: [PATCH] Add groupby scan operations (sort groupby) (#7387) Adds support for groupby scan operations. Addresses part of https://github.com/rapidsai/cudf/issues/1298 cumsum https://github.com/rapidsai/cudf/issues/1296 cumcount - sum - min - max - count Authors: - Karthikeyan (@karthikeyann) - Michael Wang (@isVoid) Approvers: - Vukasin Milovanovic (@vuule) - Jake Hemstad (@jrhemstad) - Nghia Truong (@ttnghia) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7387 --- cpp/CMakeLists.txt | 9 +- .../cudf/detail/groupby/sort_helper.hpp | 4 +- cpp/include/cudf/detail/null_mask.cuh | 2 - cpp/include/cudf/groupby.hpp | 62 +++- cpp/src/bitmask/null_mask.cu | 2 - cpp/src/groupby/groupby.cu | 20 +- .../sort/{groupby.cu => aggregate.cpp} | 96 ++---- cpp/src/groupby/sort/functors.hpp | 96 ++++++ cpp/src/groupby/sort/group_count.cu | 2 +- cpp/src/groupby/sort/group_count_scan.cu | 53 ++++ cpp/src/groupby/sort/group_max.cu | 2 +- cpp/src/groupby/sort/group_max_scan.cu | 36 +++ cpp/src/groupby/sort/group_min_scan.cu | 36 +++ cpp/src/groupby/sort/group_scan.hpp | 88 ++++++ cpp/src/groupby/sort/group_scan_util.cuh | 115 ++++++++ cpp/src/groupby/sort/group_sum_scan.cu | 36 +++ cpp/src/groupby/sort/scan.cpp | 133 +++++++++ cpp/tests/CMakeLists.txt | 6 +- cpp/tests/groupby/group_count_scan_test.cpp | 213 ++++++++++++++ cpp/tests/groupby/group_max_scan_test.cpp | 158 ++++++++++ cpp/tests/groupby/group_min_scan_test.cpp | 173 +++++++++++ cpp/tests/groupby/group_sum_scan_test.cpp | 162 +++++++++++ cpp/tests/groupby/groupby_keys_test.cpp | 273 +++++++++++------- cpp/tests/groupby/groupby_test_util.hpp | 26 ++ 24 files changed, 1610 insertions(+), 193 deletions(-) rename cpp/src/groupby/sort/{groupby.cu => aggregate.cpp} (79%) create mode 100644 cpp/src/groupby/sort/functors.hpp create mode 
100644 cpp/src/groupby/sort/group_count_scan.cu create mode 100644 cpp/src/groupby/sort/group_max_scan.cu create mode 100644 cpp/src/groupby/sort/group_min_scan.cu create mode 100644 cpp/src/groupby/sort/group_scan.hpp create mode 100644 cpp/src/groupby/sort/group_scan_util.cuh create mode 100644 cpp/src/groupby/sort/group_sum_scan.cu create mode 100644 cpp/src/groupby/sort/scan.cpp create mode 100644 cpp/tests/groupby/group_count_scan_test.cpp create mode 100644 cpp/tests/groupby/group_max_scan_test.cpp create mode 100644 cpp/tests/groupby/group_min_scan_test.cpp create mode 100644 cpp/tests/groupby/group_sum_scan_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3e875b71ca6..b663d4f4b6d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -194,7 +194,7 @@ add_library(cudf src/groupby/hash/groupby.cu src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu - src/groupby/sort/groupby.cu + src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu src/groupby/sort/group_count.cu src/groupby/sort/group_max.cu @@ -204,6 +204,11 @@ add_library(cudf src/groupby/sort/group_quantiles.cu src/groupby/sort/group_std.cu src/groupby/sort/group_sum.cu + src/groupby/sort/scan.cpp + src/groupby/sort/group_count_scan.cu + src/groupby/sort/group_max_scan.cu + src/groupby/sort/group_min_scan.cu + src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/interop/dlpack.cpp diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index cadcb1265c4..a68d649b8c8 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -63,8 +63,8 @@ struct sort_groupby_helper { sorted keys_pre_sorted = sorted::NO) : _keys(keys), _num_keys(-1), - _include_null_keys(include_null_keys), - _keys_pre_sorted(keys_pre_sorted) + _keys_pre_sorted(keys_pre_sorted), + _include_null_keys(include_null_keys) { if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and has_nulls(keys)) { diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 93f54cff588..08dae998944 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -23,8 +23,6 @@ #include -using cudf::device_span; - namespace cudf { namespace detail { /** diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index f7f7f51479d..1dfacd53e0d 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,61 @@ class groupby { std::vector const& requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Performs grouped scans on the specified values. + * + * The values to aggregate and the aggregations to perform are specified in an + * `aggregation_request`. Each request contains a `column_view` of values to + * aggregate and a set of `aggregation`s to perform on those elements. + * + * For each `aggregation` in a request, `values[i]` is scan aggregated with + * all previous `values[j]` where rows `i` and `j` in `keys` are equivalent. + * + * The `size()` of the request column must equal `keys.num_rows()`. + * + * For every `aggregation_request` an `aggregation_result` will be returned. + * The `aggregation_result` holds the resulting column(s) for each requested + * aggregation on the `request`s values. The order of the columns in each + * result is the same order as was specified in the request. + * + * The returned `table` contains the group labels for each row, i.e., the + * `keys` given to groupby object. Element `i` across all aggregation results + * belongs to the group at row `i` in the group labels table. + * + * The order of the rows in the group labels is arbitrary. Furthermore, + * successive `groupby::scan` calls may return results in different orders. + * + * @throws cudf::logic_error If `requests[i].values.size() != + * keys.num_rows()`. 
+ * + * Example: + * ``` + * Input: + * keys: {1 2 1 3 1} + * {1 2 1 4 1} + * request: + * values: {3 1 4 9 2} + * aggregations: {{SUM}, {MIN}} + * + * result: + * + * keys: {3 1 1 1 2} + * {4 1 1 1 2} + * values: + * SUM: {9 3 7 9 1} + * MIN: {9 3 3 2 1} + * ``` + * + * @param requests The set of columns to scan and the scans to perform + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * @return Pair containing the table with each group's key and + * a vector of aggregation_results for each request in the same order as + * specified in `requests`. + */ + std::pair, std::vector> scan( + std::vector const& requests, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief The grouped data corresponding to a groupby operation on a set of values. * @@ -231,6 +286,11 @@ class groupby { std::vector const& requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + + std::pair, std::vector> sort_scan( + std::vector const& requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); }; /** @} */ } // namespace groupby diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 60167d77507..845a5512c27 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -44,8 +44,6 @@ #include #include -using cudf::device_span; - namespace cudf { size_type state_null_count(mask_state state, size_type size) { diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 487aed4b411..cdd8ceb0a6c 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,24 @@ std::pair, std::vector> groupby::aggr return dispatch_aggregation(requests, 0, mr); } +// Compute scan requests +std::pair, std::vector> groupby::scan( + std::vector const& requests, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS( + std::all_of(requests.begin(), + requests.end(), + [this](auto const& request) { return request.values.size() == _keys.num_rows(); }), + "Size mismatch between request values and groupby keys."); + + verify_valid_requests(requests); + + if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + + return sort_scan(requests, rmm::cuda_stream_default, mr); +} + groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/aggregate.cpp similarity index 79% rename from cpp/src/groupby/sort/groupby.cu rename to cpp/src/groupby/sort/aggregate.cpp index 5c54dd3cb4c..ace25820ac9 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,8 @@ */ #include -#include "group_reductions.hpp" +#include +#include #include #include @@ -51,71 +52,16 @@ namespace detail { * memoised sorted and/or grouped values and re-using will save on computation * of these values. 
*/ -struct store_result_functor { - store_result_functor(size_type col_idx, - column_view const& values, - sort::sort_groupby_helper& helper, - cudf::detail::result_cache& cache, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr) - { - } - +struct aggregrate_result_functor final : store_result_functor { + using store_result_functor::store_result_functor; template void operator()(aggregation const& agg) { } - - private: - /** - * @brief Get the grouped values - * - * Computes the grouped values from @p values on first invocation and returns - * the stored result on subsequent invocation - */ - column_view get_grouped_values() - { - // TODO (dm): After implementing single pass multi-agg, explore making a - // cache of all grouped value columns rather than one at a time - if (grouped_values) - return grouped_values->view(); - else if (sorted_values) - // TODO (dm): When we implement scan, it wouldn't be ok to return sorted - // values when asked for grouped values. Change this then. 
- return sorted_values->view(); - else - grouped_values = helper.grouped_values(values); - return grouped_values->view(); - }; - - /** - * @brief Get the grouped and sorted values - * - * Computes the grouped and sorted (within each group) values from @p values - * on first invocation and returns the stored result on subsequent invocation - */ - column_view get_sorted_values() - { - if (not sorted_values) sorted_values = helper.sorted_values(values); - return sorted_values->view(); - }; - - private: - size_type col_idx; ///< Index of column in requests being operated on - sort::sort_groupby_helper& helper; ///< Sort helper - cudf::detail::result_cache& cache; ///< cache of results to store into - column_view const& values; ///< Column of values to group and aggregate - - rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels - rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results - - std::unique_ptr sorted_values; ///< Memoised grouped and sorted values - std::unique_ptr grouped_values; ///< Memoised grouped values }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -129,7 +75,7 @@ void store_result_functor::operator()(aggregation cons } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -138,7 +84,7 @@ void store_result_functor::operator()(aggregation const& } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -149,7 +95,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void 
aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -164,7 +110,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -179,7 +125,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -216,7 +162,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -253,7 +199,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -277,7 +223,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -300,7 +246,7 @@ void store_result_functor::operator()(aggregation const& }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -314,7 +260,7 @@ void store_result_functor::operator()(aggregation const& agg) }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if 
(cache.has_result(col_idx, agg)) return; @@ -335,7 +281,7 @@ void store_result_functor::operator()(aggregation const& }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -355,7 +301,7 @@ void store_result_functor::operator()(aggregation const& ag }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -372,7 +318,7 @@ void store_result_functor::operator()(aggregation const& a }; template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -401,7 +347,7 @@ void store_result_functor::operator()(aggregation cons } template <> -void store_result_functor::operator()(aggregation const& agg) +void aggregrate_result_functor::operator()(aggregation const& agg) { auto null_handling = static_cast(agg)._null_handling; @@ -431,7 +377,7 @@ std::pair, std::vector> groupby::sort for (size_t i = 0; i < requests.size(); i++) { auto store_functor = - detail::store_result_functor(i, requests[i].values, helper(), cache, stream, mr); + detail::aggregrate_result_functor(i, requests[i].values, helper(), cache, stream, mr); for (size_t j = 0; j < requests[i].aggregations.size(); j++) { // TODO (dm): single pass compute all supported reductions cudf::detail::aggregation_dispatcher( diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp new file mode 100644 index 00000000000..565320fbe80 --- /dev/null +++ b/cpp/src/groupby/sort/functors.hpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Functor to dispatch aggregation with + * + * This functor is to be used with `aggregation_dispatcher` to compute the + * appropriate aggregation. If the values on which to run the aggregation are + * unchanged, then this functor should be re-used. This is because it stores + * memoised sorted and/or grouped values and re-using will save on computation + * of these values. + */ +struct store_result_functor { + store_result_functor(size_type col_idx, + column_view const& values, + sort::sort_groupby_helper& helper, + cudf::detail::result_cache& cache, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr) + { + } + + protected: + /** + * @brief Get the grouped values + * + * Computes the grouped values from @p values on first invocation and returns + * the stored result on subsequent invocation + */ + column_view get_grouped_values() + { + // TODO (dm): After implementing single pass multi-agg, explore making a + // cache of all grouped value columns rather than one at a time + if (grouped_values) + return grouped_values->view(); + else if (sorted_values) + // In scan, it wouldn't be ok to return sorted values when asked for grouped values. + // It's overridden in scan implementation. 
+ return sorted_values->view(); + else + return (grouped_values = helper.grouped_values(values))->view(); + }; + + /** + * @brief Get the grouped and sorted values + * + * Computes the grouped and sorted (within each group) values from @p values + * on first invocation and returns the stored result on subsequent invocation + */ + column_view get_sorted_values() + { + return sorted_values ? sorted_values->view() + : (sorted_values = helper.sorted_values(values))->view(); + }; + + protected: + size_type col_idx; ///< Index of column in requests being operated on + sort::sort_groupby_helper& helper; ///< Sort helper + cudf::detail::result_cache& cache; ///< cache of results to store into + column_view const& values; ///< Column of values to group and aggregate + + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels + rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results + + std::unique_ptr sorted_values; ///< Memoised grouped and sorted values + std::unique_ptr grouped_values; ///< Memoised grouped values +}; +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu index 60e0ce31db1..121e4bb889d 100644 --- a/cpp/src/groupby/sort/group_count.cu +++ b/cpp/src/groupby/sort/group_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu new file mode 100644 index 00000000000..4ad533aebdc --- /dev/null +++ b/cpp/src/groupby/sort/group_count_scan.cu @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr count_scan(cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::unique_ptr result = make_fixed_width_column( + data_type{type_id::INT32}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + + if (group_labels.empty()) { return result; } + + auto resultview = result->mutable_view(); + // aggregation::COUNT_ALL + thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + thrust::make_constant_iterator(1), + resultview.begin()); + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index bd4e676b83d..3f5592186df 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu new file mode 100644 index 00000000000..303d606be9d --- /dev/null +++ b/cpp/src/groupby/sort/group_max_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr max_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu new file mode 100644 index 00000000000..4a692cdf0bd --- /dev/null +++ b/cpp/src/groupby/sort/group_min_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr min_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp new file mode 100644 index 00000000000..efb39068d2e --- /dev/null +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Internal API to calculate groupwise cumulative sum + * + * @param values Grouped values to get sum of + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr sum_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise cumulative minimum value + * + * @param values Grouped values to get minimum from + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr min_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise cumulative maximum value + * + * @param values Grouped values to get maximum from + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr max_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate cumulative number of values in each group + * + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return Column of type INT32 of count values + */ +std::unique_ptr count_scan(cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh new file mode 100644 index 00000000000..9f8614a61b4 --- /dev/null +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +template +struct scan_functor { + template + static constexpr bool is_supported() + { + if (K == aggregation::SUM) + return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); + else if (K == aggregation::MIN or K == aggregation::MAX) + return cudf::is_fixed_width() and is_relationally_comparable(); + else + return false; + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using DeviceType = device_storage_type_t; + using OpType = cudf::detail::corresponding_operator_t; + using ResultType = cudf::detail::target_type_t; + using ResultDeviceType = device_storage_type_t; + + auto result_type = is_fixed_point() + ? 
data_type{type_to_id(), values.type().scale()} + : data_type{type_to_id()}; + + std::unique_ptr result = + make_fixed_width_column(result_type, values.size(), mask_state::UNALLOCATED, stream, mr); + + if (values.is_empty()) { return result; } + + auto result_table = mutable_table_view({*result}); + cudf::detail::initialize_with_identity(result_table, {K}, stream); + + auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto values_view = column_device_view::create(values, stream); + + if (values.has_nulls()) { + auto input = thrust::make_transform_iterator( + make_null_replacement_iterator(*values_view, OpType::template identity()), + thrust::identity{}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + input, + result_view->begin(), + thrust::equal_to{}, + OpType{}); + result->set_null_mask(cudf::detail::copy_bitmask(values, stream)); + } else { + auto input = thrust::make_transform_iterator(values_view->begin(), + thrust::identity{}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + input, + result_view->begin(), + thrust::equal_to{}, + OpType{}); + } + return result; + } + + template + std::enable_if_t(), std::unique_ptr> operator()(Args&&... args) + { + CUDF_FAIL("Unsupported groupby scan type-agg combination"); + } +}; + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu new file mode 100644 index 00000000000..ae9b1c321d4 --- /dev/null +++ b/cpp/src/groupby/sort/group_sum_scan.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr sum_scan(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp new file mode 100644 index 00000000000..63de4ea8684 --- /dev/null +++ b/cpp/src/groupby/sort/scan.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +/** + * @brief Functor to dispatch a groupby scan aggregation and store its result in the cache + * + * This functor is to be used with `aggregation_dispatcher` to compute the + * appropriate aggregation. If the values on which to run the aggregation are + * unchanged, then this functor should be re-used. This is because it stores + * memoised sorted and/or grouped values and re-using will save on computation + * of these values. + */ +struct scan_result_functor final : store_result_functor { + using store_result_functor::store_result_functor; + template + void operator()(aggregation const& agg) + { + CUDF_FAIL("Unsupported groupby scan aggregation"); + } + + private: + column_view get_grouped_values() + { + // TODO (dm): After implementing single pass multi-agg, explore making a + // cache of all grouped value columns rather than one at a time + if (grouped_values) + return grouped_values->view(); + else + return (grouped_values = helper.grouped_values(values))->view(); + }; +}; + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::sum_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::min_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result( + col_idx, + agg, + detail::max_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); +} + +template <> +void 
scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(), stream, mr)); +} +} // namespace detail + +// Sort-based groupby +std::pair, std::vector> groupby::sort_scan( + std::vector const& requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // We're going to start by creating a cache of results so that aggs that + // depend on other aggs will not have to be recalculated. e.g. mean depends on + // sum and count. std depends on mean and count + cudf::detail::result_cache cache(requests.size()); + + for (size_t i = 0; i < requests.size(); i++) { + auto store_functor = + detail::scan_result_functor(i, requests[i].values, helper(), cache, stream, mr); + for (auto const& aggregation : requests[i].aggregations) { + // TODO (dm): single pass compute all supported reductions + cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation); + } + } + + auto results = detail::extract_results(requests, cache); + + return std::make_pair(helper().sorted_keys(stream, mr), std::move(results)); +} +} // namespace groupby +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e95aab16098..4b2d1e04ac5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -70,7 +70,11 @@ ConfigureTest(GROUPBY_TEST groupby/group_quantile_test.cpp groupby/group_nunique_test.cpp groupby/group_nth_element_test.cpp - groupby/group_collect_test.cpp) + groupby/group_collect_test.cpp + groupby/group_sum_scan_test.cpp + groupby/group_min_scan_test.cpp + groupby/group_max_scan_test.cpp + groupby/group_count_scan_test.cpp) ################################################################################################### # - join tests ------------------------------------------------------------------------------------ diff --git a/cpp/tests/groupby/group_count_scan_test.cpp 
b/cpp/tests/groupby/group_count_scan_test.cpp new file mode 100644 index 00000000000..b7b18982f51 --- /dev/null +++ b/cpp/tests/groupby/group_count_scan_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_count_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_count_scan_test, cudf::test::AllTypes); + +TYPED_TEST(groupby_count_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + "Unsupported groupby scan aggregation"); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, 
vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys; + value_wrapper vals; + + key_wrapper expect_keys; + result_wrapper expect_vals; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1))); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3}, all_null()); + value_wrapper vals{3, 4, 5}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys{1, 1, 1}; + result_wrapper expect_vals{0, 1, 2}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +TYPED_TEST(groupby_count_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 
1, 1}); + value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( {1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // {0, 3, 6, 1, 4, _, 9, 2, 7, 8, -} + result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 0}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +struct groupby_count_scan_string_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_count_scan_string_test, basic) +{ + using V = cudf::string_view; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + // clang-format off + key_wrapper keys { 1, 3, 3, 5, 5, 0}; + strings_column_wrapper vals{"1", "1", "1", "1", "1", "1"}; + + key_wrapper expect_keys {0, 1, 3, 3, 5, 5}; + result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; + // clang-format on + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupByCountScan) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + using V = decimalXX; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + auto const scale = scale_type{-1}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = result_wrapper{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + // clang-format on + + 
CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()), + "Unsupported groupby scan aggregation"); + + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); +} + +struct groupby_dictionary_count_scan_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_dictionary_count_scan_test, basic) +{ + using V = std::string; + using R = cudf::detail::target_type_t; + using result_wrapper = fixed_width_column_wrapper; + + // clang-format off + strings_column_wrapper keys{"1", "3", "3", "5", "5", "0"}; + dictionary_column_wrapper vals{1, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_keys{"0", "1", "3", "3", "5", "5"}; + result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; + // clang-format on + + auto agg1 = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + "Unsupported groupby scan aggregation"); + test_single_scan( + keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE)); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_max_scan_test.cpp b/cpp/tests/groupby/group_max_scan_test.cpp new file mode 100644 index 00000000000..c1fc48ca698 --- /dev/null +++ b/cpp/tests/groupby/group_max_scan_test.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_max_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_max_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4}); + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7}); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3}, all_null()); + value_wrapper vals({3, 4, 5}); + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + 
auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({-1, -1, -1}, all_null()); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_max_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( {1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { -, 8, 1, 6, 9, -, 4, 7, _, 3, -} + result_wrapper expect_vals({-1, 8, 8, 6, 9, -1, 9, 7, 7, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = 
fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale}; + + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale}; + // clang-format on + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_min_scan_test.cpp b/cpp/tests/groupby/group_min_scan_test.cpp new file mode 100644 index 00000000000..d3186d880cc --- /dev/null +++ b/cpp/tests/groupby/group_min_scan_test.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_min_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +TYPED_TEST_CASE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_min_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4}); + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2}); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys({1, 2, 3}, all_null()); + value_wrapper vals({3, 4, 5}); + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, 
zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({-1, -1, -1}, all_null()); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_min_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // { 1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4} + key_wrapper expect_keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { _, 8, 1, 6, 9, _, 4, 7, 2, 3, _} + result_wrapper expect_vals({-1, 8, 1, 6, 6, -1, 4, 7, 3, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +struct groupby_min_scan_string_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_min_scan_string_test, basic) +{ + key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + strings_column_wrapper vals{"año", "bit", "₹1", "aaa", "zit", "bat", "aaa", "$1", "₹1", "wut"}; + + key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + strings_column_wrapper expect_vals; + + auto agg = cudf::make_min_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)), + "Unsupported groupby scan type-agg combination"); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, 
cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale}; + + // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3} + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale}; + // clang-format on + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/group_sum_scan_test.cpp b/cpp/tests/groupby/group_sum_scan_test.cpp new file mode 100644 index 00000000000..9f6c21462b3 --- /dev/null +++ b/cpp/tests/groupby/group_sum_scan_test.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { +using K = int32_t; +using key_wrapper = fixed_width_column_wrapper; + +template +struct groupby_sum_scan_test : public cudf::test::BaseFixture { + using V = T; + using R = cudf::detail::target_type_t; + using value_wrapper = fixed_width_column_wrapper; + using result_wrapper = fixed_width_column_wrapper; +}; + +using supported_types = + cudf::test::Concat, + cudf::test::DurationTypes>; + +TYPED_TEST_CASE(groupby_sum_scan_test, supported_types); + +TYPED_TEST(groupby_sum_scan_test, basic) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + key_wrapper expect_keys {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + // {0, 3, 6, 1, 4, 5, 9, 2, 7, 8} + result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}; + // clang-format on + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, empty_cols) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys{}; + value_wrapper vals{}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, zero_valid_keys) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys({1, 2, 3}, all_null()); + value_wrapper vals{3, 4, 5}; + + key_wrapper expect_keys{}; + result_wrapper expect_vals{}; + // clang-format on + + auto agg = 
cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, zero_valid_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys {1, 1, 1}; + value_wrapper vals({3, 4, 5}, all_null()); + + key_wrapper expect_keys {1, 1, 1}; + result_wrapper expect_vals({3, 4, 5}, all_null()); + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TYPED_TEST(groupby_sum_scan_test, null_keys_and_values) +{ + using value_wrapper = typename TestFixture::value_wrapper; + using result_wrapper = typename TestFixture::result_wrapper; + + // clang-format off + key_wrapper keys( {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // { 1, 1, 1, 2, 2, 2, 2, 3, *, 3, 4}; + key_wrapper expect_keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, all_valid()); + // { -, 3, 6, 1, 4, -, 9, 2, _, 8, -} + result_wrapper expect_vals({-1, 3, 9, 1, 5, -1, 14, 2, 10, -1}, + { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = fixed_point_column_wrapper; + using out_fp_wrapper = fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = key_wrapper{1, 2, 3, 
1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + + auto const expect_keys = key_wrapper {1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; + // clang-format on + + auto agg2 = cudf::make_sum_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2)); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/groupby_keys_test.cpp b/cpp/tests/groupby/groupby_keys_test.cpp index 06ec9eb8968..78299e1a18c 100644 --- a/cpp/tests/groupby/groupby_keys_test.cpp +++ b/cpp/tests/groupby/groupby_keys_test.cpp @@ -33,166 +33,229 @@ using supported_types = cudf::test:: TYPED_TEST_CASE(groupby_keys_test, supported_types); -// clang-format off TYPED_TEST(groupby_keys_test, basic) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + // clang-format off + fixed_width_column_wrapper keys { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - fixed_width_column_wrapper expect_keys { 1, 2, 3 }; - fixed_width_column_wrapper expect_vals { 3, 4, 3 }; + fixed_width_column_wrapper expect_keys { 1, 2, 3 }; + fixed_width_column_wrapper expect_vals { 3, 4, 3 }; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } TYPED_TEST(groupby_keys_test, zero_valid_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = 
cudf::detail::target_type_t; - fixed_width_column_wrapper keys ( { 1, 2, 3}, all_null() ); - fixed_width_column_wrapper vals { 3, 4, 5}; + // clang-format off + fixed_width_column_wrapper keys ( { 1, 2, 3}, all_null() ); + fixed_width_column_wrapper vals { 3, 4, 5}; - fixed_width_column_wrapper expect_keys { }; - fixed_width_column_wrapper expect_vals { }; + fixed_width_column_wrapper expect_keys { }; + fixed_width_column_wrapper expect_vals { }; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } TYPED_TEST(groupby_keys_test, some_null_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, - { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4} - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); - // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -} - fixed_width_column_wrapper expect_vals { 3, 4, 2, 1}; + // clang-format off + fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4} + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); + // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -} + fixed_width_column_wrapper expect_vals { 3, 4, 2, 1}; + // clang-format on - auto agg = cudf::make_count_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + auto agg = cudf::make_count_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } 
TYPED_TEST(groupby_keys_test, include_null_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, - { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - - // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, -} - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4, 3}, - { 1, 1, 1, 1, 0}); - // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -, -} - fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7}; - - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::NO, null_policy::INCLUDE); + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys( { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, -} + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4, 3}, + { 1, 1, 1, 1, 0}); + // { 0, 3, 6, 1, 4, 5, 9, 2, 8, -, -} + fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::NO, + null_policy::INCLUDE); } TYPED_TEST(groupby_keys_test, pre_sorted_keys) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys { 1, 2, 3, 4}; - 
fixed_width_column_wrapper expect_vals { 3, 18, 24, 4}; + fixed_width_column_wrapper expect_keys { 1, 2, 3, 4}; + fixed_width_column_wrapper expect_vals { 3, 18, 24, 4}; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - fixed_width_column_wrapper keys { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys { 4, 3, 2, 1 }; - fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 }; + fixed_width_column_wrapper expect_keys { 4, 3, 2, 1 }; + fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 }; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES, {order::DESCENDING}); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES, + {order::DESCENDING}); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; - 
fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, - { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + // clang-format off + fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, + { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); - fixed_width_column_wrapper expect_vals { 3, 15, 17, 4}; + fixed_width_column_wrapper expect_keys({ 1, 2, 3, 4}, all_valid()); + fixed_width_column_wrapper expect_vals { 3, 15, 17, 4}; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::EXCLUDE, + sorted::YES); } TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls) { - using K = TypeParam; - using V = int32_t; - using R = cudf::detail::target_type_t; - - fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, - { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; - - // { 1, 1, 1, -, -, 2, 2, -, 3, 3, 4} - fixed_width_column_wrapper expect_keys({ 1, 2, 2, 3, 3, 4}, - { 1, 0, 1, 0, 1, 1}); - fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4}; - - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), - force_use_sort_impl::YES, null_policy::INCLUDE, sorted::YES); + using K = TypeParam; + using V = int32_t; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys( { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4}, + { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper vals { 
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}; + + // { 1, 1, 1, -, -, 2, 2, -, 3, 3, 4} + fixed_width_column_wrapper expect_keys({ 1, 2, 2, 3, 3, 4}, + { 1, 0, 1, 0, 1, 1}); + fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4}; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::YES, + null_policy::INCLUDE, + sorted::YES); +} + +TYPED_TEST(groupby_keys_test, mismatch_num_rows) +{ + using K = TypeParam; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4}; + + auto agg = cudf::make_count_aggregation(); + CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)), + "Size mismatch between request values and groupby keys."); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg)), + "Size mismatch between request values and groupby keys."); } -struct groupby_string_keys_test : public cudf::test::BaseFixture {}; +struct groupby_string_keys_test : public cudf::test::BaseFixture { +}; TEST_F(groupby_string_keys_test, basic) { - using V = int32_t; - using R = cudf::detail::target_type_t; + using V = int32_t; + using R = cudf::detail::target_type_t; - strings_column_wrapper keys { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"}; - fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + // clang-format off + strings_column_wrapper keys { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"}; + fixed_width_column_wrapper vals { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - strings_column_wrapper expect_keys({ "aaa", "año", "₹1" }); - fixed_width_column_wrapper expect_vals { 9, 19, 17 }; + strings_column_wrapper expect_keys({ "aaa", "año", "₹1" }); + fixed_width_column_wrapper expect_vals { 9, 19, 17 }; + // clang-format on - auto agg = cudf::make_sum_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, 
std::move(agg)); + auto agg = cudf::make_sum_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } // clang-format on diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 0b68b7bbfb4..c7e27cd6367 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -99,6 +99,32 @@ inline void test_single_agg(column_view const& keys, } } +inline void test_single_scan(column_view const& keys, + column_view const& values, + column_view const& expect_keys, + column_view const& expect_vals, + std::unique_ptr&& agg, + null_policy include_null_keys = null_policy::EXCLUDE, + sorted keys_are_sorted = sorted::NO, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}) +{ + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = values; + + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj( + table_view({keys}), include_null_keys, keys_are_sorted, column_order, null_precedence); + + // groupby scan uses sort implementation + auto result = gb_obj.scan(requests); + + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({expect_keys}), result.first->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, *result.second[0].results[0], true); +} + inline auto all_valid() { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });