From d3351537089c18d9f30efe3d8a769ecd2d0f8187 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 May 2022 13:14:15 -0400 Subject: [PATCH 01/31] Use new row hasher and comparator --- cpp/src/groupby/hash/groupby.cu | 39 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e22b3a4f3a4..60538275b28 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -421,7 +421,7 @@ void sparse_to_dense_results(table_view const& keys, * @brief Construct hash map that uses row comparator and row hasher on * `d_keys` table and stores indices */ -auto create_hash_map(table_device_view const& d_keys, +auto create_hash_map(table_view const& keys, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream) @@ -429,28 +429,30 @@ auto create_hash_map(table_device_view const& d_keys, size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; - using map_type = - concurrent_unordered_map, - row_equality_comparator>; + using map_type = concurrent_unordered_map< + cudf::size_type, + cudf::size_type, + cudf::experimental::row::hash::device_row_hasher, + cudf::experimental::row::equality::device_row_comparator>; using allocator_type = typename map_type::allocator_type; auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; + auto has_null = nullate::DYNAMIC{keys_have_nulls}; - row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, - d_keys}; - row_equality_comparator rows_equal{ - nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal}; + auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); + cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); + auto key_equal = row_equal.device_comparator(has_null, null_keys_are_equal); + auto row_hash = cudf::experimental::row::hash::row_hasher(preprocessed_keys); - return map_type::create(compute_hash_table_size(d_keys.num_rows()), + return map_type::create(compute_hash_table_size(keys.num_rows()), stream, unused_key, unused_value, - hasher, - rows_equal, + row_hash.device_hasher(has_null), + key_equal, allocator_type()); } @@ -509,8 +511,8 @@ void compute_single_pass_aggs(table_view const& keys, auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); auto const d_aggs = cudf::detail::make_device_uvector_async(agg_kinds, stream); - - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; + auto const skip_key_rows_with_nulls = + keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto row_bitmask = skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; @@ -594,8 +596,7 @@ std::unique_ptr groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto d_keys_ptr = table_device_view::create(keys, stream); - auto map = create_hash_map(*d_keys_ptr, keys_have_nulls, include_null_keys, stream); + auto map = create_hash_map(keys, keys_have_nulls, include_null_keys, stream); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map @@ -670,7 +671,7 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, has_nulls(keys), include_null_keys, stream, mr); + groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } From b227ec415c37369a6ea45a5b7276af230e695dea Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 May 2022 18:17:57 -0400 Subject: [PATCH 02/31] Get rid of Map template --- cpp/src/groupby/hash/groupby.cu | 49 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 60538275b28..fb799ccbc1d 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -65,6 +65,13 @@ namespace detail { namespace hash { namespace { +using map_type = concurrent_unordered_map< + cudf::size_type, + cudf::size_type, + cudf::experimental::row::hash::device_row_hasher, + cudf::experimental::row::equality::device_row_comparator>; + /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -179,14 +186,13 @@ class groupby_simple_aggregations_collector final } }; -template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; data_type result_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - Map const& map; + map_type const& map; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -198,7 +204,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - Map const& map, + map_type const& map, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -327,7 +333,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor{ + ::cudf::detail::var_hash_functor{ map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); @@ -385,14 +391,12 @@ flatten_single_pass_aggs(host_span requests) * * @see groupby_null_templated() */ -template void sparse_to_dense_results(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - // size_type map_size, - Map const& map, + map_type const& map, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream, @@ -409,7 +413,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( + auto finalizer = hash_compound_agg_finalizer( col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); @@ -429,18 +433,11 @@ auto create_hash_map(table_view const& keys, size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; - using map_type = concurrent_unordered_map< - cudf::size_type, - cudf::size_type, - cudf::experimental::row::hash::device_row_hasher, - cudf::experimental::row::equality::device_row_comparator>; - using allocator_type = typename map_type::allocator_type; auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto has_null = nullate::DYNAMIC{keys_have_nulls}; + auto const has_null = nullate::DYNAMIC{keys_have_nulls}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); @@ -493,11 +490,10 @@ auto create_sparse_results_table(table_view const& flattened_values, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template void compute_single_pass_aggs(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, - Map& map, + map_type& map, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream) @@ -520,13 +516,13 @@ void compute_single_pass_aggs(table_view const& keys, rmm::exec_policy(stream), thrust::make_counting_iterator(0), keys.num_rows(), - hash::compute_single_pass_aggs_fn{map, - keys.num_rows(), - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + hash::compute_single_pass_aggs_fn{map, + keys.num_rows(), + *d_values, + *d_sparse_table, + d_aggs.data(), + static_cast(row_bitmask.data()), + skip_key_rows_with_nulls}); // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -540,8 +536,7 @@ void compute_single_pass_aggs(table_view const& keys, * @brief Computes and returns a device vector containing all populated keys in * `map`. */ -template -rmm::device_uvector extract_populated_keys(Map map, +rmm::device_uvector extract_populated_keys(map_type const& map, size_type num_keys, rmm::cuda_stream_view stream) { From 3deb64ba4ee44d368534834b6a9e3b67def6f9dc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 5 May 2022 11:24:56 -0400 Subject: [PATCH 03/31] Fix a bug: update the lifecycle of preprocessed table --- cpp/src/groupby/hash/groupby.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index fb799ccbc1d..b6d8c699ecf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -425,10 +425,12 @@ void sparse_to_dense_results(table_view const& keys, * @brief Construct hash map that uses row comparator and row hasher on * `d_keys` table and stores indices */ -auto create_hash_map(table_view const& keys, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) +auto create_hash_map( + std::shared_ptr const& preprocessed_keys, + cudf::size_type const num_keys, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream) { size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; @@ -439,12 +441,11 @@ auto create_hash_map(table_view const& keys, include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; auto const has_null = nullate::DYNAMIC{keys_have_nulls}; - auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); auto key_equal = row_equal.device_comparator(has_null, null_keys_are_equal); auto row_hash = cudf::experimental::row::hash::row_hasher(preprocessed_keys); - return map_type::create(compute_hash_table_size(keys.num_rows()), + return map_type::create(compute_hash_table_size(num_keys), stream, unused_key, unused_value, @@ -591,7 +592,10 @@ std::unique_ptr
groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto map = create_hash_map(keys, keys_have_nulls, include_null_keys, stream); + auto const num_keys = keys.num_rows(); + auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); + auto map = + create_hash_map(preprocessed_keys, num_keys, keys_have_nulls, include_null_keys, stream); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map From bf94a8d63913eeb47e474be54bb1182ecc04f830 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 May 2022 16:34:02 -0400 Subject: [PATCH 04/31] Get rid of flattened columns --- cpp/src/groupby/groupby.cu | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a002b0bb744..e0503953d10 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -75,15 +75,7 @@ std::pair, std::vector> groupby::disp // satisfied with a hash implementation if (_keys_are_sorted == sorted::NO and not _helper and detail::hash::can_use_hash_groupby(_keys, requests)) { - // Optionally flatten nested key columns. - auto flattened = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); - auto flattened_keys = flattened.flattened_columns(); - auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; - CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), - "Unsupported groupby key type does not support equality comparison"); - auto [grouped_keys, results] = - detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); - return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results)); + return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr); } else { return sort_aggregate(requests, stream, mr); } From 80d8f876f358e9fbf47b329cc5f054006e841344 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 9 May 2022 12:59:58 -0400 Subject: [PATCH 05/31] Fix a bug: keys always have nulls --- cpp/src/groupby/hash/groupby.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b6d8c699ecf..f4765020625 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -428,7 +428,6 @@ void sparse_to_dense_results(table_view const& keys, auto create_hash_map( std::shared_ptr const& preprocessed_keys, cudf::size_type const num_keys, - bool const keys_have_nulls, null_policy const include_null_keys, rmm::cuda_stream_view stream) { @@ -439,7 +438,7 @@ auto create_hash_map( auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto const has_null = nullate::DYNAMIC{keys_have_nulls}; + auto const has_null = nullate::DYNAMIC{true}; // always has null TODO: why?? cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); auto key_equal = row_equal.device_comparator(has_null, null_keys_are_equal); @@ -594,8 +593,7 @@ std::unique_ptr
groupby(table_view const& keys, { auto const num_keys = keys.num_rows(); auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); - auto map = - create_hash_map(preprocessed_keys, num_keys, keys_have_nulls, include_null_keys, stream); + auto map = create_hash_map(preprocessed_keys, num_keys, include_null_keys, stream); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map From 5f704ecfcfa7dadb958039dfc9d4a3f24af61408 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 9 May 2022 13:22:44 -0400 Subject: [PATCH 06/31] Pass shared_ptr of preprocessed table by value --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f4765020625..1a43b0067b7 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -426,7 +426,7 @@ void sparse_to_dense_results(table_view const& keys, * `d_keys` table and stores indices */ auto create_hash_map( - std::shared_ptr const& preprocessed_keys, + std::shared_ptr preprocessed_keys, cudf::size_type const num_keys, null_policy const include_null_keys, rmm::cuda_stream_view stream) @@ -442,7 +442,7 @@ auto create_hash_map( cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); auto key_equal = row_equal.device_comparator(has_null, null_keys_are_equal); - auto row_hash = cudf::experimental::row::hash::row_hasher(preprocessed_keys); + auto row_hash = cudf::experimental::row::hash::row_hasher(std::move(preprocessed_keys)); return map_type::create(compute_hash_table_size(num_keys), stream, From 6de7c0b5157c390c9f61d380ac140e8e39577963 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 May 2022 12:31:57 -0400 Subject: [PATCH 07/31] Add structs argmax unit tests --- cpp/tests/groupby/argmax_tests.cpp | 57 +++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index 0b06c184b75..ea6a0414d4c 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,6 +118,61 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } +template +using FWCW = cudf::test::fixed_width_column_wrapper; + +TYPED_TEST(groupby_argmax_test, structs) +{ + using V = TypeParam; + + using R = cudf::detail::target_type_t; + using STRINGS = cudf::test::strings_column_wrapper; + using STRUCTS = cudf::test::structs_column_wrapper; + + if (std::is_same_v) return; + + /* + `@` indicates null + keys: values: + /+---------------+ + |s1{s2{a,b}, c}| + +----------------+ + 0 | { {1, 1}, "a"}| 1 + 1 | { {1, 2}, "b"}| 2 + 2 | {@{2, 1}, "c"}| 3 + 3 | {@{2, 1}, "c"}| 4 + 4 | @{ {2, 2}, "d"}| 5 + 5 | @{ {2, 2}, "d"}| 6 + 6 | { {1, 1}, "a"}| 7 + 7 | {@{2, 1}, "c"}| 8 + 8 | { {1, 1}, "a"}| 9 + +----------------+ + */ + + // clang-format off + auto col_a = FWCW{ 1, 1, 2, 2, 2, 2, 1, 2, 1}; + auto col_b = FWCW{ 1, 2, 1, 1, 2, 2, 1, 1, 1}; + auto col_c = STRINGS{"a", "b", "c", "c", "d", "d", "a", "c", "a"}; + // clang-format on + auto s2 = STRUCTS{{col_a, col_b}, nulls_at({2, 3, 7})}; + + auto keys = STRUCTS{{s2, col_c}, nulls_at({4, 5})}; + auto vals = FWCW{1, 2, 3, 4, 5, 6, 7, 8, 9}; + + // clang-format off + auto expected_col_a = FWCW{ 1, 1, 2, 2, 2}; + auto expected_col_b = FWCW{ 1, 2, 1, 1, 1}; + auto expected_col_c = STRINGS{"a", "b", "c", "c", "c"}; + // clang-format on + auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, nulls_at({2, 3, 4})}; + + auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()}; + auto expect_vals = FWCW{8, 1, 2, 3, 7}; + + auto agg = cudf::make_argmax_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + struct groupby_argmax_string_test : public cudf::test::BaseFixture { }; From 70d740fb5b8a13a324322d1bc7c034b60c76287c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 May 2022 18:46:33 -0400 Subject: [PATCH 08/31] Add basic list tests --- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/groupby/lists_tests.cpp | 69 ------------ cpp/tests/groupby/lists_tests.cu | 172 ++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 70 deletions(-) delete mode 100644 cpp/tests/groupby/lists_tests.cpp create mode 100644 cpp/tests/groupby/lists_tests.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 95c54d7596e..d22e8735260 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,7 +83,7 @@ ConfigureTest( groupby/covariance_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp - groupby/lists_tests.cpp + groupby/lists_tests.cu groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp diff --git a/cpp/tests/groupby/lists_tests.cpp b/cpp/tests/groupby/lists_tests.cpp deleted file mode 100644 index 11b8ffa92b9..00000000000 --- a/cpp/tests/groupby/lists_tests.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace test { - -template -struct groupby_lists_test : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes); - -namespace { -// Checking with a single aggregation, and aggregation column. -// This test is orthogonal to the aggregation type; it focuses on testing the grouping -// with LISTS keys. -auto sum_agg() { return cudf::make_sum_aggregation(); } - -void test_sort_based_sum_agg(column_view const& keys, column_view const& values) -{ - test_single_agg( - keys, values, keys, values, sum_agg(), force_use_sort_impl::YES, null_policy::INCLUDE); -} - -void test_hash_based_sum_agg(column_view const& keys, column_view const& values) -{ - test_single_agg( - keys, values, keys, values, sum_agg(), force_use_sort_impl::NO, null_policy::INCLUDE); -} - -} // namespace - -TYPED_TEST(groupby_lists_test, top_level_lists_are_unsupported) -{ - // Test that grouping on LISTS columns fails visibly. - - // clang-format off - auto keys = lists_column_wrapper { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; - auto values = fixed_width_column_wrapper { 0, 1, 2, 3, 4 }; - // clang-format on - - EXPECT_THROW(test_sort_based_sum_agg(keys, values), cudf::logic_error); - EXPECT_THROW(test_hash_based_sum_agg(keys, values), cudf::logic_error); -} - -} // namespace test -} // namespace cudf diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu new file mode 100644 index 00000000000..33d0884febd --- /dev/null +++ b/cpp/tests/groupby/lists_tests.cu @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "groupby_test_util.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf { +namespace test { + +template +struct groupby_lists_test : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes); + +using namespace cudf::test::iterators; + +using R = cudf::detail::target_type_t; // Type of aggregation result. +using strings = strings_column_wrapper; +using structs = structs_column_wrapper; + +template +using fwcw = cudf::test::fixed_width_column_wrapper; + +template +using lcw = cudf::test::lists_column_wrapper; + +namespace { +// Checking with a single aggregation, and aggregation column. +// This test is orthogonal to the aggregation type; it focuses on testing the grouping +// with LISTS keys. +auto sum_agg() { return cudf::make_sum_aggregation(); } + +// TODO: this is a naive way to compare expected key/value against resulting key/value. To be +// replaced once list lex comparator is supported (https://github.com/rapidsai/cudf/issues/5890) +template +struct match_expected_fn { + match_expected_fn(cudf::size_type const num_rows, Equal equal) + : _num_rows{num_rows}, _equal{equal} + { + } + + __device__ bool operator()(cudf::size_type const idx) + { + for (auto i = _num_rows; i < 2 * _num_rows; i++) { + if (_equal(idx, i)) { return true; } + } + return false; + } + + cudf::size_type const _num_rows; + Equal _equal; +}; + +inline void test_hash_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expect_keys, + column_view const& expect_vals) +{ + auto agg = cudf::make_sum_aggregation(); + auto const include_null_keys = null_policy::INCLUDE; + auto const keys_are_sorted = sorted::NO; + + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = values; + + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj(cudf::table_view({keys}), include_null_keys, keys_are_sorted); + + auto result = gb_obj.aggregate(requests); + + cudf::table_view result_kv{ + {result.first->get_column(0).view(), result.second[0].results[0]->view()}}; + cudf::table_view expected_kv{{expect_keys, expect_vals}}; + + auto const num_rows = result_kv.num_rows(); + EXPECT_EQ(num_rows, expected_kv.num_rows()); + + // Concatenate expected table and resulting table into one unique table `t`: + // expected table: `t [ 0, num_rows - 1]` + // resulting table: `t [num_rows, 2 * num_rows - 1]` + auto combined_table = cudf::concatenate(std::vector{expected_kv, result_kv}); + auto preprocessed_t = cudf::experimental::row::hash::preprocessed_table::create( + combined_table->view(), rmm::cuda_stream_default); + cudf::experimental::row::equality::self_comparator comparator(preprocessed_t); + + auto const null_keys_are_equal = + include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; + auto row_equal = comparator.device_comparator(nullate::DYNAMIC{true}, null_keys_are_equal); + auto func = match_expected_fn{num_rows, row_equal}; + + // For each row in expected table `t[0, num_rows)`, there must be a match + // in the resulting table `t[num_rows, 2 * num_rows)` + EXPECT_TRUE(thrust::all_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func)); +} + +void test_sort_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expect_keys, + column_view const& expect_vals) +{ + test_single_agg(keys, + values, + expect_keys, + expect_vals, + sum_agg(), + force_use_sort_impl::YES, + null_policy::INCLUDE); +} + +void test_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ + EXPECT_THROW(test_sort_based_sum_agg(keys, values, expected_keys, expected_values), + cudf::logic_error); + test_hash_based_sum_agg(keys, values, expected_keys, expected_values); +} +} // namespace + +TYPED_TEST(groupby_lists_test, basic) +{ + if (std::is_same_v) { return; } + + // clang-format off + auto keys = lcw { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto values = fwcw { 0, 1, 2, 3, 4 }; + + auto expected_keys = lcw { {1,1}, {2,2}, {3,3} }; + auto expected_values = fwcw { 3, 5, 2 }; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +} // namespace test +} // namespace cudf From 1a700167a1cd2119e96cd02f3ff47574ea21ef7f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 May 2022 19:42:27 -0400 Subject: [PATCH 09/31] Add all null input tests --- cpp/tests/groupby/lists_tests.cu | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index 33d0884febd..8fd30a75050 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -56,6 +56,8 @@ template using lcw = cudf::test::lists_column_wrapper; namespace { +static constexpr auto null = -1; + // Checking with a single aggregation, and aggregation column. // This test is orthogonal to the aggregation type; it focuses on testing the grouping // with LISTS keys. @@ -158,11 +160,24 @@ TYPED_TEST(groupby_lists_test, basic) if (std::is_same_v) { return; } // clang-format off - auto keys = lcw { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; - auto values = fwcw { 0, 1, 2, 3, 4 }; + auto keys = lcw { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto values = fwcw { 0, 1, 2, 3, 4 }; + + auto expected_keys = lcw { {1,1}, {2,2}, {3,3} }; + auto expected_values = fwcw { 3, 5, 2 }; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_lists_test, all_null_input) +{ + // clang-format off + auto keys = lcw { {{1,1}, {2,2}, {3,3}, {1,1}, {2,2}}, all_nulls()}; + auto values = fwcw { 0, 1, 2, 3, 4 }; - auto expected_keys = lcw { {1,1}, {2,2}, {3,3} }; - auto expected_values = fwcw { 3, 5, 2 }; + auto expected_keys = lcw { {{null,null}}, all_nulls()}; + auto expected_values = fwcw { 10 }; // clang-format on test_sum_agg(keys, values, expected_keys, expected_values); From 965eba466db32b036b7af5d424260aa86ef92ed7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 May 2022 21:05:15 -0400 Subject: [PATCH 10/31] Add lists with nulls tests --- cpp/tests/groupby/lists_tests.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index 8fd30a75050..b8b06fea636 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -183,5 +183,18 @@ TYPED_TEST(groupby_lists_test, all_null_input) test_sum_agg(keys, values, expected_keys, expected_values); } +TYPED_TEST(groupby_lists_test, lists_with_nulls) +{ + // clang-format off + auto keys = lcw { {{1,1}, {2,2}, {3,3}, {1,1}, {2,2}}, nulls_at({1,2,4})}; + auto values = fwcw { 0, 1, 2, 3, 4 }; + + auto expected_keys = lcw { {{1,1}, {null,null}}, null_at(1)}; + auto expected_values = fwcw { 3, 7 }; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + } // namespace test } // namespace cudf From 762bf692485dea0ecea083054579349efd366672 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 11 May 2022 10:28:42 -0400 Subject: [PATCH 11/31] Fix a lifetime bug for row operators --- cpp/src/groupby/hash/groupby.cu | 56 +++++++++++++-------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 1a43b0067b7..78b1d0dc66c 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -421,38 +421,6 @@ void sparse_to_dense_results(table_view const& keys, } } -/** - * @brief Construct hash map that uses row comparator and row hasher on - * `d_keys` table and stores indices - */ -auto create_hash_map( - std::shared_ptr preprocessed_keys, - cudf::size_type const num_keys, - null_policy const include_null_keys, - rmm::cuda_stream_view stream) -{ - size_type constexpr unused_key{std::numeric_limits::max()}; - size_type constexpr unused_value{std::numeric_limits::max()}; - - using allocator_type = typename map_type::allocator_type; - - auto const null_keys_are_equal = - include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto const has_null = nullate::DYNAMIC{true}; // always has null TODO: why?? - - cudf::experimental::row::equality::self_comparator row_equal(preprocessed_keys); - auto key_equal = row_equal.device_comparator(has_null, null_keys_are_equal); - auto row_hash = cudf::experimental::row::hash::row_hasher(std::move(preprocessed_keys)); - - return map_type::create(compute_hash_table_size(num_keys), - stream, - unused_key, - unused_value, - row_hash.device_hasher(has_null), - key_equal, - allocator_type()); -} - // make table that will hold sparse results auto create_sparse_results_table(table_view const& flattened_values, std::vector aggs, @@ -591,9 +559,29 @@ std::unique_ptr
groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_keys = keys.num_rows(); + auto const num_keys = keys.num_rows(); + auto const null_keys_are_equal = + include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; + auto const has_null = nullate::DYNAMIC{true}; // always has null TODO: why?? + auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); - auto map = create_hash_map(preprocessed_keys, num_keys, include_null_keys, stream); + cudf::experimental::row::equality::self_comparator comparator(preprocessed_keys); + auto row_hash = cudf::experimental::row::hash::row_hasher(std::move(preprocessed_keys)); + auto d_key_equal = comparator.device_comparator(has_null, null_keys_are_equal); + auto d_row_hash = row_hash.device_hasher(has_null); + + size_type constexpr unused_key{std::numeric_limits::max()}; + size_type constexpr unused_value{std::numeric_limits::max()}; + + using allocator_type = typename map_type::allocator_type; + + auto map = map_type::create(compute_hash_table_size(num_keys), + stream, + unused_key, + unused_value, + d_row_hash, + d_key_equal, + allocator_type()); // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map From a000e65a0f14581b361180c78a2ce44b4e978404 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 12 May 2022 14:16:17 -0400 Subject: [PATCH 12/31] Fix a bug: check nested nulls when initing row operators --- cpp/src/groupby/hash/groupby.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 78b1d0dc66c..b28015ac3d1 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -562,7 +562,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const num_keys = keys.num_rows(); auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto const has_null = nullate::DYNAMIC{true}; // always has null TODO: why?? + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); cudf::experimental::row::equality::self_comparator comparator(preprocessed_keys); From 208f224a10ec11f4803f4f2d7bd0d202c6d7c607 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 12 May 2022 17:35:37 -0400 Subject: [PATCH 13/31] Add const + comments --- cpp/src/groupby/hash/groupby.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b28015ac3d1..c520351b1d5 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -65,6 +65,8 @@ namespace detail { namespace hash { namespace { +// TODO: replace it with `cuco::static_map` +// https://github.com/rapidsai/cudf/issues/10401 using map_type = concurrent_unordered_map< cudf::size_type, cudf::size_type, @@ -554,8 +556,8 @@ rmm::device_uvector extract_populated_keys(map_type const& map, std::unique_ptr
groupby(table_view const& keys, host_span requests, cudf::detail::result_cache* cache, - bool keys_have_nulls, - null_policy include_null_keys, + bool const keys_have_nulls, + null_policy const include_null_keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { From b6346e5d630440ea7272448949e7bba650d9cff3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 12 May 2022 17:57:00 -0400 Subject: [PATCH 14/31] Consistently use has_nested_nulls --- cpp/src/groupby/hash/groupby.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c520351b1d5..f4e8eb1d38f 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -564,7 +564,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const num_keys = keys.num_rows(); auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const has_null = nullate::DYNAMIC{keys_have_nulls}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); cudf::experimental::row::equality::self_comparator comparator(preprocessed_keys); @@ -658,7 +658,7 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + groupby(keys, requests, &cache, cudf::has_nested_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } From 2f70d8f52bde35e9bcc02c3badec508e0b31390d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 17 May 2022 11:52:52 -0400 Subject: [PATCH 15/31] Use auto const consistently --- cpp/src/groupby/hash/groupby.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f4e8eb1d38f..efbf95a607d 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -567,10 +567,10 @@ std::unique_ptr
groupby(table_view const& keys, auto const has_null = nullate::DYNAMIC{keys_have_nulls}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); - cudf::experimental::row::equality::self_comparator comparator(preprocessed_keys); - auto row_hash = cudf::experimental::row::hash::row_hasher(std::move(preprocessed_keys)); - auto d_key_equal = comparator.device_comparator(has_null, null_keys_are_equal); - auto d_row_hash = row_hash.device_hasher(has_null); + auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; + auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; + auto const d_key_equal = comparator.device_comparator(has_null, null_keys_are_equal); + auto const d_row_hash = row_hash.device_hasher(has_null); size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; From 4e6de362af01cb6bb00aecfe12238070d6e235a2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 17 May 2022 12:17:08 -0400 Subject: [PATCH 16/31] Remove unused parameter --- cpp/include/cudf/detail/groupby.hpp | 5 ++--- cpp/src/groupby/groupby.cu | 2 +- cpp/src/groupby/hash/groupby.cu | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index 36a76c7b6de..0037a01b496 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,13 +31,12 @@ namespace hash { * @brief Indicates if a set of aggregation requests can be satisfied with a * hash-based groupby implementation. * - * @param keys The table of keys * @param requests The set of columns to aggregate and the aggregations to * perform * @return true A hash-based groupby can be used * @return false A hash-based groupby cannot be used */ -bool can_use_hash_groupby(table_view const& keys, host_span requests); +bool can_use_hash_groupby(host_span requests); // Hash-based groupby std::pair, std::vector> groupby( diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e0503953d10..c12d24886ad 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -74,7 +74,7 @@ std::pair, std::vector> groupby::disp // Only use hash groupby if the keys aren't sorted and all requests can be // satisfied with a hash implementation if (_keys_are_sorted == sorted::NO and not _helper and - detail::hash::can_use_hash_groupby(_keys, requests)) { + detail::hash::can_use_hash_groupby(requests)) { return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr); } else { return sort_aggregate(requests, stream, mr); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index efbf95a607d..0a19c6b2f42 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -623,13 +623,12 @@ std::unique_ptr
groupby(table_view const& keys, * @brief Indicates if a set of aggregation requests can be satisfied with a * hash-based groupby implementation. * - * @param keys The table of keys * @param requests The set of columns to aggregate and the aggregations to * perform * @return true A hash-based groupby should be used * @return false A hash-based groupby should not be used */ -bool can_use_hash_groupby(table_view const& keys, host_span requests) +bool can_use_hash_groupby(host_span requests) { return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { // Currently, structs are not supported in any of hash-based aggregations. From 935ccf6244143db195ad194e2687fbf65d5a7b74 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 17 May 2022 13:32:04 -0400 Subject: [PATCH 17/31] Move test to proper file --- cpp/tests/groupby/argmax_tests.cpp | 57 +----------------------------- cpp/tests/groupby/keys_tests.cpp | 57 +++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index ea6a0414d4c..0b06c184b75 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,61 +118,6 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } -template -using FWCW = cudf::test::fixed_width_column_wrapper; - -TYPED_TEST(groupby_argmax_test, structs) -{ - using V = TypeParam; - - using R = cudf::detail::target_type_t; - using STRINGS = cudf::test::strings_column_wrapper; - using STRUCTS = cudf::test::structs_column_wrapper; - - if (std::is_same_v) return; - - /* - `@` indicates null - keys: values: - /+---------------+ - |s1{s2{a,b}, c}| - +----------------+ - 0 | { {1, 1}, "a"}| 1 - 1 | { {1, 2}, "b"}| 2 - 2 | {@{2, 1}, "c"}| 3 - 3 | {@{2, 1}, "c"}| 4 - 4 | @{ {2, 2}, "d"}| 5 - 5 | @{ {2, 2}, "d"}| 6 - 6 | { {1, 1}, "a"}| 7 - 7 | {@{2, 1}, "c"}| 8 - 8 | { {1, 1}, "a"}| 9 - +----------------+ - */ - - // clang-format off - auto col_a = FWCW{ 1, 1, 2, 2, 2, 2, 1, 2, 1}; - auto col_b = FWCW{ 1, 2, 1, 1, 2, 2, 1, 1, 1}; - auto col_c = STRINGS{"a", "b", "c", "c", "d", "d", "a", "c", "a"}; - // clang-format on - auto s2 = STRUCTS{{col_a, col_b}, nulls_at({2, 3, 7})}; - - auto keys = STRUCTS{{s2, col_c}, nulls_at({4, 5})}; - auto vals = FWCW{1, 2, 3, 4, 5, 6, 7, 8, 9}; - - // clang-format off - auto expected_col_a = FWCW{ 1, 1, 2, 2, 2}; - auto expected_col_b = FWCW{ 1, 2, 1, 1, 1}; - auto expected_col_c = STRINGS{"a", "b", "c", "c", "c"}; - // clang-format on - auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, nulls_at({2, 3, 4})}; - - auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()}; - auto expect_vals = FWCW{8, 1, 2, 3, 7}; - - auto agg = cudf::make_argmax_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); -} - struct groupby_argmax_string_test : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index 94c26f3fe8f..81b065a70cb 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -242,6 +242,61 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows) "Size mismatch between request values and groupby keys."); } +template +using FWCW = cudf::test::fixed_width_column_wrapper; + +TYPED_TEST(groupby_keys_test, structs) +{ + using V = TypeParam; + + using R = cudf::detail::target_type_t; + using STRINGS = cudf::test::strings_column_wrapper; + using STRUCTS = cudf::test::structs_column_wrapper; + + if (std::is_same_v) return; + + /* + `@` indicates null + keys: values: + /+---------------+ + |s1{s2{a,b}, c}| + +----------------+ + 0 | { {1, 1}, "a"}| 1 + 1 | { {1, 2}, "b"}| 2 + 2 | {@{2, 1}, "c"}| 3 + 3 | {@{2, 1}, "c"}| 4 + 4 | @{ {2, 2}, "d"}| 5 + 5 | @{ {2, 2}, "d"}| 6 + 6 | { {1, 1}, "a"}| 7 + 7 | {@{2, 1}, "c"}| 8 + 8 | { {1, 1}, "a"}| 9 + +----------------+ + */ + + // clang-format off + auto col_a = FWCW{ 1, 1, 2, 2, 2, 2, 1, 2, 1 }; + auto col_b = FWCW{ 1, 2, 1, 1, 2, 2, 1, 1, 1 }; + auto col_c = STRINGS{"a", "b", "c", "c", "d", "d", "a", "c", "a"}; + // clang-format on + auto s2 = STRUCTS{{col_a, col_b}, nulls_at({2, 3, 7})}; + + auto keys = STRUCTS{{s2, col_c}, nulls_at({4, 5})}; + auto vals = FWCW{1, 2, 3, 4, 5, 6, 7, 8, 9}; + + // clang-format off + auto expected_col_a = FWCW{ 1, 1, 2, 2, 2 }; + auto expected_col_b = FWCW{ 1, 2, 1, 1, 1 }; + auto expected_col_c = STRINGS{"a", "b", "c", "c", "c"}; + // clang-format on + auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, nulls_at({2, 3, 4})}; + + auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()}; + auto expect_vals = FWCW{8, 1, 2, 3, 7}; + + auto agg = cudf::make_argmax_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + struct groupby_string_keys_test : public cudf::test::BaseFixture { }; From bd277238096a81cc5a1c4a4836c55f22fdcde22c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 May 2022 11:31:27 -0400 Subject: [PATCH 18/31] Minor cleanups --- cpp/tests/groupby/lists_tests.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index b8b06fea636..3a30054eb16 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -89,15 +89,13 @@ inline void test_hash_based_sum_agg(column_view const& keys, column_view const& expect_keys, column_view const& expect_vals) { - auto agg = cudf::make_sum_aggregation(); auto const include_null_keys = null_policy::INCLUDE; auto const keys_are_sorted = sorted::NO; std::vector requests; - requests.emplace_back(groupby::aggregation_request()); - requests[0].values = values; - - requests[0].aggregations.push_back(std::move(agg)); + auto& request = requests.emplace_back(groupby::aggregation_request()); + request.values = values; + request.aggregations.push_back(std::move(cudf::make_sum_aggregation())); groupby::groupby gb_obj(cudf::table_view({keys}), include_null_keys, keys_are_sorted); From d4724be63da63a80c061881933a532f747807410 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 May 2022 20:32:36 -0400 Subject: [PATCH 19/31] Add group struct keys benchmark --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/groupby/group_struct_keys.cpp | 103 ++++++++++++++++++ ...roup_struct.cu => group_struct_values.cpp} | 0 3 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_struct_keys.cpp rename cpp/benchmarks/groupby/{group_struct.cu => group_struct_values.cpp} (100%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 04dcf51dd40..cb4ead20d00 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -198,13 +198,13 @@ ConfigureBench( groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu - groupby/group_struct.cu + groupby/group_struct_values.cpp groupby/group_no_requests.cu groupby/group_scan.cu groupby/group_rank_benchmark.cu ) -ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) +ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu groupby/group_struct_keys.cpp) # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp new file mode 100644 index 00000000000..cc770d09be3 --- /dev/null +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include + +#include + +#include + +void bench_groupby_struct_keys(nvbench::state& state) +{ + cudf::rmm_pool_raii pool_raii; + + using Type = int; + using column_wrapper = cudf::test::fixed_width_column_wrapper; + std::default_random_engine generator; + std::uniform_int_distribution distribution(0, 100); + + const cudf::size_type n_rows{static_cast(state.get_int64("NumRows"))}; + const cudf::size_type n_cols{1}; + const cudf::size_type depth{static_cast(state.get_int64("Depth"))}; + const bool nulls{static_cast(state.get_int64("Nulls"))}; + + // Create columns with values in the range [0,100) + std::vector columns; + columns.reserve(n_cols); + std::generate_n(std::back_inserter(columns), n_cols, [&]() { + auto const elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto row) { return distribution(generator); }); + if (!nulls) return column_wrapper(elements, elements + n_rows); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 10 != 0; }); + return column_wrapper(elements, elements + n_rows, valids); + }); + + std::vector> cols; + std::transform(columns.begin(), columns.end(), std::back_inserter(cols), [](column_wrapper& col) { + return col.release(); + }); + + std::vector> child_cols = std::move(cols); + // Add some layers + for (int i = 0; i < depth; i++) { + std::vector struct_validity; + std::uniform_int_distribution bool_distribution(0, 100 * (i + 1)); + std::generate_n( + std::back_inserter(struct_validity), n_rows, [&]() { return bool_distribution(generator); }); + cudf::test::structs_column_wrapper struct_col(std::move(child_cols), struct_validity); + child_cols = std::vector>{}; + child_cols.push_back(struct_col.release()); + } + + data_profile profile; + profile.set_null_frequency(std::nullopt); + profile.set_cardinality(0); + profile.set_distribution_params( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + + auto const keys_table = cudf::table(std::move(child_cols)); + auto const vals_table = + create_random_table({cudf::type_to_id()}, row_count{n_rows}, profile); + + cudf::groupby::groupby gb_obj(keys_table.view()); + + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = vals_table->get_column(0).view(); + requests[0].aggregations.push_back(cudf::make_min_aggregation()); + + // Set up nvbench default stream + auto stream = rmm::cuda_stream_default; + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = gb_obj.aggregate(requests); + cudaStreamSynchronize(stream.value()); + }); +} + +NVBENCH_BENCH(bench_groupby_struct_keys) + .set_name("groupby_struct_keys") + .add_int64_power_of_two_axis("NumRows", {10, 16, 20}) + .add_int64_axis("Depth", {0, 1, 8}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct_values.cpp similarity index 100% rename from cpp/benchmarks/groupby/group_struct.cu rename to cpp/benchmarks/groupby/group_struct_values.cpp From 9aa2f8ddcc5b64132ab96af3f3d37c776f9f1509 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 May 2022 20:58:50 -0400 Subject: [PATCH 20/31] Remove unnecessary sync --- cpp/benchmarks/groupby/group_struct_keys.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index cc770d09be3..8398125db21 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -90,10 +90,8 @@ void bench_groupby_struct_keys(nvbench::state& state) auto stream = rmm::cuda_stream_default; state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = gb_obj.aggregate(requests); - cudaStreamSynchronize(stream.value()); - }); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } NVBENCH_BENCH(bench_groupby_struct_keys) From 055c31a227ba591b6e32749792e43bfa795f7adb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 May 2022 08:42:44 -0400 Subject: [PATCH 21/31] Remove unused parameter --- cpp/src/groupby/hash/groupby.cu | 1 - cpp/src/groupby/hash/groupby_kernels.cuh | 4 ---- 2 files changed, 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0a19c6b2f42..169b59defed 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -487,7 +487,6 @@ void compute_single_pass_aggs(table_view const& keys, thrust::make_counting_iterator(0), keys.num_rows(), hash::compute_single_pass_aggs_fn{map, - keys.num_rows(), *d_values, *d_sparse_table, d_aggs.data(), diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 79286fb3839..eedb07200a5 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -65,7 +65,6 @@ namespace hash { template struct compute_single_pass_aggs_fn { Map map; - size_type num_keys; table_device_view input_values; mutable_table_device_view output_values; aggregation::Kind const* __restrict__ aggs; @@ -76,7 +75,6 @@ struct compute_single_pass_aggs_fn { * @brief Construct a new compute_single_pass_aggs_fn functor object * * @param map Hash map object to insert key,value pairs into. - * @param num_keys The number of rows in input keys table * @param input_values The table whose rows will be aggregated in the values * of the hash map * @param output_values Table that stores the results of aggregating rows of @@ -90,14 +88,12 @@ struct compute_single_pass_aggs_fn { * bitmask where bit `i` indicates the presence of a null value in row `i`. */ compute_single_pass_aggs_fn(Map map, - size_type num_keys, table_device_view input_values, mutable_table_device_view output_values, aggregation::Kind const* aggs, bitmask_type const* row_bitmask, bool skip_rows_with_nulls) : map(map), - num_keys(num_keys), input_values(input_values), output_values(output_values), aggs(aggs), From abdb431d70fcbab9e7531e9652d31bffd9b07e91 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 May 2022 10:36:02 -0400 Subject: [PATCH 22/31] Update unit test --- cpp/tests/groupby/keys_tests.cpp | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index 81b065a70cb..d5a9301e0d9 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -257,26 +257,26 @@ TYPED_TEST(groupby_keys_test, structs) /* `@` indicates null - keys: values: - /+---------------+ + keys: values: + /+----------------+ |s1{s2{a,b}, c}| - +----------------+ - 0 | { {1, 1}, "a"}| 1 - 1 | { {1, 2}, "b"}| 2 - 2 | {@{2, 1}, "c"}| 3 - 3 | {@{2, 1}, "c"}| 4 - 4 | @{ {2, 2}, "d"}| 5 - 5 | @{ {2, 2}, "d"}| 6 - 6 | { {1, 1}, "a"}| 7 - 7 | {@{2, 1}, "c"}| 8 - 8 | { {1, 1}, "a"}| 9 - +----------------+ + +-----------------+ + 0 | { { 1, 1}, "a"}| 1 + 1 | { { 1, 2}, "b"}| 2 + 2 | {@{ 2, 1}, "c"}| 3 + 3 | {@{ 2, 1}, "c"}| 4 + 4 | @{ { 2, 2}, "d"}| 5 + 5 | @{ { 2, 2}, "d"}| 6 + 6 | { { 1, 1}, "a"}| 7 + 7 | {@{ 2, 1}, "c"}| 8 + 8 | { {@1, 1}, "a"}| 9 + +-----------------+ */ // clang-format off - auto col_a = FWCW{ 1, 1, 2, 2, 2, 2, 1, 2, 1 }; - auto col_b = FWCW{ 1, 2, 1, 1, 2, 2, 1, 1, 1 }; - auto col_c = STRINGS{"a", "b", "c", "c", "d", "d", "a", "c", "a"}; + auto col_a = FWCW{{ 1, 1, 2, 2, 2, 2, 1, 2, 1 }, null_at(8)}; + auto col_b = FWCW { 1, 2, 1, 1, 2, 2, 1, 1, 1 }; + auto col_c = STRINGS {"a", "b", "c", "c", "d", "d", "a", "c", "a"}; // clang-format on auto s2 = STRUCTS{{col_a, col_b}, nulls_at({2, 3, 7})}; @@ -284,14 +284,14 @@ TYPED_TEST(groupby_keys_test, structs) auto vals = FWCW{1, 2, 3, 4, 5, 6, 7, 8, 9}; // clang-format off - auto expected_col_a = FWCW{ 1, 1, 2, 2, 2 }; - auto expected_col_b = FWCW{ 1, 2, 1, 1, 1 }; - auto expected_col_c = STRINGS{"a", "b", "c", "c", "c"}; + auto expected_col_a = FWCW{ 1, 1 }; + auto expected_col_b = FWCW{ 1, 2 }; + auto expected_col_c = STRINGS{"a", "b"}; // clang-format on - auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, nulls_at({2, 3, 4})}; + auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}}; auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()}; - auto expect_vals = FWCW{8, 1, 2, 3, 7}; + auto expect_vals = FWCW{6, 1}; auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); From 70ca9a0f721967e8ca78c50eb50b65c02aa168d4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 May 2022 10:58:12 -0400 Subject: [PATCH 23/31] Improvement: use flattened keys to compute row bitmask --- cpp/src/groupby/hash/groupby.cu | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 169b59defed..44c2fe29f46 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -480,8 +481,15 @@ void compute_single_pass_aggs(table_view const& keys, auto const skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = - skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; + // TODO: This is a temporary workaround. Get rid of "flattened" logic + // once `bitmask_and` supports nested nulls handling. + auto const flattened = cudf::structs::detail::flatten_nested_columns( + keys, {}, {}, cudf::structs::detail::column_nullability::FORCE); + auto const flattened_keys = flattened.flattened_columns(); + auto row_bitmask = skip_key_rows_with_nulls + ? cudf::detail::bitmask_and(flattened_keys, stream).first + : rmm::device_buffer{}; + thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -563,7 +571,11 @@ std::unique_ptr
groupby(table_view const& keys, auto const num_keys = keys.num_rows(); auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - auto const has_null = nullate::DYNAMIC{keys_have_nulls}; + // Minor optimization: safely bypass null handling in row operators + // which are never called on rows containing nulls + auto const has_null = include_null_keys == null_policy::EXCLUDE + ? nullate::DYNAMIC{false} + : nullate::DYNAMIC{keys_have_nulls}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; From ece43210d7d826ac8f64530d91b2b7417e3cebca Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 May 2022 22:12:48 -0400 Subject: [PATCH 24/31] Add tests for lists with null elements --- cpp/tests/groupby/lists_tests.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index 3a30054eb16..fa3fcd98328 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -194,5 +194,19 @@ TYPED_TEST(groupby_lists_test, lists_with_nulls) test_sum_agg(keys, values, expected_keys, expected_values); } +TYPED_TEST(groupby_lists_test, lists_with_null_elements) +{ + auto keys = lcw{{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, + {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, + {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, + {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}}, + nulls_at({2, 3})}; + auto values = fwcw{1, 2, 4, 5}; + + auto expected_keys = lcw{{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, {}}, null_at(1)}; + auto expected_values = fwcw{3, 9}; + + test_sum_agg(keys, values, expected_keys, expected_values); +} } // namespace test } // namespace cudf From 55902dd0e700b642d11175203381863cd24f5e0d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 07:00:28 -0400 Subject: [PATCH 25/31] Minor cleanups --- cpp/src/groupby/hash/groupby.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 44c2fe29f46..831ddaa6c7c 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -481,14 +481,17 @@ void compute_single_pass_aggs(table_view const& keys, auto const skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - // TODO: This is a temporary workaround. Get rid of "flattened" logic - // once `bitmask_and` supports nested nulls handling. - auto const flattened = cudf::structs::detail::flatten_nested_columns( - keys, {}, {}, cudf::structs::detail::column_nullability::FORCE); - auto const flattened_keys = flattened.flattened_columns(); - auto row_bitmask = skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(flattened_keys, stream).first - : rmm::device_buffer{}; + auto row_bitmask = [&] { + if (skip_key_rows_with_nulls) { + // TODO: This is a temporary workaround. Get rid of "flattened" logic + // once `bitmask_and` supports nested nulls handling. + auto const flattened = cudf::structs::detail::flatten_nested_columns( + keys, {}, {}, cudf::structs::detail::column_nullability::FORCE); + auto const flattened_keys = flattened.flattened_columns(); + return cudf::detail::bitmask_and(flattened_keys, stream).first; + } + return rmm::device_buffer{}; + }(); thrust::for_each_n( rmm::exec_policy(stream), From 9086f338b4bf9a8c828d6b28587a0ceb6eaac2b9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 12:51:16 -0400 Subject: [PATCH 26/31] Add exception to null exclude case --- cpp/src/groupby/hash/groupby.cu | 8 ++++++++ cpp/tests/groupby/keys_tests.cpp | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 831ddaa6c7c..915c129a0f0 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -668,6 +668,14 @@ std::pair, std::vector> groupby( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto const has_list_column = + std::any_of(keys.begin(), keys.end(), [](cudf::column_view const& col) { + return col.type().id() == type_id::LIST; + }); + if (has_list_column and include_null_keys == cudf::null_policy::EXCLUDE) { + CUDF_FAIL("Null LIST keys cannot be excluded."); + } + cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index d5a9301e0d9..4a92e902492 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -297,6 +297,26 @@ TYPED_TEST(groupby_keys_test, structs) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } +template +using LCW = cudf::test::lists_column_wrapper; + +TYPED_TEST(groupby_keys_test, lists) +{ + using R = cudf::detail::target_type_t; + + // clang-format off + auto keys = LCW { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto values = FWCW { 0, 1, 2, 3, 4 }; + + auto expected_keys = LCW { {1,1}, {2,2}, {3,3} }; + auto expected_values = FWCW { 3, 5, 2 }; + // clang-format on + + auto agg = cudf::make_sum_aggregation(); + EXPECT_THROW(test_single_agg(keys, values, expected_keys, expected_values, std::move(agg)), + cudf::logic_error); +} + struct groupby_string_keys_test : public cudf::test::BaseFixture { }; From 6f170d87c39def9fb5b37863c1cd83b6a608ed4c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 15:07:16 -0400 Subject: [PATCH 27/31] Revert changes to match pandas dropna behavior --- cpp/src/groupby/hash/groupby.cu | 26 ++++++-------------------- cpp/tests/groupby/keys_tests.cpp | 10 +++++----- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 915c129a0f0..91f8235d070 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -481,17 +481,8 @@ void compute_single_pass_aggs(table_view const& keys, auto const skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = [&] { - if (skip_key_rows_with_nulls) { - // TODO: This is a temporary workaround. Get rid of "flattened" logic - // once `bitmask_and` supports nested nulls handling. - auto const flattened = cudf::structs::detail::flatten_nested_columns( - keys, {}, {}, cudf::structs::detail::column_nullability::FORCE); - auto const flattened_keys = flattened.flattened_columns(); - return cudf::detail::bitmask_and(flattened_keys, stream).first; - } - return rmm::device_buffer{}; - }(); + auto row_bitmask = + skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; thrust::for_each_n( rmm::exec_policy(stream), @@ -571,14 +562,9 @@ std::unique_ptr
groupby(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_keys = keys.num_rows(); - auto const null_keys_are_equal = - include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL; - // Minor optimization: safely bypass null handling in row operators - // which are never called on rows containing nulls - auto const has_null = include_null_keys == null_policy::EXCLUDE - ? nullate::DYNAMIC{false} - : nullate::DYNAMIC{keys_have_nulls}; + auto const num_keys = keys.num_rows(); + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; @@ -679,7 +665,7 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nested_nulls(keys), include_null_keys, stream, mr); + groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index 4a92e902492..fe1040acd63 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -284,14 +284,14 @@ TYPED_TEST(groupby_keys_test, structs) auto vals = FWCW{1, 2, 3, 4, 5, 6, 7, 8, 9}; // clang-format off - auto expected_col_a = FWCW{ 1, 1 }; - auto expected_col_b = FWCW{ 1, 2 }; - auto expected_col_c = STRINGS{"a", "b"}; + auto expected_col_a = FWCW{{1, 1, 1, 2 }, null_at(2)}; + auto expected_col_b = FWCW{ 1, 2, 1, 1 }; + auto expected_col_c = STRINGS{"a", "b", "a", "c"}; // clang-format on - auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}}; + auto expected_s2 = STRUCTS{{expected_col_a, expected_col_b}, null_at(3)}; auto expect_keys = STRUCTS{{expected_s2, expected_col_c}, no_nulls()}; - auto expect_vals = FWCW{6, 1}; + auto expect_vals = FWCW{6, 1, 8, 7}; auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); From c8b1aabe06cd9c179f50abe9d3d95857aa4dedac Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 15:20:31 -0400 Subject: [PATCH 28/31] Update unit tests to exercise null elements in list keys --- cpp/tests/groupby/lists_tests.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index fa3fcd98328..7c145271662 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -196,14 +196,16 @@ TYPED_TEST(groupby_lists_test, lists_with_nulls) TYPED_TEST(groupby_lists_test, lists_with_null_elements) { - auto keys = lcw{{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, - {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, - {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, - {{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}}, - nulls_at({2, 3})}; + auto keys = + lcw{{lcw{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}, + lcw{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}, + lcw{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}, + lcw{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}}, + nulls_at({2, 3})}; auto values = fwcw{1, 2, 4, 5}; - auto expected_keys = lcw{{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, {}}, null_at(1)}; + auto expected_keys = lcw{ + {lcw{{{1, 2, 3}, {}, {4, 5}, {}, {6, 0}}, nulls_at({1, 3})}, {}}, null_at(1)}; auto expected_values = fwcw{3, 9}; test_sum_agg(keys, values, expected_keys, expected_values); From 002ad40078b16246bd92b3787587a273c3f6f8ea Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 16:43:07 -0400 Subject: [PATCH 29/31] Minor cleanup --- cpp/src/groupby/groupby.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index c12d24886ad..e25512f80c5 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -65,8 +65,6 @@ std::pair, std::vector> groupby::disp rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using namespace cudf::structs::detail; - // If sort groupby has been called once on this groupby object, then // always use sort groupby from now on. Because once keys are sorted, // all the aggs that can be done by hash groupby are efficiently done by From b76677cadaea0f5f8ba6a91b252ee891d7ada280 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 16:49:48 -0400 Subject: [PATCH 30/31] Remove unused header --- cpp/src/groupby/hash/groupby.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 91f8235d070..9fc3d641e9e 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include From efd497e04dce165d15a2822dd8917c46af5f189b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 May 2022 17:02:13 -0400 Subject: [PATCH 31/31] Throw when null structs are excluded --- cpp/src/groupby/hash/groupby.cu | 8 ++++---- cpp/tests/groupby/keys_tests.cpp | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 9fc3d641e9e..ab8d0089347 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -653,12 +653,12 @@ std::pair, std::vector> groupby( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const has_list_column = + auto const has_nested_column = std::any_of(keys.begin(), keys.end(), [](cudf::column_view const& col) { - return col.type().id() == type_id::LIST; + return cudf::is_nested(col.type()); }); - if (has_list_column and include_null_keys == cudf::null_policy::EXCLUDE) { - CUDF_FAIL("Null LIST keys cannot be excluded."); + if (has_nested_column and include_null_keys == cudf::null_policy::EXCLUDE) { + CUDF_FAIL("Null keys of nested type cannot be excluded."); } cudf::detail::result_cache cache(requests.size()); diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index fe1040acd63..19e82c4ffd1 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -294,7 +294,8 @@ TYPED_TEST(groupby_keys_test, structs) auto expect_vals = FWCW{6, 1, 8, 7}; auto agg = cudf::make_argmax_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + EXPECT_THROW(test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)), + cudf::logic_error); } template