From 11b81262634a12e520a653e0bb861da698d353da Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Jan 2024 14:59:59 -0800 Subject: [PATCH 01/16] Rewrite hash groupby with hash set --- cpp/src/groupby/hash/groupby.cu | 101 ++++++++++---------- cpp/src/groupby/hash/groupby_kernels.cuh | 18 ++-- cpp/src/groupby/hash/multi_pass_kernels.cuh | 13 ++- 3 files changed, 66 insertions(+), 66 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 32693487c32..59c416e2892 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,9 +45,13 @@ #include #include #include -#include +#include #include +#include + + +#include #include #include @@ -67,15 +71,24 @@ namespace detail { namespace hash { namespace { -// TODO: replace it with `cuco::static_map` -// https://github.com/rapidsai/cudf/issues/10401 +int constexpr cg_size = 1; ///< Number of threads used to handle each input key +int constexpr window_size = 1; ///< Number of slots checked per thread +cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot + +using probing_scheme_type = cuco::experimental::linear_probing>; +using allocator_type = rmm::mr::stream_allocator_adaptor>; + template -using map_type = concurrent_unordered_map< - cudf::size_type, +using set_type = cuco::experimental::static_set< cudf::size_type, - cudf::experimental::row::hash::device_row_hasher, - ComparatorType>; + cuco::experimental::extent, + cuda::thread_scope_device, + ComparatorType, + probing_scheme_type, + allocator_type, + cuco::experimental::storage>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -191,14 +204,14 @@ class 
groupby_simple_aggregations_collector final } }; -template +template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; data_type result_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - map_type const& map; + SetType set; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -210,7 +223,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -218,7 +231,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), - map(map), + set(set), row_bitmask(row_bitmask), stream(stream), mr(mr) @@ -341,8 +354,8 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor>{ - map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + ::cudf::detail::var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); } @@ -399,13 +412,13 @@ flatten_single_pass_aggs(host_span requests) * * @see groupby_null_templated() */ -template +template void sparse_to_dense_results(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - map_type const& map, + SetType set, bool keys_have_nulls, null_policy 
include_null_keys, rmm::cuda_stream_view stream, @@ -424,7 +437,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -468,11 +481,11 @@ auto create_sparse_results_table(table_view const& flattened_values, * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` */ -template +template void compute_single_pass_aggs(table_view const& keys, host_span requests, cudf::detail::result_cache* sparse_results, - map_type& map, + SetType set, bool keys_have_nulls, null_policy include_null_keys, rmm::cuda_stream_view stream) @@ -498,8 +511,8 @@ void compute_single_pass_aggs(table_view const& keys, thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), keys.num_rows(), - hash::compute_single_pass_aggs_fn>{ - map, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), @@ -518,23 +531,15 @@ void compute_single_pass_aggs(table_view const& keys, * @brief Computes and returns a device vector containing all populated keys in * `map`. 
*/ -template -rmm::device_uvector extract_populated_keys(map_type const& map, +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, size_type num_keys, rmm::cuda_stream_view stream) { rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - auto const get_key = cuda::proclaim_return_type::key_type>( - [] __device__(auto const& element) { return element.first; }); // first = key - auto const key_used = [unused = map.get_unused_key()] __device__(auto key) { - return key != unused; - }; - auto const key_itr = thrust::make_transform_iterator(map.data(), get_key); - auto const end_it = cudf::detail::copy_if_safe( - key_itr, key_itr + map.capacity(), populated_keys.begin(), key_used, stream); - - populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream); + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); return populated_keys; } @@ -581,30 +586,26 @@ std::unique_ptr groupby(table_view const& keys, auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - size_type constexpr unused_key{std::numeric_limits::max()}; - size_type constexpr unused_value{std::numeric_limits::max()}; - // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash map cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - using allocator_type = typename map_type::allocator_type; - - auto const map = map_type::create(compute_hash_table_size(num_keys), - stream, - unused_key, - unused_value, - d_row_hash, - d_key_equal, - allocator_type()); - // Compute all single pass aggs first + auto const set = set_type{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + 
allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first compute_single_pass_aggs( - keys, requests, &sparse_results, *map, keys_have_nulls, include_null_keys, stream); + keys, requests, &sparse_results, set.ref(cuco::experimental::insert_and_find), keys_have_nulls, include_null_keys, stream); // Extract the populated indices from the hash map and create a gather map. // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(*map, keys.num_rows(), stream); + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -612,7 +613,7 @@ std::unique_ptr
groupby(table_view const& keys, &sparse_results, cache, gather_map, - *map, + set.ref(cuco::experimental::find), keys_have_nulls, include_null_keys, stream, diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index eedb07200a5..1ffe870eeb4 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,11 +60,11 @@ namespace hash { * rows. In this way, after all rows are aggregated, `output_values` will likely * be "sparse", meaning that not all rows contain the result of an aggregation. * - * @tparam Map The type of the hash map + * @tparam SetType The type of the hash set device ref */ -template +template struct compute_single_pass_aggs_fn { - Map map; + SetType set; table_device_view input_values; mutable_table_device_view output_values; aggregation::Kind const* __restrict__ aggs; @@ -74,7 +74,7 @@ struct compute_single_pass_aggs_fn { /** * @brief Construct a new compute_single_pass_aggs_fn functor object * - * @param map Hash map object to insert key,value pairs into. + * @param set_ref Hash map object to insert key,value pairs into. * @param input_values The table whose rows will be aggregated in the values * of the hash map * @param output_values Table that stores the results of aggregating rows of @@ -87,13 +87,13 @@ struct compute_single_pass_aggs_fn { * null values should be skipped. It `true`, it is assumed `row_bitmask` is a * bitmask where bit `i` indicates the presence of a null value in row `i`. 
*/ - compute_single_pass_aggs_fn(Map map, + compute_single_pass_aggs_fn(SetType set, table_device_view input_values, mutable_table_device_view output_values, aggregation::Kind const* aggs, bitmask_type const* row_bitmask, bool skip_rows_with_nulls) - : map(map), + : set(set), input_values(input_values), output_values(output_values), aggs(aggs), @@ -105,10 +105,10 @@ struct compute_single_pass_aggs_fn { __device__ void operator()(size_type i) { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { - auto result = map.insert(thrust::make_pair(i, i)); + auto const result = set.insert_and_find(i); cudf::detail::aggregate_row( - output_values, result.first->second, input_values, i, aggs); + output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 4bc73631732..51432ed5de8 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,23 +31,23 @@ namespace cudf { namespace detail { -template +template struct var_hash_functor { - Map const map; + SetType set; bitmask_type const* __restrict__ row_bitmask; mutable_column_device_view target; column_device_view source; column_device_view sum; column_device_view count; size_type ddof; - var_hash_functor(Map const map, + var_hash_functor(SetType set, bitmask_type const* row_bitmask, mutable_column_device_view target, column_device_view source, column_device_view sum, column_device_view count, size_type ddof) - : map(map), + : set(set), row_bitmask(row_bitmask), target(target), source(source), @@ -96,8 +96,7 @@ struct var_hash_functor { __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto result = map.find(source_index); - auto target_index = result->second; + auto const target_index = *set.find(source_index); auto col = source; auto source_type = source.type(); From 166ed494d85ced274f1e813038d2d34dfcab7a70 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Jan 2024 15:06:22 -0800 Subject: [PATCH 02/16] Formatting --- cpp/src/groupby/hash/groupby.cu | 79 +++++++++++---------- cpp/src/groupby/hash/groupby_kernels.cuh | 3 +- cpp/src/groupby/hash/multi_pass_kernels.cuh | 2 +- 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 59c416e2892..6e83a081130 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,10 @@ #include #include #include -#include #include #include - #include #include @@ -71,24 +70,24 @@ namespace detail { namespace hash { namespace { -int constexpr cg_size = 1; ///< Number of threads used to handle each input key -int constexpr window_size = 1; ///< Number of slots checked per thread -cudf::size_type constexpr key_sentinel = -1; 
///< Sentinel value indicating an empty slot +int constexpr cg_size = 1; ///< Number of threads used to handle each input key +int constexpr window_size = 1; ///< Number of slots checked per thread +cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot -using probing_scheme_type = cuco::experimental::linear_probing>; +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cudf::experimental::row::hash::device_row_hasher>; using allocator_type = rmm::mr::stream_allocator_adaptor>; template -using set_type = cuco::experimental::static_set< - cudf::size_type, - cuco::experimental::extent, - cuda::thread_scope_device, - ComparatorType, - probing_scheme_type, - allocator_type, - cuco::experimental::storage>; +using set_type = cuco::experimental::static_set, + cuda::thread_scope_device, + ComparatorType, + probing_scheme_type, + allocator_type, + cuco::experimental::storage>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -508,16 +507,16 @@ void compute_single_pass_aggs(table_view const& keys, ? cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first : rmm::device_buffer{}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{ - set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + keys.num_rows(), + hash::compute_single_pass_aggs_fn{set, + *d_values, + *d_sparse_table, + d_aggs.data(), + static_cast(row_bitmask.data()), + skip_key_rows_with_nulls}); // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { @@ -591,17 +590,23 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = set_type{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs( - keys, requests, &sparse_results, set.ref(cuco::experimental::insert_and_find), keys_have_nulls, include_null_keys, stream); + auto const set = + set_type{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Compute all single pass aggs first + compute_single_pass_aggs(keys, + requests, + &sparse_results, + set.ref(cuco::experimental::insert_and_find), + keys_have_nulls, + include_null_keys, + stream); // Extract the populated indices from the hash map and create a gather map. // Gathering using this map from sparse results will give dense results. 
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 1ffe870eeb4..a6a09435589 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -107,8 +107,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row( - output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 51432ed5de8..7043eafdc10 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -96,7 +96,7 @@ struct var_hash_functor { __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto const target_index = *set.find(source_index); + auto const target_index = *set.find(source_index); auto col = source; auto source_type = source.type(); From b1db24397ffbc9533464fcdf1592e945f936773a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 22 Jan 2024 16:11:29 -0800 Subject: [PATCH 03/16] Minor cleanups --- cpp/src/groupby/hash/groupby.cu | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 6e83a081130..c5a0f30bf77 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -71,7 +71,6 @@ namespace hash { namespace { int constexpr cg_size = 1; ///< Number of threads used to handle each input key -int constexpr window_size = 1; ///< Number of slots checked per thread cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot using probing_scheme_type = cuco::experimental::linear_probing< @@ -80,15 +79,6 
@@ using probing_scheme_type = cuco::experimental::linear_probing< cudf::nullate::DYNAMIC>>; using allocator_type = rmm::mr::stream_allocator_adaptor>; -template -using set_type = cuco::experimental::static_set, - cuda::thread_scope_device, - ComparatorType, - probing_scheme_type, - allocator_type, - cuco::experimental::storage>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -591,13 +581,13 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = - set_type{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; + cuco::experimental::static_set{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + allocator_type{default_allocator{}, stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, From ca6829d62d926565e2fef8a01ae7eaa8b3855bb3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 12:44:56 -0800 Subject: [PATCH 04/16] Update cuco code --- cpp/src/groupby/hash/groupby.cu | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c5a0f30bf77..1718848ae7e 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,7 +48,6 @@ #include #include -#include #include @@ -73,11 +72,10 @@ namespace { int constexpr cg_size = 1; ///< Number of threads used to handle each input key cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot -using probing_scheme_type = cuco::experimental::linear_probing< +using probing_scheme_type = cuco::linear_probing< cg_size, cudf::experimental::row::hash::device_row_hasher>; -using allocator_type = rmm::mr::stream_allocator_adaptor>; /** * @brief List of aggregation operations that can be computed with a hash-based @@ -580,20 +578,21 @@ std::unique_ptr
groupby(table_view const& keys, cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = - cuco::experimental::static_set{num_keys, - 0.5, // desired load factor - cuco::empty_key{key_sentinel}, - d_key_equal, - probing_scheme_type{d_row_hash}, - allocator_type{default_allocator{}, stream}, - stream.value()}; + auto const set = cuco::static_set{num_keys, + 0.5, // desired load factor + cuco::empty_key{key_sentinel}, + d_key_equal, + probing_scheme_type{d_row_hash}, + cuco::thread_scope_device, + cuco::storage<1>{}, + cudf::detail::cuco_allocator{stream}, + stream.value()}; // Compute all single pass aggs first compute_single_pass_aggs(keys, requests, &sparse_results, - set.ref(cuco::experimental::insert_and_find), + set.ref(cuco::insert_and_find), keys_have_nulls, include_null_keys, stream); @@ -608,7 +607,7 @@ std::unique_ptr
groupby(table_view const& keys, &sparse_results, cache, gather_map, - set.ref(cuco::experimental::find), + set.ref(cuco::find), keys_have_nulls, include_null_keys, stream, From 0c10a0b875dc28fa5e2a9bef49ef7a27ef8c1b2a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:09:12 -0800 Subject: [PATCH 05/16] Add CUCO_CUDF_SIZE_TYPE_SENTINEL --- cpp/include/cudf/detail/cuco_helpers.hpp | 6 ++++++ cpp/src/groupby/hash/groupby.cu | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index 5f3c31479de..a030354fb3c 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -16,11 +16,17 @@ #pragma once +#include + #include #include namespace cudf::detail { +/// Sentinel value for `cudf::size_type` +cudf::size_type constexpr CUCO_CUDF_SIZE_TYPE_SENTINEL = -1; +// TODO: is it a mouthful? Maybe `CUCO_SIZE_TYPE_SENTINEL`? + /** * @brief Stream-ordered allocator adaptor used for cuco data structures * diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 1718848ae7e..821e760704a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -69,11 +69,8 @@ namespace detail { namespace hash { namespace { -int constexpr cg_size = 1; ///< Number of threads used to handle each input key -cudf::size_type constexpr key_sentinel = -1; ///< Sentinel value indicating an empty slot - using probing_scheme_type = cuco::linear_probing< - cg_size, + 1, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher>; @@ -580,7 +577,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = cuco::static_set{num_keys, 0.5, // desired load factor - cuco::empty_key{key_sentinel}, + cuco::empty_key{cudf::detail::CUCO_CUDF_SIZE_TYPE_SENTINEL}, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, From 2470c684f15ce4d0f8aeeba3927540887baf67af Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:26:47 -0800 Subject: [PATCH 06/16] Header cleanups --- cpp/src/groupby/hash/groupby.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 821e760704a..ec3b7fc2760 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -30,15 +29,11 @@ #include #include #include -#include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -51,13 +46,8 @@ #include -#include #include #include -#include - -#include -#include #include #include From 7da8c5561de48f046665db644c3de8b26ec2817c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:42:53 -0800 Subject: [PATCH 07/16] Update docs --- cpp/src/groupby/hash/groupby.cu | 4 ++-- cpp/src/groupby/hash/groupby_kernels.cuh | 30 +++++++++--------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ec3b7fc2760..ad5f43342d4 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -561,7 +561,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const d_row_hash = row_hash.device_hasher(has_null); // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash map + // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); auto const comparator_helper = [&](auto const d_key_equal) { @@ -584,7 +584,7 @@ std::unique_ptr
groupby(table_view const& keys, include_null_keys, stream); - // Extract the populated indices from the hash map and create a gather map. + // Extract the populated indices from the hash set and create a gather map. // Gathering using this map from sparse results will give dense results. auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index a6a09435589..a8e02266593 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -30,30 +30,22 @@ namespace detail { namespace hash { /** * @brief Compute single-pass aggregations and store results into a sparse - * `output_values` table, and populate `map` with indices of unique keys + * `output_values` table, and populate `set` with indices of unique keys * - * The hash map is built by inserting every row `i` from the `keys` and - * `values` tables as a single (key,value) pair. When the pair is inserted, if - * the key was not already present in the map, then the corresponding value is - * simply copied to the output. If the key was already present in the map, - * then the inserted `values` row is aggregated with the existing row. This - * aggregation is done for every element `j` in the row by applying aggregation - * operation `j` between the new and existing element. + * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If + * the index was not present in the set, insert the index and then copy it to the output. If the + * key was already present in the set, then the inserted index is aggregated with the existing row. + * This aggregation is done for every element `j` in the row by applying aggregation operation `j` + * between the new and existing element. * * Instead of storing the entire rows from `input_keys` and `input_values` in - * the hashmap, we instead store the row indices. 
For example, when inserting - * row at index `i` from `input_keys` into the hash map, the value `i` is what - * gets stored for the hash map's "key". It is assumed the `map` was constructed + * the hashset, we instead store the row indices. For example, when inserting + * row at index `i` from `input_keys` into the hash set, the value `i` is what + * gets stored for the hash set's "key". It is assumed the `set` was constructed * with a custom comparator that uses these row indices to check for equality * between key rows. For example, comparing two keys `k0` and `k1` will compare * the two rows `input_keys[k0] ?= input_keys[k1]` * - * Likewise, we store the row indices for the hash maps "values". These indices - * index into the `output_values` table. For a given key `k` (which is an index - * into `input_keys`), the corresponding value `v` indexes into `output_values` - * and stores the result of aggregating rows from `input_values` from rows of - * `input_keys` equivalent to the row at `k`. - * * The exact size of the result is not known a priori, but can be upper bounded * by the number of rows in `input_keys` & `input_values`. Therefore, it is * assumed `output_values` has sufficient storage for an equivalent number of @@ -74,9 +66,9 @@ struct compute_single_pass_aggs_fn { /** * @brief Construct a new compute_single_pass_aggs_fn functor object * - * @param set_ref Hash map object to insert key,value pairs into. + * @param set_ref Hash set object to insert key,value pairs into. * @param input_values The table whose rows will be aggregated in the values - * of the hash map + * of the hash set * @param output_values Table that stores the results of aggregating rows of * `input_values`. 
* @param aggs The set of aggregation operations to perform across the From 7dd59a6c10a88eee37a33304b2bf4d53ec686916 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:44:43 -0800 Subject: [PATCH 08/16] Minor doc updates --- cpp/src/groupby/hash/groupby_kernels.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index a8e02266593..f4d2e63af4e 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -29,8 +29,8 @@ namespace groupby { namespace detail { namespace hash { /** - * @brief Compute single-pass aggregations and store results into a sparse - * `output_values` table, and populate `set` with indices of unique keys + * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, + * and populate `set` with indices of unique keys * * The hash set is built by inserting every row index `i` from the `keys` and `values` tables. If * the index was not present in the set, insert they index and then copy it to the output. If the From 3cbdb7c4b7fa4e5de684c79be105eefe1ae37d0b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 13:53:25 -0800 Subject: [PATCH 09/16] Add peak memory usage metrics to groupby NV bencmarks --- cpp/benchmarks/groupby/group_max.cpp | 7 ++++++- cpp/benchmarks/groupby/group_nunique.cpp | 7 ++++++- cpp/benchmarks/groupby/group_rank.cpp | 7 ++++++- cpp/benchmarks/groupby/group_struct_keys.cpp | 9 +++++++-- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index e65c37f001d..b7b330f02e5 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include @@ -50,9 +51,13 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_max, diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 63d738b2951..8206c739e01 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include +#include #include @@ -63,9 +64,13 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list)) diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index 2122720a421..c00a2cef937 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include @@ -53,11 +54,15 @@ static void nvbench_groupby_rank(nvbench::state& state, cudf::groupby::groupby gb_obj( keys, cudf::null_policy::EXCLUDE, is_sorted ? 
cudf::sorted::YES : cudf::sorted::NO); + auto const mem_stats_logger = cudf::memory_stats_logger(); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; // groupby scan uses sort implementation auto result = gb_obj.scan(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } enum class rank_method : int32_t {}; diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 44a12c1c30e..cadd9c2d137 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include @@ -80,11 +81,15 @@ void bench_groupby_struct_keys(nvbench::state& state) requests[0].aggregations.push_back(cudf::make_min_aggregation()); // Set up nvbench default stream - auto stream = cudf::get_default_stream(); + auto const mem_stats_logger = cudf::memory_stats_logger(); + auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH(bench_groupby_struct_keys) From 82aa0ce00b9748ae2d71aaf2a72cd45fdf0285dd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Feb 2024 14:09:51 -0800 Subject: [PATCH 10/16] Revert some benchmark changes --- cpp/benchmarks/groupby/group_nunique.cpp | 7 +------ cpp/benchmarks/groupby/group_rank.cpp | 7 +------ 2 files changed, 2 insertions(+), 12 
deletions(-) diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 8206c739e01..63d738b2951 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include @@ -64,13 +63,9 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); - auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); - - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } NVBENCH_BENCH_TYPES(bench_groupby_nunique, NVBENCH_TYPE_AXES(nvbench::type_list)) diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index c00a2cef937..2122720a421 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ #include -#include #include #include @@ -54,15 +53,11 @@ static void nvbench_groupby_rank(nvbench::state& state, cudf::groupby::groupby gb_obj( keys, cudf::null_policy::EXCLUDE, is_sorted ? 
cudf::sorted::YES : cudf::sorted::NO); - auto const mem_stats_logger = cudf::memory_stats_logger(); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; // groupby scan uses sort implementation auto result = gb_obj.scan(requests); }); - - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } enum class rank_method : int32_t {}; From 4193c759f439b92b31faae33b87bec835bf0a2bf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 12:36:05 -0800 Subject: [PATCH 11/16] Fix pytests --- python/cudf/cudf/tests/test_groupby.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e8dbdd35352..232fc057702 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -55,12 +55,12 @@ def assert_groupby_results_equal( if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): expect = expect.sort_values(by=by).reset_index(drop=True) else: - expect = expect.sort_values().reset_index(drop=True) + expect = expect.sort_values(by=by).reset_index(drop=True) if isinstance(got, cudf.DataFrame): got = got.sort_values(by=by).reset_index(drop=True) else: - got = got.sort_values().reset_index(drop=True) + got = got.sort_values(by=by).reset_index(drop=True) assert_eq(expect, got, **kwargs) @@ -179,7 +179,7 @@ def test_groupby_agg_min_max_dictlist(nelem): def test_groupby_as_index_single_agg(pdf, gdf, as_index): gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") @pytest.mark.parametrize("engine", ["cudf", "jit"]) @@ -192,7 +192,7 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine): if PANDAS_GE_220: 
kwargs["include_groups"] = False pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs) - assert_groupby_results_equal(pdf, gdf) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") @pytest.mark.parametrize("as_index", [True, False]) @@ -3764,7 +3764,13 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` assert_groupby_results_equal( - actual, expected, check_names=False, check_index_type=False + actual, + expected, + check_names=False, + check_index_type=False, + as_index=as_index, + by=["gender", "education"], + sort=sort, ) From 574f628c36fa69b8658d7478ea40da17f019b4a0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 12:42:02 -0800 Subject: [PATCH 12/16] Renaming --- cpp/include/cudf/detail/cuco_helpers.hpp | 3 +-- cpp/src/groupby/hash/groupby.cu | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp index a030354fb3c..11a6f9a909b 100644 --- a/cpp/include/cudf/detail/cuco_helpers.hpp +++ b/cpp/include/cudf/detail/cuco_helpers.hpp @@ -24,8 +24,7 @@ namespace cudf::detail { /// Sentinel value for `cudf::size_type` -cudf::size_type constexpr CUCO_CUDF_SIZE_TYPE_SENTINEL = -1; -// TODO: is it a mouthful? Maybe `CUCO_SIZE_TYPE_SENTINEL`? +cudf::size_type constexpr CUDF_SIZE_TYPE_SENTINEL = -1; /** * @brief Stream-ordered allocator adaptor used for cuco data structures diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 458821cffae..dcb40edbddc 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -567,7 +567,7 @@ std::unique_ptr
groupby(table_view const& keys, auto const comparator_helper = [&](auto const d_key_equal) { auto const set = cuco::static_set{num_keys, 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUCO_CUDF_SIZE_TYPE_SENTINEL}, + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_key_equal, probing_scheme_type{d_row_hash}, cuco::thread_scope_device, From 75a8e6461accd2434e9b3311c8c3c01691bc4a93 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Feb 2024 16:36:17 -0800 Subject: [PATCH 13/16] Fix several docstring tests --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 28 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 89abd7be0ba..0a90b535040 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7693,7 +7693,7 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 - >>> df.value_counts() + >>> df.value_counts(sort=True) num_legs num_wings 4 0 2 2 2 1 diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a236a9b6abf..1bbacb29a1e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -109,11 +109,11 @@ def _is_row_of(chunk, obj): Parrot 30.0 Parrot 20.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(level=0).mean() +>>> ser.groupby(level=0, sort=True).mean() Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 ->>> ser.groupby(ser > 100).mean() +>>> ser.groupby(ser > 100, sort=True).mean() Max Speed False 25.0 True 370.0 @@ -133,7 +133,7 @@ def _is_row_of(chunk, obj): 1 Falcon 370.0 2 Parrot 24.0 3 Parrot 26.0 ->>> df.groupby(['Animal']).mean() +>>> df.groupby(['Animal'], sort=True).mean() Max Speed Animal Falcon 375.0 @@ -151,22 +151,22 @@ def _is_row_of(chunk, obj): Wild 350.0 Parrot Captive 30.0 Wild 20.0 ->>> df.groupby(level=0).mean() +>>> df.groupby(level=0, 
sort=True).mean() Max Speed Animal Falcon 370.0 Parrot 25.0 ->>> df.groupby(level="Type").mean() +>>> df.groupby(level="Type", sort=True).mean() Max Speed Type -Wild 185.0 Captive 210.0 +Wild 185.0 >>> df = cudf.DataFrame({{'A': 'a a b'.split(), ... 'B': [1,2,3], ... 'C': [4,6,5]}}) ->>> g1 = df.groupby('A', group_keys=False) ->>> g2 = df.groupby('A', group_keys=True) +>>> g1 = df.groupby('A', group_keys=False, sort=True) +>>> g2 = df.groupby('A', group_keys=True, sort=True) Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only differ in their ``group_keys`` argument. Calling `apply` in various ways, @@ -539,11 +539,11 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg('sum') + >>> a.groupby('a', sort=True).agg('sum') b c a - 2 3 1 1 3 4 + 2 3 1 Specifying a list of aggregations to perform on each column. @@ -553,12 +553,12 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg(['sum', 'min']) + >>> a.groupby('a', sort=True).agg(['sum', 'min']) b c sum min sum min a - 2 3 3 1 1 1 3 1 4 2 + 2 3 3 1 1 Using a dict to specify aggregations to perform per column. @@ -568,12 +568,12 @@ def agg(self, func): ... 'b': [1, 2, 3], ... 'c': [2, 2, 1] ... }) - >>> a.groupby('a').agg({'a': 'max', 'b': ['min', 'mean']}) + >>> a.groupby('a', sort=True).agg({'a': 'max', 'b': ['min', 'mean']}) a b max min mean a - 2 2 3 3.0 1 1 1 1.5 + 2 2 3 3.0 Using lambdas/callables to specify aggregations taking parameters. 
From 85a47db938b8ffc5e7dd9646eac38272458c37d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 23 Feb 2024 12:57:50 -0800 Subject: [PATCH 14/16] Make value_counts docstring test deterministic --- python/cudf/cudf/core/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0a90b535040..8f22b990443 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7693,10 +7693,10 @@ def value_counts( dog 4 0 cat 4 0 ant 6 0 - >>> df.value_counts(sort=True) + >>> df.value_counts().sort_index() num_legs num_wings - 4 0 2 2 2 1 + 4 0 2 6 0 1 Name: count, dtype: int64 """ From f79f1d6a08eb6bb6ce68cfcbea43544ddcb2a556 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Feb 2024 17:01:27 -0800 Subject: [PATCH 15/16] Update docs --- docs/cudf/source/user_guide/pandas-comparison.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md index 03ce58ea9e3..549d91b771a 100644 --- a/docs/cudf/source/user_guide/pandas-comparison.md +++ b/docs/cudf/source/user_guide/pandas-comparison.md @@ -87,7 +87,7 @@ using `.from_arrow()` or `.from_pandas()`. ## Result ordering -By default, `join` (or `merge`) and `groupby` operations in cuDF +By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF do *not* guarantee output ordering. 
Compare the results obtained from Pandas and cuDF below: From 56a222996273eb229d9aa0fd8a914012bcc21592 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Feb 2024 09:10:48 -0800 Subject: [PATCH 16/16] Add TODO reminder for future performance tuning --- cpp/src/groupby/hash/groupby.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index dcb40edbddc..acc1b087510 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -59,6 +59,8 @@ namespace detail { namespace hash { namespace { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1` for flat data to improve performance using probing_scheme_type = cuco::linear_probing< 1, ///< Number of threads used to handle each input key cudf::experimental::row::hash::device_row_hasher