Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update groupby::hash to use new row operators for keys #10770

Merged
merged 33 commits into from
May 25, 2022
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d335153
Use new row hasher and comparator
PointKernel May 2, 2022
b227ec4
Get rid of Map template
PointKernel May 2, 2022
3deb64b
Fix a bug: update the lifecycle of preprocessed table
PointKernel May 5, 2022
1c7d9f4
Merge remote-tracking branch 'upstream/branch-22.06' into groupby-new…
PointKernel May 5, 2022
bf94a8d
Get rid of flattened columns
PointKernel May 6, 2022
80d8f87
Fix a bug: keys always have nulls
PointKernel May 9, 2022
5f704ec
Pass shared_ptr of preprocessed table by value
PointKernel May 9, 2022
6de7c0b
Add structs argmax unit tests
PointKernel May 10, 2022
70d740f
Add basic list tests
PointKernel May 10, 2022
1a70016
Add all null input tests
PointKernel May 10, 2022
965eba4
Add lists with nulls tests
PointKernel May 11, 2022
762bf69
Fix a lifetime bug for row operators
PointKernel May 11, 2022
a000e65
Fix a bug: check nested nulls when initing row operators
PointKernel May 12, 2022
208f224
Add const + comments
PointKernel May 12, 2022
7232a20
Merge remote-tracking branch 'upstream/branch-22.06' into groupby-new…
PointKernel May 12, 2022
b6346e5
Consistently use has_nested_nulls
PointKernel May 12, 2022
2f70d8f
Use auto const consistently
PointKernel May 17, 2022
4e6de36
Remove unused parameter
PointKernel May 17, 2022
935ccf6
Move test to proper file
PointKernel May 17, 2022
bd27723
Minor cleanups
PointKernel May 18, 2022
d4724be
Add group struct keys benchmark
PointKernel May 19, 2022
9aa2f8d
Remove unnecessary sync
PointKernel May 19, 2022
055c31a
Remove unused parameter
PointKernel May 20, 2022
abdb431
Update unit test
PointKernel May 20, 2022
70ca9a0
Improvement: use flattened keys to compute row bitmask
PointKernel May 20, 2022
ece4321
Add tests for lists with null elements
PointKernel May 24, 2022
55902dd
Minor cleanups
PointKernel May 24, 2022
9086f33
Add exception to null exclude case
PointKernel May 24, 2022
6f170d8
Revert changes to match pandas dropna behavior
PointKernel May 24, 2022
c8b1aab
Update unit tests to exercise null elements in list keys
PointKernel May 24, 2022
002ad40
Minor cleanup
PointKernel May 24, 2022
b76677c
Remove unused header
PointKernel May 24, 2022
efd497e
Throw when null structs are excluded
PointKernel May 24, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,13 @@ ConfigureBench(
groupby/group_sum.cu
groupby/group_nth.cu
groupby/group_shift.cu
groupby/group_struct.cu
groupby/group_struct_values.cpp
groupby/group_no_requests.cu
groupby/group_scan.cu
groupby/group_rank_benchmark.cu
)

ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)
ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu groupby/group_struct_keys.cpp)

# ##################################################################################################
# * hashing benchmark -----------------------------------------------------------------------------
Expand Down
101 changes: 101 additions & 0 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

#include <random>

/**
 * @brief NVBench body that times `cudf::groupby::groupby::aggregate` with a MIN
 * aggregation when the key is a single integer column wrapped in `Depth` struct layers.
 *
 * Axes read from `state`:
 *  - "NumRows": number of rows in the key and value tables.
 *  - "Depth":   number of struct layers wrapped around the leaf key column
 *               (0 leaves the key as a plain integer column).
 *  - "Nulls":   non-zero gives the leaf key column a validity iterator that is
 *               false for every 10th row.
 *
 * @param state NVBench state providing the axis values and running the measurement.
 */
void bench_groupby_struct_keys(nvbench::state& state)
{
  cudf::rmm_pool_raii pool_raii;

  using Type           = int;
  using column_wrapper = cudf::test::fixed_width_column_wrapper<Type>;
  // Default-constructed engine: no explicit seed is supplied.
  std::default_random_engine generator;
  std::uniform_int_distribution<int> distribution(0, 100);

  const cudf::size_type n_rows{static_cast<cudf::size_type>(state.get_int64("NumRows"))};
  const cudf::size_type n_cols{1};
  const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("Depth"))};
  const bool nulls{static_cast<bool>(state.get_int64("Nulls"))};

  // Create columns with values in the closed range [0, 100]
  // (std::uniform_int_distribution includes both bounds).
  std::vector<column_wrapper> columns;
  columns.reserve(n_cols);
  std::generate_n(std::back_inserter(columns), n_cols, [&]() {
    // The row index is irrelevant: every element is a fresh random draw.
    auto const elements = cudf::detail::make_counting_transform_iterator(
      0, [&](auto) { return distribution(generator); });
    if (!nulls) return column_wrapper(elements, elements + n_rows);
    auto valids =
      cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 10 != 0; });
    return column_wrapper(elements, elements + n_rows, valids);
  });

  std::vector<std::unique_ptr<cudf::column>> cols;
  cols.reserve(columns.size());
  std::transform(columns.begin(), columns.end(), std::back_inserter(cols), [](column_wrapper& col) {
    return col.release();
  });

  std::vector<std::unique_ptr<cudf::column>> child_cols = std::move(cols);
  // Add some layers: each iteration wraps the current column(s) in one more struct level.
  // NOTE(review): struct-level validity is generated regardless of the "Nulls" axis, so
  // Depth > 0 may still yield invalid struct rows when Nulls == 0 -- confirm this is intended.
  for (int i = 0; i < depth; i++) {
    std::vector<bool> struct_validity;
    struct_validity.reserve(n_rows);
    // A drawn 0 converts to false, so the chance of a false entry shrinks as the layer
    // index grows. Presumably false marks the struct row as null -- TODO confirm.
    std::uniform_int_distribution<int> bool_distribution(0, 100 * (i + 1));
    std::generate_n(
      std::back_inserter(struct_validity), n_rows, [&]() { return bool_distribution(generator); });
    cudf::test::structs_column_wrapper struct_col(std::move(child_cols), struct_validity);
    // Re-create the moved-from vector before reusing it for the next layer.
    child_cols = std::vector<std::unique_ptr<cudf::column>>{};
    child_cols.push_back(struct_col.release());
  }

  // Profile for the single int64 values column: uniform distribution with bounds 0 and 100.
  data_profile profile;
  profile.set_null_frequency(std::nullopt);
  profile.set_cardinality(0);
  profile.set_distribution_params<int64_t>(
    cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);

  auto const keys_table = cudf::table(std::move(child_cols));
  auto const vals_table =
    create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);

  cudf::groupby::groupby gb_obj(keys_table.view());

  // One request: MIN over the only values column.
  std::vector<cudf::groupby::aggregation_request> requests;
  auto& request  = requests.emplace_back();
  request.values = vals_table->get_column(0).view();
  request.aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());

  // Set up nvbench default stream
  auto stream = rmm::cuda_stream_default;
  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

  // Only the aggregate call is inside the measured callable; the setup above is not.
  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { auto const result = gb_obj.aggregate(requests); });
}

// Register bench_groupby_struct_keys with NVBench and declare the axes it reads
// through state.get_int64():
//   NumRows - row count; the power-of-two axis presumably expands {10, 16, 20}
//             to 2^10, 2^16 and 2^20 rows (confirm against the NVBench docs)
//   Depth   - number of struct layers wrapped around the leaf key column
//   Nulls   - cast to bool; non-zero gives the leaf key column a validity iterator
NVBENCH_BENCH(bench_groupby_struct_keys)
.set_name("groupby_struct_keys")
.add_int64_power_of_two_axis("NumRows", {10, 16, 20})
.add_int64_axis("Depth", {0, 1, 8})
.add_int64_axis("Nulls", {0, 1});
5 changes: 2 additions & 3 deletions cpp/include/cudf/detail/groupby.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,13 +31,12 @@ namespace hash {
* @brief Indicates if a set of aggregation requests can be satisfied with a
* hash-based groupby implementation.
*
* @param keys The table of keys
* @param requests The set of columns to aggregate and the aggregations to
* perform
* @return true A hash-based groupby can be used
* @return false A hash-based groupby cannot be used
*/
bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests);
bool can_use_hash_groupby(host_span<aggregation_request const> requests);

// Hash-based groupby
std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
Expand Down
12 changes: 2 additions & 10 deletions cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
// Only use hash groupby if the keys aren't sorted and all requests can be
// satisfied with a hash implementation
if (_keys_are_sorted == sorted::NO and not _helper and
detail::hash::can_use_hash_groupby(_keys, requests)) {
// Optionally flatten nested key columns.
auto flattened = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
auto flattened_keys = flattened.flattened_columns();
auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); };
CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type),
"Unsupported groupby key type does not support equality comparison");
auto [grouped_keys, results] =
detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results));
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
detail::hash::can_use_hash_groupby(requests)) {
return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr);
} else {
return sort_aggregate(requests, stream, mr);
}
Expand Down
Loading