Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COLLECT_LIST support returning empty output columns. #8279

Merged
merged 3 commits into from
May 21, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,31 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
groupby::~groupby() = default;

namespace {

/**
* @brief Factory to construct empty result columns.
*
* Adds special handling for COLLECT_LIST/COLLECT_SET, because:
* 1. `make_empty_column()` does not support construction of nested columns.
* 2. Empty lists need empty child columns, to persist type information.
*/
struct empty_column_constructor {
column_view values;

template <aggregation::Kind k>
std::unique_ptr<cudf::column> operator()() const
{
using namespace cudf;

if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) {
return make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), empty_like(values), 0, {});
}

return make_empty_column(cudf::detail::target_type(values.type(), k));
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
}
};

/// Make an empty table with appropriate types for requested aggs
auto empty_results(host_span<aggregation_request const> requests)
{
Expand All @@ -88,13 +113,13 @@ auto empty_results(host_span<aggregation_request const> requests)
requests.begin(), requests.end(), std::back_inserter(empty_results), [](auto const& request) {
std::vector<std::unique_ptr<column>> results;

std::transform(
request.aggregations.begin(),
request.aggregations.end(),
std::back_inserter(results),
[&request](auto const& agg) {
return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind));
});
std::transform(request.aggregations.begin(),
request.aggregations.end(),
std::back_inserter(results),
[&request](auto const& agg) {
return cudf::detail::aggregation_dispatcher(
agg->kind, empty_column_constructor{request.values});
});

return aggregation_result{std::move(results)};
});
Expand Down
70 changes: 70 additions & 0 deletions cpp/tests/groupby/collect_list_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion)
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput)
{
using K = int32_t;
using V = TypeParam;

fixed_width_column_wrapper<K, int32_t> keys{};
fixed_width_column_wrapper<V, int32_t> values{};

fixed_width_column_wrapper<K, int32_t> expect_keys{};
lists_column_wrapper<V, int32_t> expect_vals{};

auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE);
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectLists)
{
using K = int32_t;
Expand Down Expand Up @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion)
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists)
{
using K = int32_t;
using V = TypeParam;

using LCW = cudf::test::lists_column_wrapper<V, int32_t>;

auto offsets = data_type{type_to_id<offset_type>()};

fixed_width_column_wrapper<K, int32_t> keys{};
auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {});

fixed_width_column_wrapper<K, int32_t> expect_keys{};

auto expect_child =
cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {});
auto expect_values =
cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {});

auto agg = cudf::make_collect_list_aggregation();
test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs)
{
using K = int32_t;
using V = TypeParam;

using LCW = cudf::test::lists_column_wrapper<V, int32_t>;

fixed_width_column_wrapper<K, int32_t> keys{};
auto struct_child = LCW{};
auto struct_column = structs_column_wrapper{{struct_child}};

auto values = cudf::make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), struct_column.release(), 0, {});

fixed_width_column_wrapper<K, int32_t> expect_keys{};

auto expect_struct_child = LCW{};
auto expect_struct_column = structs_column_wrapper{{expect_struct_child}};

auto expect_child =
cudf::make_lists_column(0,
make_empty_column(data_type{type_to_id<offset_type>()}),
expect_struct_column.release(),
0,
{});
auto expect_values = cudf::make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), std::move(expect_child), 0, {});

auto agg = cudf::make_collect_list_aggregation();
test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, dictionary)
{
using K = int32_t;
Expand Down
3 changes: 1 addition & 2 deletions cpp/tests/groupby/collect_set_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool);
TYPED_TEST(CollectSetTypedTest, TrivialInput)
{
// Empty input
// TODO: Enable this test after issue#7611 has been fixed
// test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET);
test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set());

// Single key input
{
Expand Down