Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COLLECT_LIST support returning empty output columns. #8279

Merged
merged 3 commits into from
May 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,44 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
groupby::~groupby() = default;

namespace {

/**
* @brief Factory to construct empty result columns.
*
* Adds special handling for COLLECT_LIST/COLLECT_SET, because:
* 1. `make_empty_column()` does not support construction of nested columns.
* 2. Empty lists need empty child columns, to persist type information.
*/
struct empty_column_constructor {
column_view values;

template <typename ValuesType, aggregation::Kind k>
std::unique_ptr<cudf::column> operator()() const
{
using namespace cudf;
using namespace cudf::detail;

if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) {
return make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), empty_like(values), 0, {});
}

// If `values` is LIST typed, and the aggregation results match the type,
// construct empty results based on `values`.
// Most generally, this applies if input type matches output type.
//
// Note: `target_type_t` is not recursive, and `ValuesType` does not consider children.
// It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this
// point, because `COLLECT_LIST(LIST)` produces `LIST<LIST>`, but `target_type_t`
// wouldn't know the difference.
if constexpr (std::is_same_v<target_type_t<ValuesType, k>, ValuesType>) {
return empty_like(values);
}

return make_empty_column(target_type(values.type(), k));
}
};

/// Make an empty table with appropriate types for requested aggs
auto empty_results(host_span<aggregation_request const> requests)
{
Expand All @@ -93,7 +131,8 @@ auto empty_results(host_span<aggregation_request const> requests)
request.aggregations.end(),
std::back_inserter(results),
[&request](auto const& agg) {
return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind));
return cudf::detail::dispatch_type_and_aggregation(
request.values.type(), agg->kind, empty_column_constructor{request.values});
});

return aggregation_result{std::move(results)};
Expand Down
70 changes: 70 additions & 0 deletions cpp/tests/groupby/collect_list_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion)
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput)
{
using K = int32_t;
using V = TypeParam;

fixed_width_column_wrapper<K, int32_t> keys{};
fixed_width_column_wrapper<V, int32_t> values{};

fixed_width_column_wrapper<K, int32_t> expect_keys{};
lists_column_wrapper<V, int32_t> expect_vals{};

auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE);
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectLists)
{
using K = int32_t;
Expand Down Expand Up @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion)
test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists)
{
using K = int32_t;
using V = TypeParam;

using LCW = cudf::test::lists_column_wrapper<V, int32_t>;

auto offsets = data_type{type_to_id<offset_type>()};

fixed_width_column_wrapper<K, int32_t> keys{};
auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {});

fixed_width_column_wrapper<K, int32_t> expect_keys{};

auto expect_child =
cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {});
auto expect_values =
cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {});

auto agg = cudf::make_collect_list_aggregation();
test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs)
{
using K = int32_t;
using V = TypeParam;

using LCW = cudf::test::lists_column_wrapper<V, int32_t>;

fixed_width_column_wrapper<K, int32_t> keys{};
auto struct_child = LCW{};
auto struct_column = structs_column_wrapper{{struct_child}};

auto values = cudf::make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), struct_column.release(), 0, {});

fixed_width_column_wrapper<K, int32_t> expect_keys{};

auto expect_struct_child = LCW{};
auto expect_struct_column = structs_column_wrapper{{expect_struct_child}};

auto expect_child =
cudf::make_lists_column(0,
make_empty_column(data_type{type_to_id<offset_type>()}),
expect_struct_column.release(),
0,
{});
auto expect_values = cudf::make_lists_column(
0, make_empty_column(data_type{type_to_id<offset_type>()}), std::move(expect_child), 0, {});

auto agg = cudf::make_collect_list_aggregation();
test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
}

TYPED_TEST(groupby_collect_list_test, dictionary)
{
using K = int32_t;
Expand Down
3 changes: 1 addition & 2 deletions cpp/tests/groupby/collect_set_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool);
TYPED_TEST(CollectSetTypedTest, TrivialInput)
{
// Empty input
// TODO: Enable this test after issue#7611 has been fixed
// test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET);
test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set());

// Single key input
{
Expand Down
40 changes: 40 additions & 0 deletions cpp/tests/groupby/nth_element_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,5 +362,45 @@ TEST_F(groupby_nth_element_string_test, dictionary)
keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2));
}

template <typename T>
struct groupby_nth_element_lists_test : BaseFixture {
};

TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint);

TYPED_TEST(groupby_nth_element_lists_test, Basics)
{
using K = int32_t;
using V = TypeParam;

using lists = cudf::test::lists_column_wrapper<V, int32_t>;

auto keys = fixed_width_column_wrapper<K, int32_t>{1, 1, 2, 2, 3, 3};
auto values = lists{{1, 2}, {3, 4}, {5, 6, 7}, lists{}, {9, 10}, {11}};

auto expected_keys = fixed_width_column_wrapper<K, int32_t>{1, 2, 3};
auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}};

test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0));
}

TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
{
using K = int32_t;
using V = TypeParam;

using lists = cudf::test::lists_column_wrapper<V, int32_t>;

auto keys = fixed_width_column_wrapper<K, int32_t>{};
auto values = lists{};

auto expected_keys = fixed_width_column_wrapper<K, int32_t>{};
auto expected_values = lists{};

test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2));
}

} // namespace test
} // namespace cudf