Skip to content

Commit

Permalink
Explicitly disable grouping on LIST.
Browse files Browse the repository at this point in the history
Fixes rapidsai#8905.

Attempting groupby aggregations with LIST keys leads to silent
failures and bad results.
For instance, attempting hash-based groupby aggregations with LIST
keys only fails on DEBUG builds, thus:
```
/home/myth/dev/cudf/2/cpp/include/cudf/table/row_operators.cuh:447: unsigned int cudf:
:element_hasher_with_seed<hash_function, has_nulls>::operator()(cudf::column_device_view, signed in
t) const [with T = cudf::list_view; void *<anonymous> = (void *)nullptr; hash_function = default_ha
sh; __nv_bool has_nulls = false]: block: [0,0,0], thread: [0,0,0] Assertion `false && "Unsupported
type in hash."` failed.
```
In RELEASE builds, a copy of the input LIST column is returned, causing
each output row to be interpreted as its own group.

This commit adds an explicit failure for unsupported LIST groupby keys.
  • Loading branch information
mythrocks committed Sep 14, 2021
1 parent eab2486 commit 01db2a3
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 10 deletions.
3 changes: 3 additions & 0 deletions cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ groupby::groupby(table_view const& keys,
_column_order{column_order},
_null_precedence{null_precedence}
{
auto is_list = [](auto const& col) { return col.type().id() == cudf::type_id::LIST; };
auto has_list = std::any_of(keys.begin(), keys.end(), is_list);
CUDF_EXPECTS(not has_list, "Groupby does not support LIST columns as keys.");
}

// Select hash vs. sort groupby implementation
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ ConfigureTest(GROUPBY_TEST
groupby/count_tests.cpp
groupby/groups_tests.cpp
groupby/keys_tests.cpp
groupby/lists_tests.cpp
groupby/m2_tests.cpp
groupby/min_tests.cpp
groupby/max_scan_tests.cpp
Expand Down
69 changes: 69 additions & 0 deletions cpp/tests/groupby/lists_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <tests/groupby/groupby_test_util.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <cudf/detail/aggregation/aggregation.hpp>

namespace cudf {
namespace test {

template <typename V>
struct groupby_lists_test : public cudf::test::BaseFixture {
};

TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes);

namespace {
// Checking with a single aggregation, and aggregation column.
// This test is orthogonal to the aggregation type; it focuses on testing the grouping
// with LISTS keys.
auto sum_agg() { return cudf::make_sum_aggregation<groupby_aggregation>(); }

void test_sort_based_sum_agg(column_view const& keys, column_view const& values)
{
test_single_agg(
keys, values, keys, values, sum_agg(), force_use_sort_impl::YES, null_policy::INCLUDE);
}

void test_hash_based_sum_agg(column_view const& keys, column_view const& values)
{
test_single_agg(
keys, values, keys, values, sum_agg(), force_use_sort_impl::NO, null_policy::INCLUDE);
}

} // namespace

TYPED_TEST(groupby_lists_test, top_level_lists_are_unsupported)
{
// Test that grouping on LISTS columns fails visibly.

// clang-format off
auto keys = lists_column_wrapper<TypeParam, int32_t> { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} };
auto values = fixed_width_column_wrapper<int32_t> { 0, 1, 2, 3, 4 };
// clang-format on

EXPECT_THROW(test_sort_based_sum_agg(keys, values), cudf::logic_error);
EXPECT_THROW(test_hash_based_sum_agg(keys, values), cudf::logic_error);
}

} // namespace test
} // namespace cudf
35 changes: 25 additions & 10 deletions cpp/tests/groupby/structs_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
#include <cudf_test/type_lists.hpp>

#include <cudf/detail/aggregation/aggregation.hpp>
#include "cudf/aggregation.hpp"
#include "cudf/types.hpp"

using namespace cudf::test::iterators;

Expand All @@ -34,7 +32,7 @@ template <typename V>
struct groupby_structs_test : public cudf::test::BaseFixture {
};

TYPED_TEST_CASE(groupby_structs_test, cudf::test::FixedWidthTypes);
TYPED_TEST_SUITE(groupby_structs_test, cudf::test::FixedWidthTypes);

using V = int32_t; // Type of Aggregation Column.
using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member.
Expand Down Expand Up @@ -79,27 +77,43 @@ void print_agg_results(column_view const& keys, column_view const& vals)
}
}

void test_sum_agg(column_view const& keys,
column_view const& values,
column_view const& expected_keys,
column_view const& expected_values)
void test_sort_based_sum_agg(column_view const& keys,
column_view const& values,
column_view const& expected_keys,
column_view const& expected_values)
{
test_single_agg(keys,
values,
expected_keys,
expected_values,
sum_agg(),
force_use_sort_impl::NO,
force_use_sort_impl::YES,
null_policy::INCLUDE);
}

void test_hash_based_sum_agg(column_view const& keys,
column_view const& values,
column_view const& expected_keys,
column_view const& expected_values)
{
test_single_agg(keys,
values,
expected_keys,
expected_values,
sum_agg(),
force_use_sort_impl::YES,
force_use_sort_impl::NO,
null_policy::INCLUDE);
}

void test_sum_agg(column_view const& keys,
column_view const& values,
column_view const& expected_keys,
column_view const& expected_values)
{
test_sort_based_sum_agg(keys, values, expected_keys, expected_values);
test_hash_based_sum_agg(keys, values, expected_keys, expected_values);
}

} // namespace

TYPED_TEST(groupby_structs_test, basic)
Expand Down Expand Up @@ -312,7 +326,8 @@ TYPED_TEST(groupby_structs_test, lists_are_unsupported)
// clang-format on
auto keys = structs{{member_0, member_1}};

EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error);
EXPECT_THROW(test_sort_based_sum_agg(keys, values, keys, values), cudf::logic_error);
EXPECT_THROW(test_hash_based_sum_agg(keys, values, keys, values), cudf::logic_error);
}

} // namespace test
Expand Down

0 comments on commit 01db2a3

Please sign in to comment.