Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix has_null predicate for drop_list_duplicates on nested structs #10436

Merged
merged 4 commits into from
Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions cpp/src/lists/drop_list_duplicates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,6 @@ struct get_indices_of_unique_entries_dispatch {
size_type*,
null_equality,
nan_equality,
bool,
duplicate_keep_option,
rmm::cuda_stream_view) const
{
Expand All @@ -370,7 +369,6 @@ struct get_indices_of_unique_entries_dispatch {
size_type* output_begin,
null_equality nulls_equal,
nan_equality nans_equal,
bool has_nulls,
duplicate_keep_option keep_option,
rmm::cuda_stream_view stream) const noexcept
{
Expand All @@ -379,7 +377,7 @@ struct get_indices_of_unique_entries_dispatch {
*d_view,
*d_view,
nulls_equal,
has_nulls,
all_lists_entries.has_nulls(),
nans_equal == nan_equality::ALL_EQUAL};
return cudf::detail::unique_copy(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_entries),
Expand All @@ -396,19 +394,20 @@ struct get_indices_of_unique_entries_dispatch {
size_type* output_begin,
null_equality nulls_equal,
nan_equality nans_equal,
bool has_nulls,
duplicate_keep_option keep_option,
rmm::cuda_stream_view stream) const noexcept
{
auto const flattened_entries = cudf::structs::detail::flatten_nested_columns(
table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {});
auto const dview_ptr = table_device_view::create(flattened_entries, stream);
// search through children of all levels for null values
bool nested_has_nulls = has_nulls(flattened_entries.flattened_columns());
sperlingxx marked this conversation as resolved.
Show resolved Hide resolved

auto const comp = table_row_comparator_fn{list_indices,
*dview_ptr,
*dview_ptr,
nulls_equal,
has_nulls,
nested_has_nulls,
nans_equal == nan_equality::ALL_EQUAL};
return cudf::detail::unique_copy(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_entries),
Expand Down Expand Up @@ -447,7 +446,6 @@ std::vector<std::unique_ptr<column>> get_unique_entries_and_list_indices(
output_begin,
nulls_equal,
nans_equal,
keys_entries.has_nulls(),
keep_option,
stream);

Expand Down
106 changes: 105 additions & 1 deletion cpp/tests/lists/drop_list_duplicates_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -734,6 +734,110 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull)
}
}

TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull)
{
using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level
auto constexpr YYY = int32_t{0}; // nulls at the parent structs column level
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm struggling with the distinction between XXX and YYY here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same for me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@davidwendt Sorry, that was my mistake. XXX and YYY should be assigned different values, so that the
null structs have children with different values.

auto constexpr null = int32_t{0}; // nulls at the children columns level

// Builds the input entries for this test: a structs column whose only child (`child1`) is itself
// a structs column holding two grandchildren, a ColWrapper column and a StringsCol column.
// Each grandchild lists 24 values, grouped by the inline markers into three lists of 8.
auto const get_nested_structs = [] {
// `null` placeholders (indices 2, 14, 16, 17) are the rows listed in this grandchild's own
// nulls_at(); `XXX`/`YYY` placeholders (indices 1, 3, 4) are the rows listed in child1's nulls_at().
auto grandchild1 = ColWrapper{{
1, XXX, null, XXX, YYY, 1, 1, 1, // list1
1, 1, 1, 1, 2, 1, null, 2, // list2
null, null, 2, 2, 3, 2, 3, 3 // list3
},
nulls_at({2, 14, 16, 17})};
// Strings grandchild: the "" /*NULL*/ entries (indices 14 and 20) are the rows listed in its own
// nulls_at(); the "XXX"/"YYY" literals occupy the rows listed in child1's nulls_at().
auto grandchild2 = StringsCol{{
// begin list1
"Banana",
"YYY", /*NULL*/
"Apple",
"XXX", /*NULL*/
"YYY", /*NULL*/
"Banana",
"Cherry",
"Kiwi", // end list1
// begin list2
"Bear",
"Duck",
"Cat",
"Dog",
"Panda",
"Bear",
"" /*NULL*/,
"Panda", // end list2
// begin list3
"ÁÁÁ",
"ÉÉÉÉÉ",
"ÍÍÍÍÍ",
"ÁBC",
"" /*NULL*/,
"ÁÁÁ",
"ÁBC",
"XYZ" // end list3
},
nulls_at({14, 20})};
// The struct-level null indices (1, 3, 4) are given on the middle struct level only; the
// outermost struct returned below is constructed without a null specification.
auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})};
return StructsCol{{child1}};
};

// Builds the entries the test compares the drop_list_duplicates result against. It has the same
// nesting as the input (outer struct -> `child1` struct -> two grandchildren) but 19 rows,
// grouped by the inline markers into lists of 5, 6 and 8 entries.
auto const get_nested_struct_expected = [] {
// Index 4 (`XXX`) is the row listed in child1's nulls_at() and is also listed as null here;
// indices 3, 10, 17 and 18 (`null`) are listed as null in this grandchild only.
auto grandchild1 =
ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null},
nulls_at({3, 4, 10, 17, 18})};
// Strings grandchild: index 4 ("XXX") is the row listed in child1's nulls_at(); indices 10 and 16
// ("") are listed as null in this grandchild only.
auto grandchild2 = StringsCol{{
// begin list1
"Banana",
"Cherry",
"Kiwi",
"Apple",
"XXX" /*NULL*/, // end list1
// begin list2
"Bear",
"Cat",
"Dog",
"Duck",
"Panda",
"" /*NULL*/, // end list2
// begin list3
"ÁBC",
"ÁÁÁ",
"ÍÍÍÍÍ",
"XYZ",
"ÁBC",
"" /*NULL*/,
"ÁÁÁ",
"ÉÉÉÉÉ" // end list3
},
nulls_at({4, 10, 16})};
// Only one struct-level null index (4) is given for `child1` here, where the input fixture
// gives three; the outermost struct is again constructed without a null specification.
auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})};
return StructsCol{{child1}};
};

// Test full columns.
{
auto const lists = cudf::make_lists_column(
3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {});
auto const expected = cudf::make_lists_column(
3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {});
auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity);
}

// Test sliced columns.
{
auto const lists_original = cudf::make_lists_column(
3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {});
auto const expected_original = cudf::make_lists_column(
3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {});
auto const lists = cudf::slice(lists_original->view(), {1, 3})[0];
auto const expected = cudf::slice(expected_original->view(), {1, 3})[0];
auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity);
}
}

TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs)
{
auto const h_child = std::vector<float_type>{
Expand Down