Skip to content

Commit

Permalink
Fix has_null predicate for drop_list_duplicates on nested structs (#1…
Browse files Browse the repository at this point in the history
…0436)

For now, `get_indices_of_unique_entries_dispatch` only checks the outest layer of structures to determine whether the column contains null or not. It leads to the omission of intra null structures.
In current, 
`[Struct(Struct(1, "a", valid)), Struct(Struct(1, "a", valid), Struct(Struct(2, "a", invalid), Struct(Struct(1, "b", invalid)]` becomes
`[Struct(Struct(1, "a", valid)), Struct(Struct(2, "a", invalid), Struct(Struct(1, "b", invalid)]` after droping list duplicates even if null equality is EQUAL. However, it should be 
`[Struct(Struct(1, "a", valid)), Struct(Struct(2, "a", invalid)]`, because `Struct(1, "a")` and `Struct(2, "a")` are both invalid, despite they contain different elements.

Current PR is to fix above problem by checking all flattened coulumns for nulls.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: #10436
  • Loading branch information
sperlingxx authored Mar 17, 2022
1 parent 5d3f7dc commit 87180ce
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 7 deletions.
10 changes: 4 additions & 6 deletions cpp/src/lists/drop_list_duplicates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,6 @@ struct get_indices_of_unique_entries_dispatch {
size_type*,
null_equality,
nan_equality,
bool,
duplicate_keep_option,
rmm::cuda_stream_view) const
{
Expand All @@ -370,7 +369,6 @@ struct get_indices_of_unique_entries_dispatch {
size_type* output_begin,
null_equality nulls_equal,
nan_equality nans_equal,
bool has_nulls,
duplicate_keep_option keep_option,
rmm::cuda_stream_view stream) const noexcept
{
Expand All @@ -379,7 +377,7 @@ struct get_indices_of_unique_entries_dispatch {
*d_view,
*d_view,
nulls_equal,
has_nulls,
all_lists_entries.has_nulls(),
nans_equal == nan_equality::ALL_EQUAL};
return cudf::detail::unique_copy(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_entries),
Expand All @@ -396,19 +394,20 @@ struct get_indices_of_unique_entries_dispatch {
size_type* output_begin,
null_equality nulls_equal,
nan_equality nans_equal,
bool has_nulls,
duplicate_keep_option keep_option,
rmm::cuda_stream_view stream) const noexcept
{
auto const flattened_entries = cudf::structs::detail::flatten_nested_columns(
table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {});
auto const dview_ptr = table_device_view::create(flattened_entries, stream);
// Search through children of all levels for nulls.
auto const nested_has_nulls = has_nulls(flattened_entries.flattened_columns());

auto const comp = table_row_comparator_fn{list_indices,
*dview_ptr,
*dview_ptr,
nulls_equal,
has_nulls,
nested_has_nulls,
nans_equal == nan_equality::ALL_EQUAL};
return cudf::detail::unique_copy(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_entries),
Expand Down Expand Up @@ -447,7 +446,6 @@ std::vector<std::unique_ptr<column>> get_unique_entries_and_list_indices(
output_begin,
nulls_equal,
nans_equal,
keys_entries.has_nulls(),
keep_option,
stream);

Expand Down
109 changes: 108 additions & 1 deletion cpp/tests/lists/drop_list_duplicates_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -734,6 +734,113 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull)
}
}

TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull)
{
using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
auto constexpr null = int32_t{0}; // nulls at the children columns level
// XXX and YYY are int placeholders for nulls at parent structs column level.
// We bring up two placeholders of different values to create intra null structs with
// children of different values, so as to test whether null_equality::EQUAL works or not.
auto constexpr XXX = int32_t{5};
auto constexpr YYY = int32_t{6};

auto const get_nested_structs = [] {
auto grandchild1 = ColWrapper{{
1, XXX, null, XXX, YYY, 1, 1, 1, // list1
1, 1, 1, 1, 2, 1, null, 2, // list2
null, null, 2, 2, 3, 2, 3, 3 // list3
},
nulls_at({2, 14, 16, 17})};
auto grandchild2 = StringsCol{{
// begin list1
"Banana",
"YYY", /*NULL*/
"Apple",
"XXX", /*NULL*/
"YYY", /*NULL*/
"Banana",
"Cherry",
"Kiwi", // end list1
// begin list2
"Bear",
"Duck",
"Cat",
"Dog",
"Panda",
"Bear",
"" /*NULL*/,
"Panda", // end list2
// begin list3
"ÁÁÁ",
"ÉÉÉÉÉ",
"ÍÍÍÍÍ",
"ÁBC",
"" /*NULL*/,
"ÁÁÁ",
"ÁBC",
"XYZ" // end list3
},
nulls_at({14, 20})};
auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})};
return StructsCol{{child1}};
};

auto const get_nested_struct_expected = [] {
auto grandchild1 =
ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null},
nulls_at({3, 4, 10, 17, 18})};
auto grandchild2 = StringsCol{{
// begin list1
"Banana",
"Cherry",
"Kiwi",
"Apple",
"XXX" /*NULL*/, // end list1
// begin list2
"Bear",
"Cat",
"Dog",
"Duck",
"Panda",
"" /*NULL*/, // end list2
// begin list3
"ÁBC",
"ÁÁÁ",
"ÍÍÍÍÍ",
"XYZ",
"ÁBC",
"" /*NULL*/,
"ÁÁÁ",
"ÉÉÉÉÉ" // end list3
},
nulls_at({4, 10, 16})};
auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})};
return StructsCol{{child1}};
};

// Test full columns.
{
auto const lists = cudf::make_lists_column(
3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {});
auto const expected = cudf::make_lists_column(
3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {});
auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity);
}

// Test sliced columns.
{
auto const lists_original = cudf::make_lists_column(
3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {});
auto const expected_original = cudf::make_lists_column(
3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {});
auto const lists = cudf::slice(lists_original->view(), {1, 3})[0];
auto const expected = cudf::slice(expected_original->view(), {1, 3})[0];
auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity);
}
}

TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs)
{
auto const h_child = std::vector<float_type>{
Expand Down

0 comments on commit 87180ce

Please sign in to comment.