From 87180cefc060010c77e6988d3811428feaa89dbe Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 17 Mar 2022 10:36:57 +0800 Subject: [PATCH] Fix has_null predicate for drop_list_duplicates on nested structs (#10436) For now, `get_indices_of_unique_entries_dispatch` only checks the outest layer of structures to determine whether the column contains null or not. It leads to the omission of intra null structures. In current, `[Struct(Struct(1, "a", valid)), Struct(Struct(1, "a", valid), Struct(Struct(2, "a", invalid), Struct(Struct(1, "b", invalid)]` becomes `[Struct(Struct(1, "a", valid)), Struct(Struct(2, "a", invalid), Struct(Struct(1, "b", invalid)]` after droping list duplicates even if null equality is EQUAL. However, it should be `[Struct(Struct(1, "a", valid)), Struct(Struct(2, "a", invalid)]`, because `Struct(1, "a")` and `Struct(2, "a")` are both invalid, despite they contain different elements. Current PR is to fix above problem by checking all flattened coulumns for nulls. Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10436 --- cpp/src/lists/drop_list_duplicates.cu | 10 +- .../lists/drop_list_duplicates_tests.cpp | 109 +++++++++++++++++- 2 files changed, 112 insertions(+), 7 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 7d391578428..e3c47649617 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -354,7 +354,6 @@ struct get_indices_of_unique_entries_dispatch { size_type*, null_equality, nan_equality, - bool, duplicate_keep_option, rmm::cuda_stream_view) const { @@ -370,7 +369,6 @@ struct get_indices_of_unique_entries_dispatch { size_type* output_begin, null_equality nulls_equal, nan_equality nans_equal, - bool has_nulls, duplicate_keep_option keep_option, rmm::cuda_stream_view stream) const noexcept { @@ -379,7 +377,7 @@ struct get_indices_of_unique_entries_dispatch { *d_view, *d_view, nulls_equal, - has_nulls, + all_lists_entries.has_nulls(), nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -396,19 +394,20 @@ struct get_indices_of_unique_entries_dispatch { size_type* output_begin, null_equality nulls_equal, nan_equality nans_equal, - bool has_nulls, duplicate_keep_option keep_option, rmm::cuda_stream_view stream) const noexcept { auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); auto const dview_ptr = table_device_view::create(flattened_entries, stream); + // Search through children of all levels for nulls. + auto const nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); auto const comp = table_row_comparator_fn{list_indices, *dview_ptr, *dview_ptr, nulls_equal, - has_nulls, + nested_has_nulls, nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -447,7 +446,6 @@ std::vector> get_unique_entries_and_list_indices( output_begin, nulls_equal, nans_equal, - keys_entries.has_nulls(), keep_option, stream); diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 8efcf8886ae..945d138c789 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -734,6 +734,113 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) } } +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + auto constexpr null = int32_t{0}; // nulls at the children columns level + // XXX and YYY are int placeholders for nulls at parent structs column level. + // We bring up two placeholders of different values to create intra null structs with + // children of different values, so as to test whether null_equality::EQUAL works or not. + auto constexpr XXX = int32_t{5}; + auto constexpr YYY = int32_t{6}; + + auto const get_nested_structs = [] { + auto grandchild1 = ColWrapper{{ + 1, XXX, null, XXX, YYY, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto grandchild2 = StringsCol{{ + // begin list1 + "Banana", + "YYY", /*NULL*/ + "Apple", + "XXX", /*NULL*/ + "YYY", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; + return StructsCol{{child1}}; + }; + + auto const get_nested_struct_expected = [] { + auto grandchild1 = + ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, + nulls_at({3, 4, 10, 17, 18})}; + auto grandchild2 = StringsCol{{ + // begin list1 + "Banana", + "Cherry", + "Kiwi", + "Apple", + "XXX" /*NULL*/, // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", + "" /*NULL*/, // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÉÉÉÉÉ" // end list3 + }, + nulls_at({4, 10, 16})}; + auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})}; + return StructsCol{{child1}}; + }; + + // Test full columns. + { + auto const lists = cudf::make_lists_column( + 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = cudf::make_lists_column( + 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) { auto const h_child = std::vector{