From e978cbccfc812922ae407faa660564f8573904fe Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Tue, 15 Mar 2022 19:15:55 +0800 Subject: [PATCH 1/4] fix null setting of drop_list_duplicates on nested structs Signed-off-by: sperlingxx --- cpp/src/lists/drop_list_duplicates.cu | 10 +- .../lists/drop_list_duplicates_tests.cpp | 106 +++++++++++++++++- 2 files changed, 109 insertions(+), 7 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 7d391578428..19845c4d536 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -354,7 +354,6 @@ struct get_indices_of_unique_entries_dispatch { size_type*, null_equality, nan_equality, - bool, duplicate_keep_option, rmm::cuda_stream_view) const { @@ -370,7 +369,6 @@ struct get_indices_of_unique_entries_dispatch { size_type* output_begin, null_equality nulls_equal, nan_equality nans_equal, - bool has_nulls, duplicate_keep_option keep_option, rmm::cuda_stream_view stream) const noexcept { @@ -379,7 +377,7 @@ struct get_indices_of_unique_entries_dispatch { *d_view, *d_view, nulls_equal, - has_nulls, + all_lists_entries.has_nulls(), nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -396,19 +394,20 @@ struct get_indices_of_unique_entries_dispatch { size_type* output_begin, null_equality nulls_equal, nan_equality nans_equal, - bool has_nulls, duplicate_keep_option keep_option, rmm::cuda_stream_view stream) const noexcept { auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); auto const dview_ptr = table_device_view::create(flattened_entries, stream); + // search through children of all levels for null values + bool nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); auto const comp = table_row_comparator_fn{list_indices, *dview_ptr, *dview_ptr, nulls_equal, - has_nulls, + nested_has_nulls, nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), @@ -447,7 +446,6 @@ std::vector> get_unique_entries_and_list_indices( output_begin, nulls_equal, nans_equal, - keys_entries.has_nulls(), keep_option, stream); diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 8efcf8886ae..1d66275ff17 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -734,6 +734,110 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) } } +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level + auto constexpr YYY = int32_t{0}; // nulls at the parent structs column level + auto constexpr null = int32_t{0}; // nulls at the children columns level + + auto const get_nested_structs = [] { + auto grandchild1 = ColWrapper{{ + 1, XXX, null, XXX, YYY, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto grandchild2 = StringsCol{{ + // begin list1 + "Banana", + "YYY", /*NULL*/ + "Apple", + "XXX", /*NULL*/ + "YYY", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; + return StructsCol{{child1}}; + }; + + auto const get_nested_struct_expected = [] { + auto grandchild1 = + ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, + nulls_at({3, 4, 10, 17, 18})}; + auto grandchild2 = StringsCol{{ + // begin list1 + "Banana", + "Cherry", + "Kiwi", + "Apple", + "XXX" /*NULL*/, // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", + "" /*NULL*/, // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÉÉÉÉÉ" // end list3 + }, + nulls_at({4, 10, 16})}; + auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})}; + return StructsCol{{child1}}; + }; + + // Test full columns. + { + auto const lists = cudf::make_lists_column( + 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = cudf::make_lists_column( + 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) { auto const h_child = std::vector{ From 54ddf70786d8acc59d9744733c99c5d2c2d97417 Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Wed, 16 Mar 2022 10:20:33 +0800 Subject: [PATCH 2/4] Update cpp/src/lists/drop_list_duplicates.cu Co-authored-by: Nghia Truong --- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 19845c4d536..e3c47649617 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -400,8 +400,8 @@ struct get_indices_of_unique_entries_dispatch { auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); auto const dview_ptr = table_device_view::create(flattened_entries, stream); - // search through children of all levels for null values - bool nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); + // Search through children of all levels for nulls. + auto const nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); auto const comp = table_row_comparator_fn{list_indices, *dview_ptr, From 960dd7af075b17d0fce424866de3dbfdaa90c8aa Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Wed, 16 Mar 2022 10:41:35 +0800 Subject: [PATCH 3/4] fix typo Signed-off-by: sperlingxx --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 1d66275ff17..3f9c4c9cca7 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -737,9 +737,12 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) { using ColWrapper = cudf::test::fixed_width_column_wrapper; - auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level - auto constexpr YYY = int32_t{0}; // nulls at the parent structs column level auto constexpr null = int32_t{0}; // nulls at the children columns level + // XXX and YY are int placeholders for nulls at parent structs column level. + // We bring up two placeholders of different values to create intra null structs with + // children of different values, so as to test whether null_equality::EQUAL works or not. + auto constexpr XXX = int32_t{5}; + auto constexpr YYY = int32_t{6}; auto const get_nested_structs = [] { auto grandchild1 = ColWrapper{{ From 917d20ebec0835c5a75383fec2d489cb1299c08d Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Wed, 16 Mar 2022 11:32:00 +0800 Subject: [PATCH 4/4] Update cpp/tests/lists/drop_list_duplicates_tests.cpp Co-authored-by: Nghia Truong --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 3f9c4c9cca7..945d138c789 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -738,7 +738,7 @@ TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) { using ColWrapper = cudf::test::fixed_width_column_wrapper; auto constexpr null = int32_t{0}; // nulls at the children columns level - // XXX and YY are int placeholders for nulls at parent structs column level. + // XXX and YYY are int placeholders for nulls at parent structs column level. // We bring up two placeholders of different values to create intra null structs with // children of different values, so as to test whether null_equality::EQUAL works or not. auto constexpr XXX = int32_t{5};