From db07df155730a61c468b105195bbebf5ffb963e5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 16 May 2022 12:35:39 -0700 Subject: [PATCH] Add more unit tests for `cudf::distinct` for nested types with sliced input (#10860) This adds more nested types tests for `cudf::distinct`, including cases of `List<Struct>` and `Struct<List>`, and the input columns are sliced. Partially addresses https://github.com/rapidsai/cudf/issues/10742. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10860 --- .../stream_compaction/distinct_tests.cpp | 148 +++++++++++++++++- 1 file changed, 142 insertions(+), 6 deletions(-) diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 5ce39b42fea..1c3e07dad2d 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -14,6 +14,13 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include + #include #include #include @@ -21,12 +28,6 @@ #include #include -#include -#include -#include -#include -#include - #include #include @@ -111,6 +112,35 @@ TEST_F(Distinct, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } +TEST_F(Distinct, SlicedNonNullTable) +{ + using int32s_col = cudf::test::fixed_width_column_wrapper; + using floats_col = cudf::test::fixed_width_column_wrapper; + auto constexpr dont_care = int32_t{0}; + + auto const col1 = int32s_col{dont_care, dont_care, 6, 6, 3, 5, 8, 5, dont_care}; + auto const col2 = floats_col{dont_care, dont_care, 6, 6, 3, 4, 9, 4, dont_care}; + auto const col1_key = int32s_col{dont_care, dont_care, 20, 20, 20, 19, 21, 9, dont_care}; + auto const col2_key = int32s_col{dont_care, dont_care, 19, 19, 20, 20, 9, 21, dont_care}; + + auto const input_original = cudf::table_view{{col1, col2, col1_key, col2_key}}; + auto const input = cudf::slice(input_original, {2, 8})[0]; + auto const keys = std::vector{2, 3}; + + // The expected table would be sorted in ascending order with respect to keys. 
+ auto const exp_col1 = int32s_col{{5, 5, 6, 3, 8}}; + auto const exp_col2 = floats_col{{4, 4, 6, 3, 9}}; + auto const exp_col1_key = int32s_col{{9, 19, 20, 20, 21}}; + auto const exp_col2_key = int32s_col{{21, 20, 19, 20, 9}}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto const result = cudf::distinct(input, keys); + auto const key_view = result->select(keys.begin(), keys.end()); + auto const sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); +} + TEST_F(Distinct, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; @@ -160,6 +190,28 @@ TEST_F(Distinct, BasicList) CUDF_TEST_EXPECT_TABLES_EQUAL(expect, *sorted_result); } +TEST_F(Distinct, BasicSlicedLists) +{ + using int32s_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + auto constexpr dont_care = int32_t{0}; + + auto const idx = int32s_col{dont_care, dont_care, 1, 2, 1, 3, 4, 5, 5, 6, 4, 4, dont_care}; + auto const col = lists_col{ + {0, 0}, {0, 0}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}, {5, 5}}; + auto const input_original = cudf::table_view({idx, col}); + auto const input = cudf::slice(input_original, {2, 12})[0]; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_val = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view({exp_idx, exp_val}); + + auto const result = cudf::distinct(input, {1}); + auto const sorted_result = cudf::sort_by_key(*result, result->select({0})); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *sorted_result); +} + TEST_F(Distinct, NullableList) { using LCW = cudf::test::lists_column_wrapper; @@ -245,6 +297,66 @@ TEST_F(Distinct, ListOfStruct) CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *sorted_result); } +TEST_F(Distinct, SlicedListsOfStructs) +{ + // Constructing a list of 
struct of two elements + // 0. [] == <- Don't care + // 1. [] != <- Don't care + // 2. Null == <- Don't care + // 3. Null != <- Don't care + // 4. [Null, Null] != <- Don't care + // 5. [Null] == <- Don't care + // 6. [Null] == <- Don't care + // 7. [Null] != <- Don't care + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == <- Don't care + // 16. [{Null, 'b'}] <- Don't care + + using int32s_col = cudf::test::fixed_width_column_wrapper; + using strings_col = cudf::test::strings_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const structs = [] { + auto child1 = + int32s_col{{-1, -1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 1, 2}, nulls_at({5, 16, 17})}; + auto child2 = strings_col{ + {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"}, + nulls_at({5, 14, 15})}; + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const lists_nullmask = std::vector{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const nullmask_buf = + cudf::test::detail::make_null_mask(lists_nullmask.begin(), lists_nullmask.end()); + auto const lists = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(nullmask_buf.data()), + cudf::UNKNOWN_NULL_COUNT, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; + auto const input_original = cudf::table_view({idx, lists}); + auto const input = cudf::slice(input_original, {8, 15})[0]; + + auto const result = cudf::distinct(input, {1}); + auto const sorted_result = cudf::sort_by_key(*result, result->select({0})); + + auto const exp_map = 
cudf::test::fixed_width_column_wrapper{8, 9, 10, 11, 13}; + auto const expected_table = cudf::gather(input_original, exp_map); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expected_table, *sorted_result); +} + TEST_F(Distinct, StructOfStruct) { using FWCW = cudf::test::fixed_width_column_wrapper; @@ -303,6 +415,30 @@ TEST_F(Distinct, StructOfStruct) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_expect->get_column(1), sorted_sliced_result->get_column(1)); } +TEST_F(Distinct, SlicedStructsOfLists) +{ + using lists_col = cudf::test::lists_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const structs = [] { + auto child = lists_col{ + {0, 0}, {0, 0}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}, {5, 5}}; + return structs_col{{child}}; + }(); + + auto const input_original = cudf::table_view({structs}); + auto const input = cudf::slice(input_original, {2, 12})[0]; + + auto const expected_structs = [] { + auto child = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + return structs_col{{child}}; + }(); + auto const expected = cudf::table_view({expected_structs}); + + auto const result = cudf::distinct(input, {0}); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + TEST_F(Distinct, StructWithNullElement) { using FWCW = cudf::test::fixed_width_column_wrapper;