From dd5eecd4d726915b2a28c42360419535a01b87de Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 24 May 2021 15:54:27 -0600 Subject: [PATCH] Fix struct binary search and struct flattening (#8268) This PR fixes several bugs. In particular: * Fixes a bug in struct binary search that only check for null elements at the top level. * Fixes a bug in struct flattening that uses default null order for the children of the input structs columns. Unit tests for struct binary search and struct sorting are also rewritten. Closes #8189. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jake Hemstad (https://github.com/jrhemstad) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8268 --- cpp/include/cudf/table/table_view.hpp | 14 +++++- cpp/src/search/search.cu | 2 +- cpp/src/structs/utilities.cpp | 6 +-- cpp/tests/search/search_struct_test.cpp | 20 ++++---- cpp/tests/sort/sort_test.cpp | 61 ++++++++++++++----------- 5 files changed, 59 insertions(+), 44 deletions(-) diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a225e590f9a..1ff701c3b01 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -257,9 +257,19 @@ class mutable_table_view : public detail::table_view_base { mutable_table_view(std::vector const& views); }; -inline bool has_nulls(table_view view) +inline bool has_nulls(table_view const& view) { - return std::any_of(view.begin(), view.end(), [](column_view col) { return col.has_nulls(); }); + return std::any_of(view.begin(), view.end(), [](auto const& col) { return col.has_nulls(); }); +} + +inline bool has_nested_nulls(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.has_nulls() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nulls(table_view{{child_col}}); + }); + }); } /** diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index cc4f0727a77..fbb89bec731 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -101,7 +101,7 @@ std::unique_ptr search_ordered(table_view const& t, auto const matched = dictionary::detail::match_dictionaries({t, values}, stream); // Prepare to flatten the structs column - auto const has_null_elements = has_nulls(t) or has_nulls(values); + auto const has_null_elements = has_nested_nulls(t) or has_nested_nulls(values); auto const flatten_nullability = has_null_elements ? structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 6cc537d2042..4f7795bad7a 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -107,13 +107,11 @@ struct flattened_table { for (decltype(col.num_children()) i = 0; i < col.num_children(); ++i) { auto const& child = col.get_sliced_child(i); if (child.type().id() == type_id::STRUCT) { - flatten_struct_column(structs_column_view{child}, col_order, null_order::BEFORE); - // default spark behaviour is null_order::BEFORE + flatten_struct_column(structs_column_view{child}, col_order, col_null_order); } else { flat_columns.push_back(child); if (not column_order.empty()) flat_column_order.push_back(col_order); - if (not null_precedence.empty()) flat_null_precedence.push_back(null_order::BEFORE); - // default spark behaviour is null_order::BEFORE + if (not null_precedence.empty()) flat_null_precedence.push_back(col_null_order); } } } diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index fbd8b283a42..1c2e9b02f05 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -167,8 +167,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{1, 0, 10, 10, 2, 10}; - expected_upper_bound = int32s_col{2, 0, 10, 11, 6, 10}; + expected_lower_bound = int32s_col{1, 9, 9, 10, 2, 9}; + expected_upper_bound = int32s_col{2, 10, 9, 11, 6, 9}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); @@ -187,8 +187,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{7, 11, 0, 0, 3, 0}; - expected_upper_bound = int32s_col{8, 11, 0, 0, 7, 0}; + expected_lower_bound = int32s_col{7, 0, 0, 0, 3, 0}; + expected_upper_bound = int32s_col{8, 0, 0, 0, 7, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); } @@ -214,8 +214,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) // Sorted asc, search nulls last results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{3, 0, 11, 11, 4, 11}; - expected_upper_bound = int32s_col{4, 0, 11, 11, 8, 11}; + expected_lower_bound = int32s_col{3, 11, 11, 11, 4, 11}; + expected_upper_bound = int32s_col{4, 11, 11, 11, 8, 11}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); @@ -232,8 +232,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) // Sorted dsc, search nulls last results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{7, 11, 0, 0, 3, 0}; - expected_upper_bound = int32s_col{8, 11, 0, 0, 7, 0}; + expected_lower_bound = int32s_col{7, 0, 0, 0, 3, 0}; + expected_upper_bound = int32s_col{8, 0, 0, 0, 7, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); } @@ -261,8 +261,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{1, 0, 10, 0, 2, 10}; - expected_upper_bound = int32s_col{2, 1, 10, 1, 6, 10}; + expected_lower_bound = int32s_col{1, 0, 9, 0, 2, 9}; + expected_upper_bound = int32s_col{2, 1, 9, 1, 6, 9}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 9eb082c513c..0f4688119b7 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -50,11 +50,16 @@ void run_sort_test(table_view input, CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort_by_key_table->view(), got_sort_by_key_table->view()); } +using TestTypes = cudf::test::Concat; + template struct Sort : public BaseFixture { }; -TYPED_TEST_CASE(Sort, NumericTypes); +TYPED_TEST_CASE(Sort, TestTypes); TYPED_TEST(Sort, WithNullMax) { @@ -461,14 +466,14 @@ TYPED_TEST(Sort, WithStructColumnCombinations) +------------+ +------------+ +------------+ +------------+ | s| | s| | s| | s| +------------+ +------------+ +------------+ +------------+ - 2 | null| 1 | {1, null}| 2 | null| 3 |{null, null}| - 4 | null| 0 | {0, null}| 4 | null| 5 |{null, null}| - 1 | {1, null}| 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| - 0 | {0, null}| 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| - 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| 0 | {0, null}| - 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| 1 | {1, null}| - 3 |{null, null}| 2 | null| 0 | {0, null}| 2 | null| - 5 |{null, null}| 4 | null| 1 | {1, null}| 4 | null| + 2 | null| 1 | {1, null}| 2 | null| 0 | {0, null}| + 4 | null| 0 | {0, null}| 4 | null| 1 | {1, null}| + 3 |{null, null}| 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| + 5 |{null, null}| 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| + 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| 3 |{null, null}| + 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| 5 |{null, null}| + 1 | {1, null}| 2 | null| 0 | {0, null}| 2 | null| + 0 | {0, null}| 4 | null| 1 | {1, null}| 4 | null| +------------+ +------------+ +------------+ +------------+ */ // clang-format on @@ -477,7 +482,7 @@ TYPED_TEST(Sort, WithStructColumnCombinations) std::vector column_order1{order::DESCENDING}; // desc_nulls_first - fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + fixed_width_column_wrapper expected1{{2, 4, 3, 5, 6, 7, 1, 0}}; auto got = sorted_order(input, column_order1, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); // Run test for sort and sort_by_key @@ -499,7 +504,7 @@ TYPED_TEST(Sort, WithStructColumnCombinations) run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); // asce_nulls_last - fixed_width_column_wrapper expected4{{3, 5, 7, 6, 0, 1, 2, 4}}; + fixed_width_column_wrapper expected4{{0, 1, 7, 6, 3, 5, 2, 4}}; got = sorted_order(input, column_order2, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); // Run test for sort and sort_by_key @@ -534,14 +539,14 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) +------------+ +------------+ | s| | s| +------------+ +------------+ - 2 | {9, 9}| 3 |{null, null}| - 4 | {9, 9}| 5 |{null, null}| - 1 | {1, null}| 7 | {null, 0}| - 0 | {0, null}| 6 | {null, 1}| - 6 | {null, 1}| 0 | {0, null}| - 7 | {null, 0}| 1 | {1, null}| - 3 |{null, null}| 2 | {9, 9}| - 5 |{null, null}| 4 | {9, 9}| + 3 |{null, null}| 0 | {0, null}| + 5 |{null, null}| 1 | {1, null}| + 6 | {null, 1}| 2 | {9, 9}| + 7 | {null, 0}| 4 | {9, 9}| + 2 | {9, 9}| 7 | {null, 0}| + 4 | {9, 9}| 6 | {null, 1}| + 1 | {1, null}| 3 |{null, null}| + 0 | {0, null}| 5 |{null, null}| +------------+ +------------+ */ // clang-format on @@ -550,31 +555,33 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) std::vector column_order{order::DESCENDING}; // desc_nulls_first - fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + fixed_width_column_wrapper expected1{{3, 5, 6, 7, 2, 4, 1, 0}}; auto got = sorted_order(input, column_order, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); // Run test for sort and sort_by_key run_sort_test(input, expected1, column_order, {null_order::AFTER}); // desc_nulls_last + fixed_width_column_wrapper expected2{{2, 4, 1, 0, 6, 7, 3, 5}}; got = sorted_order(input, column_order, {null_order::BEFORE}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected1, column_order, {null_order::BEFORE}); + run_sort_test(input, expected2, column_order, {null_order::BEFORE}); // asce_nulls_first std::vector column_order2{order::ASCENDING}; - fixed_width_column_wrapper expected2{{3, 5, 7, 6, 0, 1, 2, 4}}; + fixed_width_column_wrapper expected3{{3, 5, 7, 6, 0, 1, 2, 4}}; got = sorted_order(input, column_order2, {null_order::BEFORE}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected2, column_order2, {null_order::BEFORE}); + run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); // asce_nulls_last + fixed_width_column_wrapper expected4{{0, 1, 2, 4, 7, 6, 3, 5}}; got = sorted_order(input, column_order2, {null_order::AFTER}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected2, column_order2, {null_order::AFTER}); + run_sort_test(input, expected4, column_order2, {null_order::AFTER}); } TYPED_TEST(Sort, Stable)