From 0818370236787722105d771f7ae23dd6ea529e96 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 14:19:21 -0700 Subject: [PATCH 01/18] Complete implementation --- .../cudf/table/experimental/row_operators.cuh | 29 ++++++++++++--- cpp/src/table/row_operators.cu | 37 ++++++++++++++++--- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 846bebfe10b..0e125ce34f6 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -330,8 +330,8 @@ class device_row_comparator { null_order null_precedence = null_order::BEFORE, int depth = 0, PhysicalElementComparator comparator = {}, - optional_dremel_view l_dremel_device_view = {}, - optional_dremel_view r_dremel_device_view = {}) + optional_dremel_view l_dremel_device_view = thrust::nullopt, + optional_dremel_view r_dremel_device_view = thrust::nullopt) : _lhs{lhs}, _rhs{rhs}, _check_nulls{check_nulls}, @@ -408,7 +408,14 @@ class device_row_comparator { return cudf::type_dispatcher( lcol.type(), - element_comparator{_check_nulls, lcol, rcol, _null_precedence, depth, _comparator}, + element_comparator{_check_nulls, + lcol, + rcol, + _null_precedence, + depth, + _comparator, + _l_dremel_device_view, + _r_dremel_device_view}, lhs_element_index, rhs_element_index); } @@ -561,9 +568,21 @@ class device_row_comparator { auto idx = list_column_index++; return std::make_tuple(optional_dremel_view(_l_dremel[idx]), optional_dremel_view(_r_dremel[idx])); - } else { - return std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + } else if (_lhs.column(i).type().id() == type_id::STRUCT && + _lhs.column(i).num_child_columns() > 0) { + // Any structs column has already been processed to have only 1 child. + auto child = _lhs.column(i).child(0); + while (child.type().id() == type_id::STRUCT && child.num_child_columns() > 0) { + child = child.child(0); + } + if (child.type().id() == type_id::LIST) { + auto idx = list_column_index++; + + return std::make_tuple(optional_dremel_view(_l_dremel[idx]), + optional_dremel_view(_r_dremel[idx])); + } } + return std::make_tuple(thrust::nullopt, thrust::nullopt); }(); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 6997de18be5..dc4716dfab0 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -180,8 +180,9 @@ auto decompose_structs(table_view table, c->children[lists_column_view::child_column_index].get(), branch, depth + 1); } else if (c->type().id() == type_id::STRUCT) { for (size_t child_idx = 0; child_idx < c->children.size(); ++child_idx) { - if (child_idx > 0) { - verticalized_col_depths.push_back(depth + 1); + if (child_idx > 0 || + (child_idx == 0 && c->children[0]->type().id() == type_id::LIST)) { + verticalized_col_depths.push_back(child_idx == 0 ? depth : depth + 1); branch = &flattened.emplace_back(); } recursive_child(c->children[child_idx].get(), branch, depth + 1); @@ -194,6 +195,18 @@ auto decompose_structs(table_view table, for (auto const& branch : flattened) { column_view temp_col = *branch.back(); + + if (temp_col.type().id() == type_id::STRUCT && + (temp_col.num_children() > 0 && temp_col.child(0).type().id() == type_id::LIST)) { + temp_col = column_view(temp_col.type(), + temp_col.size(), + temp_col.head(), + temp_col.null_mask(), + UNKNOWN_NULL_COUNT, + temp_col.offset(), + {}); + } + for (auto it = branch.crbegin() + 1; it < branch.crend(); ++it) { auto const& prev_col = *(*it); auto children = @@ -260,7 +273,7 @@ auto decompose_structs(table_view table, * This helper function generates dremel data for any list-type columns in a * table. This data is necessary for lexicographic comparisons. */ -auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) +auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; std::vector dremel_device_views; @@ -268,6 +281,22 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); + } else if (col.type().id() == type_id::STRUCT) { + if (col.num_children() == 0) { continue; } + CUDF_EXPECTS(col.num_children() == 1, + "Structs column should be processed to have either 0 or 1 child"); + + auto child = col.child(0); + while (child.type().id() == type_id::STRUCT) { + if (child.num_children() == 0) { break; } + CUDF_EXPECTS(child.num_children() == 1, + "Structs column should be processed to have either 0 or 1 child"); + child = child.child(0); + } + if (child.type().id() == type_id::LIST) { + dremel_data.push_back(detail::get_comparator_data(child, {}, false, stream)); + dremel_device_views.push_back(dremel_data.back()); + } } } auto d_dremel_device_views = detail::make_device_uvector_sync( @@ -293,8 +322,6 @@ void check_lex_compatibility(table_view const& input) check_column(list_col.child()); } else if (c.type().id() == type_id::STRUCT) { for (auto child = c.child_begin(); child < c.child_end(); ++child) { - CUDF_EXPECTS(child->type().id() != type_id::LIST, - "Cannot lexicographic compare a table with a STRUCT of LIST column"); check_column(*child); } } From 1667b27aec8a9ae0fe12c145e4dc448658971baa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 14:47:21 -0700 Subject: [PATCH 02/18] Fix existing test --- cpp/tests/groupby/structs_tests.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index a7bc6016307..57c93db50b5 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -294,11 +294,12 @@ TYPED_TEST(groupby_structs_test, all_null_input) test_sum_agg(keys, values, expected_keys, expected_values); } -TYPED_TEST(groupby_structs_test, lists_are_unsupported) +TYPED_TEST(groupby_structs_test, lists_as_keys) { using V = int32_t; // Type of Aggregation Column. using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member. using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + using R = cudf::detail::target_type_t; // clang-format off auto values = fwcw { 0, 1, 2, 3, 4 }; @@ -307,5 +308,12 @@ TYPED_TEST(groupby_structs_test, lists_are_unsupported) // clang-format on auto keys = cudf::test::structs_column_wrapper{{member_0, member_1}}; - EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error); + // clang-format off + auto expected_values = fwcw { 3, 5, 2 }; + auto expected_member_0 = lcw { {1,1}, {2,2}, {3,3} }; + auto expected_member_1 = fwcw{ 1, 2, 3 }; + // clang-format on + auto expected_keys = cudf::test::structs_column_wrapper{{expected_member_0, expected_member_1}}; + + test_sum_agg(keys, values, expected_keys, expected_values); } From e2cc0ab2c1b91a0f1e4dbe5230603f13195a3568 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 14:57:12 -0700 Subject: [PATCH 03/18] Fix test utilities --- cpp/tests/utilities/column_utilities.cu | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 3a94aac1cc9..133ca99b31f 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -543,7 +543,7 @@ struct column_comparator_impl { auto const comparator = cudf::experimental::row::equality::two_table_comparator{ lhs_tview, rhs_tview, cudf::get_default_stream()}; - auto const has_nulls = cudf::has_nested_nulls(lhs_tview) or cudf::has_nested_nulls(rhs_tview); + auto const has_nulls = cudf::has_nulls(lhs_tview) or cudf::has_nulls(rhs_tview); auto const device_comparator = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}); @@ -556,18 +556,22 @@ struct column_comparator_impl { lhs_row_indices.size(), cudf::get_default_stream()); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); + auto diff_map = rmm::device_uvector(lhs_row_indices.size(), cudf::get_default_stream()); + thrust::transform( rmm::exec_policy(cudf::get_default_stream()), input_iter, input_iter + lhs_row_indices.size(), - differences.begin(), + diff_map.begin(), ComparatorType( *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps, device_comparator, *d_lhs, *d_rhs)); - auto diff_iter = thrust::remove(rmm::exec_policy(cudf::get_default_stream()), - differences.begin(), - differences.end(), - 0); // remove the zero entries + auto diff_iter = thrust::copy_if(rmm::exec_policy(cudf::get_default_stream()), + input_iter, + input_iter + lhs_row_indices.size(), + diff_map.begin(), + differences.begin(), + thrust::identity{}); differences.resize(thrust::distance(differences.begin(), diff_iter), cudf::get_default_stream()); // shrink back down From 64c6605655beb18f9af2b4c61de5b8f927586cf5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 14:57:45 -0700 Subject: [PATCH 04/18] Add tests --- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/sort/sort_nested_types_tests.cpp | 181 +++++++++++++++++++++ 2 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 cpp/tests/sort/sort_nested_types_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bd4077aff4e..6df646b8cae 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -295,8 +295,8 @@ endif() # ################################################################################################## # * sort tests ------------------------------------------------------------------------------------ ConfigureTest( - SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/stable_sort_tests.cpp - sort/rank_test.cpp + SORT_TEST sort/segmented_sort_tests.cpp sort/sort_nested_types_tests.cpp sort/sort_test.cpp + sort/stable_sort_tests.cpp sort/rank_test.cpp GPUS 1 PERCENT 70 ) diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp new file mode 100644 index 00000000000..f7b52940d30 --- /dev/null +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +using int32s_lists = cudf::test::lists_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +using namespace cudf::test::iterators; + +constexpr auto null{0}; + +struct NestedStructTest : public cudf::test::BaseFixture { +}; + +TEST_F(NestedStructTest, SimpleStructsOfListsNoNulls) +{ + auto const input = [] { + auto child = int32s_lists{{4, 2, 0}, {2}, {0, 5}, {1, 5}, {4, 1}}; + return structs_col{{child}}; + }(); + + { + auto const expected_order = int32s_col{2, 3, 1, 4, 0}; + auto const order = cudf::sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{0, 4, 1, 3, 2}; + auto const order = cudf::sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, SimpleStructsOfListsWithNulls) +{ + auto const input = [] { + auto child = + int32s_lists{{{4, 2, null}, null_at(2)}, {2}, {{null, 5}, null_at(0)}, {0, 5}, {4, 1}}; + return structs_col{{child}}; + }(); + + { + auto const expected_order = int32s_col{2, 3, 1, 4, 0}; + auto const order = cudf::sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{0, 4, 1, 3, 2}; + auto const order = cudf::sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, StructsHaveListsNoNulls) +{ + // Input has equal elements, thus needs to tested by stable sort. + auto const input = [] { + auto child0 = int32s_lists{{4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child1 = int32s_col{1, 2, 5, 0, 3, 3, 4}; + return structs_col{{child0, child1}}; + }(); + + { + auto const expected_order = int32s_col{1, 5, 6, 4, 3, 0, 2}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{2, 0, 3, 4, 1, 5, 6}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, StructsHaveListsWithNulls) +{ + // Input has equal elements, thus needs to tested by stable sort. + auto const input = [] { + auto child0 = + int32s_lists{{{4, 2, null}, null_at(2)}, {}, {} /*NULL*/, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child1 = int32s_col{{1, 2, null, 5, null, 3, 3, 4}, nulls_at({2, 4})}; + return structs_col{{child0, child1}, null_at(2)}; + }(); + + { + auto const expected_order = int32s_col{2, 1, 6, 7, 5, 4, 0, 3}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{3, 0, 4, 5, 1, 6, 7, 2}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, StructsOfStructsHaveListsNoNulls) +{ + // Input has equal elements, thus needs to tested by stable sort. + auto const input = [] { + auto child0 = [] { + auto child0 = int32s_lists{{4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child1 = int32s_col{1, 2, 5, 0, 3, 3, 4}; + return structs_col{{child0, child1}}; + }(); + auto child1 = int32s_lists{{4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child2 = int32s_col{1, 2, 5, 0, 3, 3, 4}; + return structs_col{{child0, child1, child2}}; + }(); + + { + auto const expected_order = int32s_col{1, 5, 6, 4, 3, 0, 2}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{2, 0, 3, 4, 6, 5, 1}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, StructsOfStructsHaveListsWithNulls) +{ + // Input has equal elements, thus needs to tested by stable sort. + auto const input = [] { + auto child0 = [] { + auto child0 = + int32s_lists{{{4, 2, null}, null_at(2)}, {}, {} /*NULL*/, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child1 = int32s_col{{1, 2, null, 5, null, 3, 3, 4}, nulls_at({2, 4})}; + return structs_col{{child0, child1}, null_at(2)}; + }(); + auto child1 = + int32s_lists{{{4, 2, null}, null_at(2)}, {}, {} /*NULL*/, {5}, {4, 1}, {4, 0}, {}, {}}; + auto child2 = int32s_col{{1, 2, null, 5, null, 3, 3, 4}, nulls_at({2, 4})}; + return structs_col{{child0, child1, child2}, null_at(2)}; + }(); + + { + auto const expected_order = int32s_col{2, 1, 6, 7, 5, 4, 0, 3}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{3, 0, 4, 5, 7, 6, 1, 2}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} From f75f9c5e147445d5e68810d243a6746775ea0737 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 18:29:28 -0700 Subject: [PATCH 05/18] Simplify code --- cpp/src/table/row_operators.cu | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index dc4716dfab0..b8fe3b17062 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -282,19 +282,15 @@ auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); } else if (col.type().id() == type_id::STRUCT) { - if (col.num_children() == 0) { continue; } - CUDF_EXPECTS(col.num_children() == 1, - "Structs column should be processed to have either 0 or 1 child"); - - auto child = col.child(0); - while (child.type().id() == type_id::STRUCT) { - if (child.num_children() == 0) { break; } - CUDF_EXPECTS(child.num_children() == 1, + auto col_iter = col; + while (col_iter.type().id() == type_id::STRUCT) { + if (col_iter.num_children() == 0) { break; } + CUDF_EXPECTS(col_iter.num_children() == 1, "Structs column should be processed to have either 0 or 1 child"); - child = child.child(0); + col_iter = col_iter.child(0); } - if (child.type().id() == type_id::LIST) { - dremel_data.push_back(detail::get_comparator_data(child, {}, false, stream)); + if (col_iter.type().id() == type_id::LIST) { + dremel_data.push_back(detail::get_comparator_data(col_iter, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); } } From 0c8a62a84a9d640b485864bd38cdca168595a4bc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 18:33:01 -0700 Subject: [PATCH 06/18] Remove redundant code --- cpp/src/table/row_operators.cu | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index b8fe3b17062..39fe5fcfc14 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -281,18 +281,6 @@ auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); - } else if (col.type().id() == type_id::STRUCT) { - auto col_iter = col; - while (col_iter.type().id() == type_id::STRUCT) { - if (col_iter.num_children() == 0) { break; } - CUDF_EXPECTS(col_iter.num_children() == 1, - "Structs column should be processed to have either 0 or 1 child"); - col_iter = col_iter.child(0); - } - if (col_iter.type().id() == type_id::LIST) { - dremel_data.push_back(detail::get_comparator_data(col_iter, {}, false, stream)); - dremel_device_views.push_back(dremel_data.back()); - } } } auto d_dremel_device_views = detail::make_device_uvector_sync( From 01acdafc94f0fdc13ab1ca21656ff34a325fe565 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 18:44:26 -0700 Subject: [PATCH 07/18] Revert all changes in `row_operators.cuh` --- .../cudf/table/experimental/row_operators.cuh | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 0e125ce34f6..846bebfe10b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -330,8 +330,8 @@ class device_row_comparator { null_order null_precedence = null_order::BEFORE, int depth = 0, PhysicalElementComparator comparator = {}, - optional_dremel_view l_dremel_device_view = thrust::nullopt, - optional_dremel_view r_dremel_device_view = thrust::nullopt) + optional_dremel_view l_dremel_device_view = {}, + optional_dremel_view r_dremel_device_view = {}) : _lhs{lhs}, _rhs{rhs}, _check_nulls{check_nulls}, @@ -408,14 +408,7 @@ class device_row_comparator { return cudf::type_dispatcher( lcol.type(), - element_comparator{_check_nulls, - lcol, - rcol, - _null_precedence, - depth, - _comparator, - _l_dremel_device_view, - _r_dremel_device_view}, + element_comparator{_check_nulls, lcol, rcol, _null_precedence, depth, _comparator}, lhs_element_index, rhs_element_index); } @@ -568,21 +561,9 @@ class device_row_comparator { auto idx = list_column_index++; return std::make_tuple(optional_dremel_view(_l_dremel[idx]), optional_dremel_view(_r_dremel[idx])); - } else if (_lhs.column(i).type().id() == type_id::STRUCT && - _lhs.column(i).num_child_columns() > 0) { - // Any structs column has already been processed to have only 1 child. - auto child = _lhs.column(i).child(0); - while (child.type().id() == type_id::STRUCT && child.num_child_columns() > 0) { - child = child.child(0); - } - if (child.type().id() == type_id::LIST) { - auto idx = list_column_index++; - - return std::make_tuple(optional_dremel_view(_l_dremel[idx]), - optional_dremel_view(_r_dremel[idx])); - } + } else { + return std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); } - return std::make_tuple(thrust::nullopt, thrust::nullopt); }(); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), From 93942560a049872c1700beae8178bf561a755f65 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 18:50:20 -0700 Subject: [PATCH 08/18] Fix typo --- cpp/tests/sort/sort_nested_types_tests.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index f7b52940d30..8702887f5e3 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -77,7 +77,7 @@ TEST_F(NestedStructTest, SimpleStructsOfListsWithNulls) TEST_F(NestedStructTest, StructsHaveListsNoNulls) { - // Input has equal elements, thus needs to tested by stable sort. + // Input has equal elements, thus needs to be tested by stable sort. auto const input = [] { auto child0 = int32s_lists{{4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}}; auto child1 = int32s_col{1, 2, 5, 0, 3, 3, 4}; @@ -100,7 +100,7 @@ TEST_F(NestedStructTest, StructsHaveListsNoNulls) TEST_F(NestedStructTest, StructsHaveListsWithNulls) { - // Input has equal elements, thus needs to tested by stable sort. + // Input has equal elements, thus needs to be tested by stable sort. auto const input = [] { auto child0 = int32s_lists{{{4, 2, null}, null_at(2)}, {}, {} /*NULL*/, {5}, {4, 1}, {4, 0}, {}, {}}; @@ -124,7 +124,7 @@ TEST_F(NestedStructTest, StructsHaveListsWithNulls) TEST_F(NestedStructTest, StructsOfStructsHaveListsNoNulls) { - // Input has equal elements, thus needs to tested by stable sort. + // Input has equal elements, thus needs to be tested by stable sort. auto const input = [] { auto child0 = [] { auto child0 = int32s_lists{{4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}}; @@ -152,7 +152,7 @@ TEST_F(NestedStructTest, StructsOfStructsHaveListsNoNulls) TEST_F(NestedStructTest, StructsOfStructsHaveListsWithNulls) { - // Input has equal elements, thus needs to tested by stable sort. + // Input has equal elements, thus needs to be tested by stable sort. auto const input = [] { auto child0 = [] { auto child0 = From 053722545c72475d95e8d2178207f914544978a1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 19:31:05 -0700 Subject: [PATCH 09/18] Update docs --- cpp/src/table/row_operators.cu | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 39fe5fcfc14..ecce9652589 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -85,13 +85,17 @@ table_view remove_struct_child_offsets(table_view table) /** * @brief Decompose all struct columns in a table * - * If a struct column is a tree with N leaves, then this function decomposes the tree into + * If a structs column is a tree with N leaves, then this function decomposes the tree into * N "linear trees" (branch factor == 1) and prunes common parents. Also returns a vector of * per-column `depth`s. * * A `depth` value is the number of nested levels as parent of the column in the original, * non-decomposed table, which are pruned during decomposition. * + * Special handling is needed in the cases of structs column having lists as its first child. In + * such situations, the function decomposes the tree of N leaves into N+1 linear trees in which the + * second tree is also leaf of the first tree extracted out. + * * For example, if the original table has a column `Struct, decimal>`, * * S1 @@ -113,7 +117,7 @@ table_view remove_struct_child_offsets(table_view table) * The depth of the first column is 0 because it contains all its parent levels, while the depth * of the second column is 2 because two of its parent struct levels were pruned. * - * Similarly, a struct column of type Struct> is decomposed as follows + * Similarly, a struct column of type `Struct>` is decomposed as follows * * S1 * / \ @@ -148,6 +152,10 @@ table_view remove_struct_child_offsets(table_view table) * The list parents are still needed to define the range of elements in the leaf that belong to the * same row. * + * In the case of structs column having its first child is a lists column such as + * `Struct, int>`, after decomposition we get three columns `Struct>`, + * `List`, and `int`. + * * @param table The table whose struct columns to decompose. * @param column_order The per-column order if using output with lexicographic comparison * @param null_precedence The per-column null precedence From 6ff8edecc95547744cde48dc2f0c64323bc05845 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 20:44:09 -0700 Subject: [PATCH 10/18] Add comments --- cpp/src/table/row_operators.cu | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index ecce9652589..75c0e0509a7 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -94,7 +94,8 @@ table_view remove_struct_child_offsets(table_view table) * * Special handling is needed in the cases of structs column having lists as its first child. In * such situations, the function decomposes the tree of N leaves into N+1 linear trees in which the - * second tree is also leaf of the first tree extracted out. + * second tree is also leaf of the first tree extracted out. This is to make sure there is no + * structs column having child lists column in the output. * * For example, if the original table has a column `Struct, decimal>`, * @@ -188,6 +189,13 @@ auto decompose_structs(table_view table, c->children[lists_column_view::child_column_index].get(), branch, depth + 1); } else if (c->type().id() == type_id::STRUCT) { for (size_t child_idx = 0; child_idx < c->children.size(); ++child_idx) { + // When child_idx == 0, we also cut off the current branch if its first child is a + // lists column. + // In such cases, the last column of the current branch will be `Struct` and + // it will be modified to empty struct type `Struct<>` later on. + // In addition, the new cut out branch will be set to have the same depth as the + // current branch instead of higher depth. By this way, we can avoid the new branch + // to be ignored in `device_row_comparator` when the current branch has zero child. if (child_idx > 0 || (child_idx == 0 && c->children[0]->type().id() == type_id::LIST)) { verticalized_col_depths.push_back(child_idx == 0 ? depth : depth + 1); @@ -204,6 +212,7 @@ auto decompose_structs(table_view table, for (auto const& branch : flattened) { column_view temp_col = *branch.back(); + // Change `Struct` into empty struct type `Struct<>`. if (temp_col.type().id() == type_id::STRUCT && (temp_col.num_children() > 0 && temp_col.child(0).type().id() == type_id::LIST)) { temp_col = column_view(temp_col.type(), From b806d14ab5c766e10f06082c196941a810c08833 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 20:45:38 -0700 Subject: [PATCH 11/18] Change docs --- cpp/src/table/row_operators.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 75c0e0509a7..2a49a47a9e3 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -94,8 +94,8 @@ table_view remove_struct_child_offsets(table_view table) * * Special handling is needed in the cases of structs column having lists as its first child. In * such situations, the function decomposes the tree of N leaves into N+1 linear trees in which the - * second tree is also leaf of the first tree extracted out. This is to make sure there is no - * structs column having child lists column in the output. + * second tree was generated by extracting out leaf of the first tree. This is to make sure there is + * no structs column having child lists column in the output. * * For example, if the original table has a column `Struct, decimal>`, * From 7c63726f9d17d6da846358b69025982758254272 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 20:58:06 -0700 Subject: [PATCH 12/18] Fix style --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6df646b8cae..04cf3f838b5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -296,7 +296,7 @@ endif() # * sort tests ------------------------------------------------------------------------------------ ConfigureTest( SORT_TEST sort/segmented_sort_tests.cpp sort/sort_nested_types_tests.cpp sort/sort_test.cpp - sort/stable_sort_tests.cpp sort/rank_test.cpp + sort/stable_sort_tests.cpp sort/rank_test.cpp GPUS 1 PERCENT 70 ) From 422f93c0ac42290e586ceaed17fab2fcef2c4983 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Mar 2023 21:32:32 -0700 Subject: [PATCH 13/18] Fix docs --- cpp/src/table/row_operators.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 2a49a47a9e3..30f8f89fd21 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -154,8 +154,8 @@ table_view remove_struct_child_offsets(table_view table) * same row. * * In the case of structs column having its first child is a lists column such as - * `Struct, int>`, after decomposition we get three columns `Struct>`, - * `List`, and `int`. + * `Struct, float>`, after decomposition we get three columns `Struct<>`, + * `List`, and `float`. * * @param table The table whose struct columns to decompose. * @param column_order The per-column order if using output with lexicographic comparison From 70abc3ca97452b98e675b80dbc481d55dc0e0e78 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 24 Mar 2023 10:25:43 -0700 Subject: [PATCH 14/18] Fix depth comparision --- cpp/include/cudf/table/experimental/row_operators.cuh | 2 +- cpp/src/table/row_operators.cu | 5 +---- cpp/tests/sort/sort_nested_types_tests.cpp | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 846bebfe10b..a549e725605 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -397,7 +397,7 @@ class device_row_comparator { } if (lcol.num_child_columns() == 0) { - return cuda::std::pair(weak_ordering::EQUIVALENT, depth); + return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits::max()); } // Non-empty structs have been modified to only have 1 child when using this. diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 30f8f89fd21..7be2b22ee6e 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -193,12 +193,9 @@ auto decompose_structs(table_view table, // lists column. // In such cases, the last column of the current branch will be `Struct` and // it will be modified to empty struct type `Struct<>` later on. - // In addition, the new cut out branch will be set to have the same depth as the - // current branch instead of higher depth. By this way, we can avoid the new branch - // to be ignored in `device_row_comparator` when the current branch has zero child. if (child_idx > 0 || (child_idx == 0 && c->children[0]->type().id() == type_id::LIST)) { - verticalized_col_depths.push_back(child_idx == 0 ? depth : depth + 1); + verticalized_col_depths.push_back(depth + 1); branch = &flattened.emplace_back(); } recursive_child(c->children[child_idx].get(), branch, depth + 1); diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index 8702887f5e3..f9235974167 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -91,7 +91,7 @@ TEST_F(NestedStructTest, StructsHaveListsNoNulls) } { - auto const expected_order = int32s_col{2, 0, 3, 4, 1, 5, 6}; + auto const expected_order = int32s_col{2, 0, 3, 4, 6, 5, 1}; auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); @@ -115,7 +115,7 @@ TEST_F(NestedStructTest, StructsHaveListsWithNulls) } { - auto const expected_order = int32s_col{3, 0, 4, 5, 1, 6, 7, 2}; + auto const expected_order = int32s_col{3, 0, 4, 5, 7, 6, 1, 2}; auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); From 104aeba799702ece8c063abd6e7b43d40d0d50da Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 7 Apr 2023 10:07:06 -0700 Subject: [PATCH 15/18] Add comment --- cpp/src/table/row_operators.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 7be2b22ee6e..528b1fe1f79 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -95,7 +95,9 @@ table_view remove_struct_child_offsets(table_view table) * Special handling is needed in the cases of structs column having lists as its first child. In * such situations, the function decomposes the tree of N leaves into N+1 linear trees in which the * second tree was generated by extracting out leaf of the first tree. This is to make sure there is - * no structs column having child lists column in the output. + * no structs column having child lists column in the output. Note that structs with lists children + * in subsequent positions do not require any special treatment because the struct parent will be + * pruned for all subsequent children. * * For example, if the original table has a column `Struct, decimal>`, * From 1b8369a5f1b13157e80d6609d264972079966f42 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 7 Apr 2023 10:21:13 -0700 Subject: [PATCH 16/18] Add tests with sliced input --- cpp/tests/sort/sort_nested_types_tests.cpp | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index f9235974167..4075a8456d8 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -122,6 +122,70 @@ TEST_F(NestedStructTest, StructsHaveListsWithNulls) } } +TEST_F(NestedStructTest, SlicedStructsHaveListsNoNulls) +{ + // Input has equal elements, thus needs to be tested by stable sort. + // The original input has 3 first elements repeated at the beginning and the end. + auto const input_original = [] { + auto child0 = int32s_lists{ + {4, 2, 0}, {}, {5}, {4, 2, 0}, {}, {5}, {4, 1}, {4, 0}, {}, {}, {4, 2, 0}, {}, {5}}; + auto child1 = int32s_col{1, 2, 5, 1, 2, 5, 0, 3, 3, 4, 1, 2, 5}; + return structs_col{{child0, child1}}; + }(); + + auto const input = cudf::slice(input_original, {3, 10})[0]; + + { + auto const expected_order = int32s_col{1, 5, 6, 4, 3, 0, 2}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{2, 0, 3, 4, 6, 5, 1}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + +TEST_F(NestedStructTest, SlicedStructsHaveListsWithNulls) +{ + // Input has equal elements, thus needs to be tested by stable sort. + // The original input has 2 first elements repeated at the beginning and the end. + auto const input_original = [] { + auto child0 = int32s_lists{{{4, 2, null}, null_at(2)}, + {}, + {{4, 2, null}, null_at(2)}, + {}, + {} /*NULL*/, + {5}, + {4, 1}, + {4, 0}, + {}, + {}, + {{4, 2, null}, null_at(2)}, + {}}; + auto child1 = int32s_col{{1, 2, 1, 2, null, 5, null, 3, 3, 4, 1, 2}, nulls_at({4, 6})}; + return structs_col{{child0, child1}, null_at(4)}; + }(); + + auto const input = cudf::slice(input_original, {2, 10})[0]; + + { + auto const expected_order = int32s_col{2, 1, 6, 7, 5, 4, 0, 3}; + auto const order = cudf::stable_sorted_order(cudf::table_view{{input}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } + + { + auto const expected_order = int32s_col{3, 0, 4, 5, 7, 6, 1, 2}; + auto const order = + cudf::stable_sorted_order(cudf::table_view{{input}}, {cudf::order::DESCENDING}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_order, order->view()); + } +} + TEST_F(NestedStructTest, StructsOfStructsHaveListsNoNulls) { // Input has equal elements, thus needs to be tested by stable sort. From 654a232db0c0fef6f1d2ceb7d595bdcfc04a6cca Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 7 Apr 2023 11:26:29 -0600 Subject: [PATCH 17/18] Update cpp/src/table/row_operators.cu Co-authored-by: Bradley Dice --- cpp/src/table/row_operators.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 528b1fe1f79..f1bcc3f7d3d 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -155,7 +155,7 @@ table_view remove_struct_child_offsets(table_view table) * The list parents are still needed to define the range of elements in the leaf that belong to the * same row. * - * In the case of structs column having its first child is a lists column such as + * In the case of structs column with a lists column as its first child such as * `Struct, float>`, after decomposition we get three columns `Struct<>`, * `List`, and `float`. * From 9d7921d80e47a043f9bdb43c62983f6558ddd51d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 7 Apr 2023 10:31:22 -0700 Subject: [PATCH 18/18] Optimize condition, and remove `UNKNOWN_NULL_COUNT` --- cpp/src/table/row_operators.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index f1bcc3f7d3d..94636f5e6ca 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -195,8 +195,7 @@ auto decompose_structs(table_view table, // lists column. // In such cases, the last column of the current branch will be `Struct` and // it will be modified to empty struct type `Struct<>` later on. - if (child_idx > 0 || - (child_idx == 0 && c->children[0]->type().id() == type_id::LIST)) { + if (child_idx > 0 || c->children[0]->type().id() == type_id::LIST) { verticalized_col_depths.push_back(depth + 1); branch = &flattened.emplace_back(); } @@ -218,7 +217,7 @@ auto decompose_structs(table_view table, temp_col.size(), temp_col.head(), temp_col.null_mask(), - UNKNOWN_NULL_COUNT, + temp_col.null_count(), temp_col.offset(), {}); } @@ -235,7 +234,7 @@ auto decompose_structs(table_view table, prev_col.size(), nullptr, prev_col.null_mask(), - UNKNOWN_NULL_COUNT, + prev_col.null_count(), prev_col.offset(), std::move(children)); } @@ -249,7 +248,7 @@ auto decompose_structs(table_view table, parent->size(), nullptr, // list has no data of its own nullptr, // If we're going through this then nullmask is already in another branch - UNKNOWN_NULL_COUNT, + 0, parent->offset(), {*parent->children[lists_column_view::offsets_column_index], temp_col}); } else if (parent->type().id() == type_id::STRUCT) { @@ -258,7 +257,7 @@ auto decompose_structs(table_view table, parent->size(), temp_col.head(), temp_col.null_mask(), - UNKNOWN_NULL_COUNT, + temp_col.null_count(), parent->offset(), {temp_col.child_begin(), temp_col.child_end()}); }