Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support structs of lists in row lexicographic comparator #13005

Merged
merged 25 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ class device_row_comparator {
}

if (lcol.num_child_columns() == 0) {
return cuda::std::pair(weak_ordering::EQUIVALENT, depth);
return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits<int>::max());
vyasr marked this conversation as resolved.
Show resolved Hide resolved
}

// Non-empty structs have been modified to only have 1 child when using this.
Expand Down
44 changes: 35 additions & 9 deletions cpp/src/table/row_operators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,20 @@ table_view remove_struct_child_offsets(table_view table)
/**
* @brief Decompose all struct columns in a table
*
* If a struct column is a tree with N leaves, then this function decomposes the tree into
* If a structs column is a tree with N leaves, then this function decomposes the tree into
* N "linear trees" (branch factor == 1) and prunes common parents. Also returns a vector of
* per-column `depth`s.
*
* A `depth` value is the number of parent nesting levels of the column in the original,
* non-decomposed table that are pruned during decomposition.
*
* Special handling is needed when a structs column has a lists column as its first child. In
* such situations, the function decomposes the tree of N leaves into N+1 linear trees, in which the
* second tree is generated by extracting the leaf out of the first tree. This ensures that no
* structs column in the output has a lists column as its child. Note that structs with lists
* children in subsequent positions do not require any special treatment because the struct parent
* will be pruned for all subsequent children.
*
* For example, if the original table has a column `Struct<Struct<int, float>, decimal>`,
*
* S1
Expand All @@ -113,7 +120,7 @@ table_view remove_struct_child_offsets(table_view table)
* The depth of the first column is 0 because it contains all its parent levels, while the depth
* of the second column is 2 because two of its parent struct levels were pruned.
*
* Similarly, a struct column of type Struct<int, Struct<float, decimal>> is decomposed as follows
* Similarly, a struct column of type `Struct<int, Struct<float, decimal>>` is decomposed as follows
*
* S1
* / \
Expand Down Expand Up @@ -148,6 +155,10 @@ table_view remove_struct_child_offsets(table_view table)
* The list parents are still needed to define the range of elements in the leaf that belong to the
* same row.
*
* In the case of a structs column with a lists column as its first child, such as
* `Struct<List<int>, float>`, after decomposition we get three columns: `Struct<>`,
* `List<int>`, and `float`.
*
* @param table The table whose struct columns to decompose.
* @param column_order The per-column order if using output with lexicographic comparison
* @param null_precedence The per-column null precedence
Expand Down Expand Up @@ -180,7 +191,11 @@ auto decompose_structs(table_view table,
c->children[lists_column_view::child_column_index].get(), branch, depth + 1);
} else if (c->type().id() == type_id::STRUCT) {
for (size_t child_idx = 0; child_idx < c->children.size(); ++child_idx) {
if (child_idx > 0) {
// When child_idx == 0, we also cut off the current branch if its first child is a
// lists column.
// In such cases, the last column of the current branch will be `Struct<List,...>` and
// it will be modified to empty struct type `Struct<>` later on.
if (child_idx > 0 || c->children[0]->type().id() == type_id::LIST) {
verticalized_col_depths.push_back(depth + 1);
branch = &flattened.emplace_back();
}
Expand All @@ -194,6 +209,19 @@ auto decompose_structs(table_view table,

for (auto const& branch : flattened) {
column_view temp_col = *branch.back();

// Change `Struct<List,...>` into empty struct type `Struct<>`.
if (temp_col.type().id() == type_id::STRUCT &&
(temp_col.num_children() > 0 && temp_col.child(0).type().id() == type_id::LIST)) {
temp_col = column_view(temp_col.type(),
temp_col.size(),
temp_col.head(),
temp_col.null_mask(),
temp_col.null_count(),
temp_col.offset(),
{});
}

for (auto it = branch.crbegin() + 1; it < branch.crend(); ++it) {
auto const& prev_col = *(*it);
auto children =
Expand All @@ -206,7 +234,7 @@ auto decompose_structs(table_view table,
prev_col.size(),
nullptr,
prev_col.null_mask(),
UNKNOWN_NULL_COUNT,
prev_col.null_count(),
prev_col.offset(),
std::move(children));
}
Expand All @@ -220,7 +248,7 @@ auto decompose_structs(table_view table,
parent->size(),
nullptr, // list has no data of its own
nullptr, // If we're going through this then nullmask is already in another branch
UNKNOWN_NULL_COUNT,
0,
parent->offset(),
{*parent->children[lists_column_view::offsets_column_index], temp_col});
} else if (parent->type().id() == type_id::STRUCT) {
Expand All @@ -229,7 +257,7 @@ auto decompose_structs(table_view table,
parent->size(),
temp_col.head(),
temp_col.null_mask(),
UNKNOWN_NULL_COUNT,
temp_col.null_count(),
parent->offset(),
{temp_col.child_begin(), temp_col.child_end()});
}
Expand Down Expand Up @@ -260,7 +288,7 @@ auto decompose_structs(table_view table,
* This helper function generates dremel data for any list-type columns in a
* table. This data is necessary for lexicographic comparisons.
*/
auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream)
auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream)
{
std::vector<detail::dremel_data> dremel_data;
std::vector<detail::dremel_device_view> dremel_device_views;
Expand Down Expand Up @@ -293,8 +321,6 @@ void check_lex_compatibility(table_view const& input)
check_column(list_col.child());
} else if (c.type().id() == type_id::STRUCT) {
for (auto child = c.child_begin(); child < c.child_end(); ++child) {
CUDF_EXPECTS(child->type().id() != type_id::LIST,
"Cannot lexicographic compare a table with a STRUCT of LIST column");
check_column(*child);
}
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,8 @@ endif()
# ##################################################################################################
# * sort tests ------------------------------------------------------------------------------------
ConfigureTest(
SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/stable_sort_tests.cpp
sort/rank_test.cpp
SORT_TEST sort/segmented_sort_tests.cpp sort/sort_nested_types_tests.cpp sort/sort_test.cpp
sort/stable_sort_tests.cpp sort/rank_test.cpp
GPUS 1
PERCENT 70
)
Expand Down
12 changes: 10 additions & 2 deletions cpp/tests/groupby/structs_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,11 +294,12 @@ TYPED_TEST(groupby_structs_test, all_null_input)
test_sum_agg(keys, values, expected_keys, expected_values);
}

TYPED_TEST(groupby_structs_test, lists_are_unsupported)
TYPED_TEST(groupby_structs_test, lists_as_keys)
{
using V = int32_t; // Type of Aggregation Column.
using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member.
using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member.
using R = cudf::detail::target_type_t<V, cudf::aggregation::SUM>;

// clang-format off
auto values = fwcw<V> { 0, 1, 2, 3, 4 };
Expand All @@ -307,5 +308,12 @@ TYPED_TEST(groupby_structs_test, lists_are_unsupported)
// clang-format on
auto keys = cudf::test::structs_column_wrapper{{member_0, member_1}};

EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error);
// clang-format off
auto expected_values = fwcw<R> { 3, 5, 2 };
auto expected_member_0 = lcw<M0> { {1,1}, {2,2}, {3,3} };
auto expected_member_1 = fwcw<M1>{ 1, 2, 3 };
// clang-format on
auto expected_keys = cudf::test::structs_column_wrapper{{expected_member_0, expected_member_1}};

test_sum_agg(keys, values, expected_keys, expected_values);
}
Loading