Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support "unflatten" of columns flattened via flatten_nested_columns(): #8956

Merged
merged 16 commits into from
Aug 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 124 additions & 8 deletions cpp/src/structs/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@

#include <thrust/iterator/counting_iterator.h>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -61,6 +63,24 @@ std::vector<std::vector<column_view>> extract_ordered_struct_children(
return result;
}

namespace {
/**
* @brief Check whether the specified column is of type `STRUCT`.
*/
bool is_struct(cudf::column_view const& col) { return col.type().id() == type_id::STRUCT; }

/**
* @brief Check whether the specified column is of type LIST, or any LISTs in its descendent
* columns.
*/
bool is_or_has_nested_lists(cudf::column_view const& col)
{
auto is_list = [](cudf::column_view const& col) { return col.type().id() == type_id::LIST; };

return is_list(col) || std::any_of(col.child_begin(), col.child_end(), is_or_has_nested_lists);
}
} // namespace

/**
* @brief Flattens struct columns to constituent non-struct columns in the input table.
*
Expand All @@ -86,6 +106,13 @@ struct flattened_table {
null_precedence(null_precedence),
nullability(nullability)
{
fail_if_unsupported_types(input);
}

void fail_if_unsupported_types(table_view const& input) const
{
auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists);
CUDF_EXPECTS(not has_lists, "Flattening LIST columns is not supported.");
}

// Convert null_mask to BOOL8 columns and flatten the struct children in order.
Expand Down Expand Up @@ -156,9 +183,6 @@ struct flattened_table {
}
};

/**
* @copydoc cudf::detail::flatten_nested_columns
*/
std::tuple<table_view,
std::vector<order>,
std::vector<null_order>,
Expand All @@ -168,15 +192,107 @@ flatten_nested_columns(table_view const& input,
std::vector<null_order> const& null_precedence,
column_nullability nullability)
{
std::vector<std::unique_ptr<column>> validity_as_column;
auto const has_struct = std::any_of(
input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; });
if (not has_struct)
return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column));
auto const has_struct = std::any_of(input.begin(), input.end(), is_struct);
if (not has_struct) {
return std::make_tuple(
input, column_order, null_precedence, std::vector<std::unique_ptr<column>>{});
}

return flattened_table{input, column_order, null_precedence, nullability}();
}

namespace {
using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
using column_index_t = typename vector_of_columns::size_type;

// Forward declaration, to enable recursion via `unflattener`.
std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
column_index_t& current_index,
cudf::column_view const& blueprint);

/**
* @brief Helper functor to reconstruct STRUCT columns from its flattened member columns.
*
*/
class unflattener {
public:
unflattener(vector_of_columns& flattened_, column_index_t& current_index_)
: flattened{flattened_}, current_index{current_index_}
{
}

auto operator()(column_view const& blueprint)
{
return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint)
: std::move(flattened[current_index++]);
}

private:
vector_of_columns& flattened;
column_index_t& current_index;

}; // class unflattener;

std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
column_index_t& current_index,
cudf::column_view const& blueprint)
{
// "Consume" columns from `flattened`, starting at `current_index`,
// based on the provided `blueprint` struct col. Recurse for struct children.
CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT,
"Expected blueprint column to be a STRUCT column.");

CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children.");

auto const num_rows = flattened[current_index]->size();

// cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
// before the child/member columns.
// E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
// 1. Null Vector for STRUCT_1
// 2. Null Vector for STRUCT_2
// 3. Member STRUCT_2::A
// 4. Member STRUCT_2::B
// 5. Member STRUCT_1::C
//
// Extract null-vector *before* child columns are constructed.
auto struct_null_column_contents = flattened[current_index++]->release();
auto unflattening_iter =
thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index});

return cudf::make_structs_column(
num_rows,
vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()},
UNKNOWN_NULL_COUNT, // Do count?
std::move(*struct_null_column_contents.null_mask));
}
} // namespace

std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
table_view const& blueprint)
{
// Bail, if LISTs are present.
auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists);
CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported.");

// If there are no STRUCTs, unflattening is a NOOP.
auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct);
if (not has_structs) {
return std::move(flattened); // Unchanged.
}

// There be struct columns.
// Note: Requires null vectors for all struct input columns.
auto flattened_columns = flattened->release();
auto current_idx = column_index_t{0};

auto unflattening_iter =
thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx});

return std::make_unique<cudf::table>(
vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()});
}

// Helper function to superimpose validity of parent struct
// over the specified member (child) column.
void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
Expand Down
29 changes: 29 additions & 0 deletions cpp/src/structs/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,35 @@ flatten_nested_columns(table_view const& input,
std::vector<null_order> const& null_precedence,
column_nullability nullability = column_nullability::MATCH_INCOMING);

/**
* @brief Unflatten columns flattened as by `flatten_nested_columns()`,
* based on the provided `blueprint`.
*
* cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
* before the child/member columns.
* E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
* 1. Null Vector for STRUCT_1
* 2. Null Vector for STRUCT_2
* 3. Member STRUCT_2::A
* 4. Member STRUCT_2::B
* 5. Member STRUCT_1::C
*
* `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows
* the convention above.
*
* Note: This function requires a null-mask vector for each STRUCT column, including for nested
* STRUCT members.
*
* @param flattened "Flattened" `table` of input columns, following the conventions in
* `flatten_nested_columns()`.
* @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines
* the nesting of the reconstructed output table.
* @return std::unique_ptr<cudf::table> Unflattened table (with nested STRUCT columns) reconstructed
* based on `blueprint`.
*/
std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
table_view const& blueprint);

/**
* @brief Pushdown nulls from a parent mask into a child column, using AND.
*
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ ConfigureTest(STRINGS_TEST
# - structs test ----------------------------------------------------------------------------------
ConfigureTest(STRUCTS_TEST
structs/structs_column_tests.cpp
structs/utilities_tests.cpp
)

###################################################################################################
Expand Down
Loading