Skip to content

Commit

Permalink
Support "unflatten" of columns flattened via `flatten_nested_columns(…
Browse files Browse the repository at this point in the history
…)`: (#8956)

`cudf::flatten_nested_columns()` flattens out `STRUCT` columns into their
constituent member columns, and includes the `STRUCT`'s validity information
as a `BOOL8` column.
E.g. `STRUCT_1< STRUCT_2< A, B >, C >` is flattened to:
     1. Null Vector for `STRUCT_1`
     2. Null Vector for `STRUCT_2`
     3. Member `STRUCT_2::A`
     4. Member `STRUCT_2::B`
     5. Member `STRUCT_1::C`

This commit adds an `unflatten_nested_columns()` method to convert back
from a flattened representation to the nested columns.

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Devavret Makkar (https://github.com/devavret)
  - Nghia Truong (https://github.com/ttnghia)

URL: #8956
  • Loading branch information
mythrocks authored Aug 20, 2021
1 parent 3fec3d8 commit 8c92812
Show file tree
Hide file tree
Showing 4 changed files with 374 additions and 8 deletions.
132 changes: 124 additions & 8 deletions cpp/src/structs/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@

#include <thrust/iterator/counting_iterator.h>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -61,6 +63,24 @@ std::vector<std::vector<column_view>> extract_ordered_struct_children(
return result;
}

namespace {
/**
* @brief Check whether the specified column is of type `STRUCT`.
*/
bool is_struct(cudf::column_view const& col) { return col.type().id() == type_id::STRUCT; }

/**
* @brief Check whether the specified column is of type LIST, or any LISTs in its descendent
* columns.
*/
bool is_or_has_nested_lists(cudf::column_view const& col)
{
auto is_list = [](cudf::column_view const& col) { return col.type().id() == type_id::LIST; };

return is_list(col) || std::any_of(col.child_begin(), col.child_end(), is_or_has_nested_lists);
}
} // namespace

/**
* @brief Flattens struct columns to constituent non-struct columns in the input table.
*
Expand All @@ -86,6 +106,13 @@ struct flattened_table {
null_precedence(null_precedence),
nullability(nullability)
{
fail_if_unsupported_types(input);
}

void fail_if_unsupported_types(table_view const& input) const
{
auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists);
CUDF_EXPECTS(not has_lists, "Flattening LIST columns is not supported.");
}

// Convert null_mask to BOOL8 columns and flatten the struct children in order.
Expand Down Expand Up @@ -156,9 +183,6 @@ struct flattened_table {
}
};

/**
* @copydoc cudf::detail::flatten_nested_columns
*/
std::tuple<table_view,
std::vector<order>,
std::vector<null_order>,
Expand All @@ -168,15 +192,107 @@ flatten_nested_columns(table_view const& input,
std::vector<null_order> const& null_precedence,
column_nullability nullability)
{
std::vector<std::unique_ptr<column>> validity_as_column;
auto const has_struct = std::any_of(
input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; });
if (not has_struct)
return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column));
auto const has_struct = std::any_of(input.begin(), input.end(), is_struct);
if (not has_struct) {
return std::make_tuple(
input, column_order, null_precedence, std::vector<std::unique_ptr<column>>{});
}

return flattened_table{input, column_order, null_precedence, nullability}();
}

namespace {
using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
using column_index_t = typename vector_of_columns::size_type;

// Forward declaration, to enable recursion via `unflattener`.
std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
column_index_t& current_index,
cudf::column_view const& blueprint);

/**
* @brief Helper functor to reconstruct STRUCT columns from its flattened member columns.
*
*/
class unflattener {
public:
unflattener(vector_of_columns& flattened_, column_index_t& current_index_)
: flattened{flattened_}, current_index{current_index_}
{
}

auto operator()(column_view const& blueprint)
{
return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint)
: std::move(flattened[current_index++]);
}

private:
vector_of_columns& flattened;
column_index_t& current_index;

}; // class unflattener;

std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
column_index_t& current_index,
cudf::column_view const& blueprint)
{
// "Consume" columns from `flattened`, starting at `current_index`,
// based on the provided `blueprint` struct col. Recurse for struct children.
CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT,
"Expected blueprint column to be a STRUCT column.");

CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children.");

auto const num_rows = flattened[current_index]->size();

// cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
// before the child/member columns.
// E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
// 1. Null Vector for STRUCT_1
// 2. Null Vector for STRUCT_2
// 3. Member STRUCT_2::A
// 4. Member STRUCT_2::B
// 5. Member STRUCT_1::C
//
// Extract null-vector *before* child columns are constructed.
auto struct_null_column_contents = flattened[current_index++]->release();
auto unflattening_iter =
thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index});

return cudf::make_structs_column(
num_rows,
vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()},
UNKNOWN_NULL_COUNT, // Do count?
std::move(*struct_null_column_contents.null_mask));
}
} // namespace

std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
table_view const& blueprint)
{
// Bail, if LISTs are present.
auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists);
CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported.");

// If there are no STRUCTs, unflattening is a NOOP.
auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct);
if (not has_structs) {
return std::move(flattened); // Unchanged.
}

// There be struct columns.
// Note: Requires null vectors for all struct input columns.
auto flattened_columns = flattened->release();
auto current_idx = column_index_t{0};

auto unflattening_iter =
thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx});

return std::make_unique<cudf::table>(
vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()});
}

// Helper function to superimpose validity of parent struct
// over the specified member (child) column.
void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
Expand Down
29 changes: 29 additions & 0 deletions cpp/src/structs/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,35 @@ flatten_nested_columns(table_view const& input,
std::vector<null_order> const& null_precedence,
column_nullability nullability = column_nullability::MATCH_INCOMING);

/**
* @brief Unflatten columns flattened as by `flatten_nested_columns()`,
* based on the provided `blueprint`.
*
* cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
* before the child/member columns.
* E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
* 1. Null Vector for STRUCT_1
* 2. Null Vector for STRUCT_2
* 3. Member STRUCT_2::A
* 4. Member STRUCT_2::B
* 5. Member STRUCT_1::C
*
* `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows
* the convention above.
*
* Note: This function requires a null-mask vector for each STRUCT column, including for nested
* STRUCT members.
*
* @param flattened "Flattened" `table` of input columns, following the conventions in
* `flatten_nested_columns()`.
* @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines
* the nesting of the reconstructed output table.
* @return std::unique_ptr<cudf::table> Unflattened table (with nested STRUCT columns) reconstructed
* based on `blueprint`.
*/
std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
table_view const& blueprint);

/**
* @brief Pushdown nulls from a parent mask into a child column, using AND.
*
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ ConfigureTest(STRINGS_TEST
# - structs test ----------------------------------------------------------------------------------
ConfigureTest(STRUCTS_TEST
structs/structs_column_tests.cpp
structs/utilities_tests.cpp
)

###################################################################################################
Expand Down
Loading

0 comments on commit 8c92812

Please sign in to comment.