Skip to content

Commit

Permalink
Create table_input_metadata from a table_metadata (#13920)
Browse files Browse the repository at this point in the history
When round-tripping data through cuDF (e.g. read a parquet file with `read_parquet()`, then write slices using the `chunked_parquet_writer`) column nullability information can be lost. The parquet writers will accept a `table_input_metadata` object as an optional parameter, and this object can be used to preserve the nullability. Creating the `table_input_metadata` can be a challenge, however. This PR addresses this problem by adding the ability to create a `table_input_metadata` using the `table_metadata` returned by `read_parquet()`.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #13920
  • Loading branch information
etseidl authored Aug 30, 2023
1 parent f999e1c commit 8978a21
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 7 deletions.
25 changes: 21 additions & 4 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,20 +201,27 @@ enum dictionary_policy {
};

/**
* @brief Detailed name information for output columns.
* @brief Detailed name (and optionally nullability) information for output columns.
*
* The hierarchy of children matches the hierarchy of children in the output
* cudf columns.
*/
struct column_name_info {
std::string name; ///< Column name
std::optional<bool> is_nullable; ///< Column nullability
std::vector<column_name_info> children; ///< Child column names

/**
* @brief Construct a column name info with a name and no children
* @brief Construct a column name info with a name, optional nullabilty, and no children
*
* @param _name Column name
* @param _is_nullable True if column is nullable
*/
column_name_info(std::string const& _name) : name(_name) {}
column_name_info(std::string const& _name, std::optional<bool> _is_nullable = std::nullopt)
: name(_name), is_nullable(_is_nullable)
{
}

column_name_info() = default;
};

Expand Down Expand Up @@ -798,7 +805,17 @@ class table_input_metadata {
*
* @param table The table_view to construct metadata for
*/
table_input_metadata(table_view const& table);
explicit table_input_metadata(table_view const& table);

/**
* @brief Construct a new table_input_metadata from a table_metadata object.
*
* The constructed table_input_metadata has the same structure, column names and nullability as
* the passed table_metadata.
*
* @param metadata The table_metadata to construct table_intput_metadata for
*/
explicit table_input_metadata(table_metadata const& metadata);

std::vector<column_in_metadata> column_metadata; //!< List of column metadata
};
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,26 @@ table_input_metadata::table_input_metadata(table_view const& table)
table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children);
}

table_input_metadata::table_input_metadata(table_metadata const& metadata)
{
auto const& names = metadata.schema_info;

// Create a metadata hierarchy with naming and nullability using `table_metadata`
std::function<column_in_metadata(column_name_info const&)> process_node =
[&](column_name_info const& name) {
auto col_meta = column_in_metadata{name.name};
if (name.is_nullable.has_value()) { col_meta.set_nullability(name.is_nullable.value()); }
std::transform(name.children.begin(),
name.children.end(),
std::back_inserter(col_meta.children),
process_node);
return col_meta;
};

std::transform(
names.begin(), names.end(), std::back_inserter(this->column_metadata), process_node);
}

/**
* @copydoc cudf::io::write_parquet
*/
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,9 @@ void reader::impl::populate_metadata(table_metadata& out_metadata)
// Return column names
out_metadata.schema_info.resize(_output_buffers.size());
for (size_t i = 0; i < _output_column_schemas.size(); i++) {
auto const& schema = _metadata->get_schema(_output_column_schemas[i]);
out_metadata.schema_info[i].name = schema.name;
auto const& schema = _metadata->get_schema(_output_column_schemas[i]);
out_metadata.schema_info[i].name = schema.name;
out_metadata.schema_info[i].is_nullable = schema.repetition_type != REQUIRED;
}

// Return user metadata
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/utilities/column_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,10 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
std::optional<reader_column_schema> const& schema,
rmm::cuda_stream_view stream)
{
if (schema_info != nullptr) { schema_info->name = buffer.name; }
if (schema_info != nullptr) {
schema_info->name = buffer.name;
schema_info->is_nullable = buffer.is_nullable;
}

switch (buffer.type.id()) {
case type_id::STRING:
Expand Down
64 changes: 64 additions & 0 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6599,6 +6599,70 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
}

TEST_F(ParquetWriterTest, PreserveNullability)
{
constexpr auto num_rows = 100;

auto const col0_data = random_values<int32_t>(num_rows);
auto const col1_data = random_values<int32_t>(num_rows);

auto const col0_validity = cudf::test::iterators::no_nulls();
auto const col1_validity =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });

column_wrapper<int32_t> col0{col0_data.begin(), col0_data.end(), col0_validity};
column_wrapper<int32_t> col1{col1_data.begin(), col1_data.end(), col1_validity};
auto const col2 = make_parquet_list_list_col<int>(0, num_rows, 5, 8, true);

auto const expected = table_view{{col0, col1, *col2}};

cudf::io::table_input_metadata expected_metadata(expected);
expected_metadata.column_metadata[0].set_name("mandatory");
expected_metadata.column_metadata[0].set_nullability(false);
expected_metadata.column_metadata[1].set_name("optional");
expected_metadata.column_metadata[1].set_nullability(true);
expected_metadata.column_metadata[2].set_name("lists");
expected_metadata.column_metadata[2].set_nullability(true);
// offsets is a cudf thing that's not part of the parquet schema so it won't have nullability set
expected_metadata.column_metadata[2].child(0).set_name("offsets");
expected_metadata.column_metadata[2].child(1).set_name("element");
expected_metadata.column_metadata[2].child(1).set_nullability(false);
expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets");
expected_metadata.column_metadata[2].child(1).child(1).set_name("element");
expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true);

auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet");
cudf::io::parquet_writer_options out_opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
.metadata(expected_metadata);

cudf::io::write_parquet(out_opts);

cudf::io::parquet_reader_options const in_opts =
cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
auto const result = cudf::io::read_parquet(in_opts);
auto const read_metadata = cudf::io::table_input_metadata{result.metadata};

// test that expected_metadata matches read_metadata
std::function<void(cudf::io::column_in_metadata, cudf::io::column_in_metadata)>
compare_names_and_nullability = [&](auto lhs, auto rhs) {
EXPECT_EQ(lhs.get_name(), rhs.get_name());
ASSERT_EQ(lhs.is_nullability_defined(), rhs.is_nullability_defined());
if (lhs.is_nullability_defined()) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); }
ASSERT_EQ(lhs.num_children(), rhs.num_children());
for (int i = 0; i < lhs.num_children(); ++i) {
compare_names_and_nullability(lhs.child(i), rhs.child(i));
}
};

ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size());

for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) {
compare_names_and_nullability(expected_metadata.column_metadata[i],
read_metadata.column_metadata[i]);
}
}

TEST_P(ParquetV2Test, CheckEncodings)
{
using cudf::io::parquet::Encoding;
Expand Down

0 comments on commit 8978a21

Please sign in to comment.