Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create table_input_metadata from a table_metadata #13920

Merged
merged 27 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
36f7ad8
add helpers to get table_input_metadata from a table_with_metadata
etseidl Aug 17, 2023
8b7020a
Merge branch 'rapidsai:branch-23.10' into feature/input_metadata
etseidl Aug 19, 2023
ac39211
Merge remote-tracking branch 'origin/branch-23.10' into feature/input…
etseidl Aug 21, 2023
07204db
add test
etseidl Aug 21, 2023
24bf9d8
change expect to assert for bounds checks
etseidl Aug 21, 2023
7cc11a6
remove fluff and make ctor that takes a table_with_metadata explicit
etseidl Aug 21, 2023
acfb613
make another ctor explicit...breaking change
etseidl Aug 22, 2023
00b54b9
add nullability to column_name_info since the table can lie.
etseidl Aug 22, 2023
9aa89d3
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 22, 2023
e1a5cef
probably better to make the struct node mandatory
etseidl Aug 22, 2023
d02ba3f
get rid of getters/setters
etseidl Aug 23, 2023
ed51828
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 23, 2023
3f62e0d
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 23, 2023
a392472
only need one constructor
etseidl Aug 24, 2023
b5d42cf
don't set nullability on metadata elements that are internal to cudf
etseidl Aug 24, 2023
73a72b9
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 24, 2023
4be5ae3
i
etseidl Aug 24, 2023
bfad864
formatting
etseidl Aug 24, 2023
a55a3e9
Merge remote-tracking branch 'origin/branch-23.10' into feature/input…
etseidl Aug 26, 2023
7488872
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 27, 2023
d9f6241
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 29, 2023
0530450
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 29, 2023
497522e
Merge remote-tracking branch 'origin/branch-23.10' into feature/input…
etseidl Aug 29, 2023
df0fe86
Merge branch 'branch-23.10' into feature/input_metadata
hyperbolic2346 Aug 29, 2023
f2be602
rename lambda and correct comment
etseidl Aug 29, 2023
632e709
Merge branch 'branch-23.10' into feature/input_metadata
etseidl Aug 29, 2023
dc806c5
Merge branch 'branch-23.10' into feature/input_metadata
vuule Aug 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,12 +230,24 @@ struct table_metadata {
per_file_user_data; //!< Per file format-dependent metadata as key-values pairs
};

class table_input_metadata;

/**
* @brief Table with table metadata used by io readers to return the metadata by value
*
* Pairs the table produced by a reader with the `table_metadata` that describes it, so that
* both can be returned (and later passed around) as a single value.
*/
struct table_with_metadata {
std::unique_ptr<table> tbl; //!< Table; must be non-null when building a `table_input_metadata`
table_metadata metadata; //!< Table metadata; its `schema_info` supplies the column names

/**
* @brief Return a `table_input_metadata` populated with data from this `table_with_metadata`.
*
* The returned `table_input_metadata` will preserve the hierarchy, naming, and nullability
* of the contained table. Column names are taken from `metadata.schema_info`, while the
* hierarchy and nullability are taken from the columns of `tbl`.
*
* `tbl` is dereferenced, and `metadata.schema_info` is read positionally (one entry per
* column, and one child entry per child column), so both are expected to be populated
* consistently with each other.
*
* @return `table_input_metadata` constructed from this object
*/
table_input_metadata get_table_input_metadata();
};

/**
Expand Down Expand Up @@ -800,6 +812,16 @@ class table_input_metadata {
*/
table_input_metadata(table_view const& table);

/**
* @brief Construct a new table_input_metadata from a table_with_metadata.
*
* The constructed table_input_metadata has the same structure as the passed table_with_metadata,
* and also preserves any naming and nullability info from the original table.
*
* Names are read from `table_and_metadata.metadata.schema_info` and are matched to columns by
* position at every nesting level; nullability is read from the columns of
* `table_and_metadata.tbl`. The table pointer is therefore expected to be non-null, and
* `schema_info` is expected to contain an entry for every column and every nested child column.
*
* @param table_and_metadata The table_with_metadata to construct metadata for
*/
table_input_metadata(table_with_metadata const& table_and_metadata);

std::vector<column_in_metadata> column_metadata; //!< List of column metadata
};

Expand Down
30 changes: 30 additions & 0 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,11 @@ std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
return detail_parquet::writer::merge_row_group_metadata(metadata_list);
}

// Convenience accessor: delegates to the `table_input_metadata` constructor that accepts a
// `table_with_metadata`, passing this object.
table_input_metadata table_with_metadata::get_table_input_metadata()
{
  return table_input_metadata{*this};
}

table_input_metadata::table_input_metadata(table_view const& table)
{
// Create a metadata hierarchy using `table`
Expand All @@ -517,6 +522,31 @@ table_input_metadata::table_input_metadata(table_view const& table)
table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children);
}

// Builds a metadata hierarchy that mirrors the table held by `table_and_metadata`:
//  - names come from `table_and_metadata.metadata.schema_info`, matched to columns by position;
//  - nullability comes from `column_view::nullable()` of each (child) column.
//
// Preconditions, enforced with CUDF_EXPECTS:
//  - `table_and_metadata.tbl` is non-null;
//  - `schema_info` has at least one entry per column, and every entry has at least one child
//    entry per child column. Surplus entries are ignored, exactly as before.
// Without these checks the two-range `std::transform` calls below would read past the end of
// the name vectors, which is undefined behavior.
table_input_metadata::table_input_metadata(table_with_metadata const& table_and_metadata)
{
  CUDF_EXPECTS(table_and_metadata.tbl != nullptr,
               "table_with_metadata must contain a table to construct metadata from");

  auto const table  = table_and_metadata.tbl->view();
  auto const& names = table_and_metadata.metadata.schema_info;

  CUDF_EXPECTS(
    names.size() >= static_cast<std::size_t>(std::distance(table.begin(), table.end())),
    "schema_info must contain an entry for every column of the table");

  // Create a metadata hierarchy with naming and nullability using `table_and_metadata`
  std::function<column_in_metadata(column_view const&, column_name_info const&)> get_children =
    [&](column_view const& col, column_name_info const& name) {
      // The second input range of std::transform is consumed once per child column, so it must
      // be at least as long as the child range.
      CUDF_EXPECTS(
        name.children.size() >=
          static_cast<std::size_t>(std::distance(col.child_begin(), col.child_end())),
        "schema_info must contain an entry for every child of each column");

      auto col_meta = column_in_metadata{name.name};
      col_meta.set_nullability(col.nullable());
      std::transform(col.child_begin(),
                     col.child_end(),
                     name.children.begin(),
                     std::back_inserter(col_meta.children),
                     get_children);
      return col_meta;
    };

  std::transform(table.begin(),
                 table.end(),
                 names.begin(),
                 std::back_inserter(this->column_metadata),
                 get_children);
}

/**
* @copydoc cudf::io::write_parquet
*/
Expand Down
64 changes: 64 additions & 0 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6599,4 +6599,68 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
}

// Round-trip test: writes a table with explicit per-column names and nullability, reads it
// back, builds a `table_input_metadata` from the read result, and checks that names and
// nullability match the metadata used for writing at every nesting level.
TEST_F(ParquetWriterTest, PreserveNullability)
{
  constexpr auto num_rows = 100;

  auto const col0_data = random_values<int32_t>(num_rows);
  auto const col1_data = random_values<int32_t>(num_rows);

  auto const col0_validity = cudf::test::iterators::no_nulls();
  auto const col1_validity =
    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });

  column_wrapper<int32_t> col0{col0_data.begin(), col0_data.end(), col0_validity};
  column_wrapper<int32_t> col1{col1_data.begin(), col1_data.end(), col1_validity};
  auto const col2 = make_parquet_list_list_col<int>(0, num_rows, 5, 8, true);

  auto const expected = table_view{{col0, col1, *col2}};

  // Metadata used for writing; the read-back metadata is compared against this.
  cudf::io::table_input_metadata expected_metadata(expected);
  expected_metadata.column_metadata[0].set_name("mandatory");
  expected_metadata.column_metadata[0].set_nullability(false);
  expected_metadata.column_metadata[1].set_name("optional");
  expected_metadata.column_metadata[1].set_nullability(true);
  expected_metadata.column_metadata[2].set_name("lists");
  expected_metadata.column_metadata[2].set_nullability(true);
  expected_metadata.column_metadata[2].child(0).set_name("offsets");
  expected_metadata.column_metadata[2].child(0).set_nullability(false);
  expected_metadata.column_metadata[2].child(1).set_name("element");
  expected_metadata.column_metadata[2].child(1).set_nullability(false);
  expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets");
  expected_metadata.column_metadata[2].child(1).child(0).set_nullability(false);
  expected_metadata.column_metadata[2].child(1).child(1).set_name("element");
  expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true);

  auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet");
  cudf::io::parquet_writer_options out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
      .metadata(expected_metadata);

  cudf::io::write_parquet(out_opts);

  cudf::io::parquet_reader_options const in_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
  auto const result        = cudf::io::read_parquet(in_opts);
  auto const read_metadata = cudf::io::table_input_metadata{result};

  // test that expected_metadata matches read_metadata
  std::function<void(cudf::io::column_in_metadata, cudf::io::column_in_metadata)>
    compare_names_and_nullability = [&](auto lhs, auto rhs) {
      EXPECT_EQ(lhs.get_name(), rhs.get_name());
      EXPECT_EQ(lhs.nullable(), rhs.nullable());
      // Fatal assertion: on a mismatch, return before indexing children that may not exist
      // on one side. (This only leaves the current invocation; the failure is still recorded.)
      ASSERT_EQ(lhs.num_children(), rhs.num_children());
      for (int i = 0; i < lhs.num_children(); ++i) {
        compare_names_and_nullability(lhs.child(i), rhs.child(i));
      }
    };

  // Fatal assertion: the loop below indexes both vectors with the same index, so the sizes
  // must agree before it runs.
  ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size());

  for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) {
    compare_names_and_nullability(expected_metadata.column_metadata[i],
                                  read_metadata.column_metadata[i]);
  }
}

CUDF_TEST_PROGRAM_MAIN()