diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 9b0dcff99af..a97f81182ac 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -201,20 +201,27 @@ enum dictionary_policy {
 };
 
 /**
- * @brief Detailed name information for output columns.
+ * @brief Detailed name (and optionally nullability) information for output columns.
  *
  * The hierarchy of children matches the hierarchy of children in the output
  * cudf columns.
  */
 struct column_name_info {
   std::string name;                        ///< Column name
+  std::optional<bool> is_nullable;         ///< Column nullability
   std::vector<column_name_info> children;  ///< Child column names
+
   /**
-   * @brief Construct a column name info with a name and no children
+   * @brief Construct a column name info with a name, optional nullability, and no children
    *
    * @param _name Column name
+   * @param _is_nullable True if column is nullable; std::nullopt if unspecified
    */
-  column_name_info(std::string const& _name) : name(_name) {}
+  column_name_info(std::string const& _name, std::optional<bool> _is_nullable = std::nullopt)
+    : name(_name), is_nullable(_is_nullable)
+  {
+  }
+
   column_name_info() = default;
 };
 
@@ -798,7 +805,17 @@ class table_input_metadata {
    *
    * @param table The table_view to construct metadata for
    */
-  table_input_metadata(table_view const& table);
+  explicit table_input_metadata(table_view const& table);
+
+  /**
+   * @brief Construct a new table_input_metadata from a table_metadata object.
+   *
+   * The constructed table_input_metadata has the same structure, column names and nullability as
+   * the passed table_metadata.
+   *
+   * @param metadata The table_metadata to construct table_input_metadata for
+   */
+  explicit table_input_metadata(table_metadata const& metadata);
 
   std::vector<column_in_metadata> column_metadata;  //!< List of column metadata
 };
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 5adb2046dbd..45f8b0f8822 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -517,6 +517,26 @@ table_input_metadata::table_input_metadata(table_view const& table)
     table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children);
 }
 
+table_input_metadata::table_input_metadata(table_metadata const& metadata)
+{
+  auto const& names = metadata.schema_info;
+
+  // Create a metadata hierarchy with naming and nullability using `table_metadata`
+  std::function<column_in_metadata(column_name_info const&)> process_node =
+    [&](column_name_info const& name) {
+      auto col_meta = column_in_metadata{name.name};
+      if (name.is_nullable.has_value()) { col_meta.set_nullability(name.is_nullable.value()); }
+      std::transform(name.children.begin(),
+                     name.children.end(),
+                     std::back_inserter(col_meta.children),
+                     process_node);
+      return col_meta;
+    };
+
+  std::transform(
+    names.begin(), names.end(), std::back_inserter(this->column_metadata), process_node);
+}
+
 /**
  * @copydoc cudf::io::write_parquet
  */
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 9289ddb91b3..8a73c43be3e 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -366,8 +366,9 @@ void reader::impl::populate_metadata(table_metadata& out_metadata)
   // Return column names
   out_metadata.schema_info.resize(_output_buffers.size());
   for (size_t i = 0; i < _output_column_schemas.size(); i++) {
-    auto const& schema               = _metadata->get_schema(_output_column_schemas[i]);
-    out_metadata.schema_info[i].name = schema.name;
+    auto const& schema                      = _metadata->get_schema(_output_column_schemas[i]);
+    out_metadata.schema_info[i].name        = schema.name;
+    out_metadata.schema_info[i].is_nullable = schema.repetition_type != REQUIRED;
   }
 
   // Return user metadata
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 3248d94d60a..f3a43cbc63c 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -149,7 +149,10 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
                                     std::optional<reader_column_schema> const& schema,
                                     rmm::cuda_stream_view stream)
 {
-  if (schema_info != nullptr) { schema_info->name = buffer.name; }
+  if (schema_info != nullptr) {
+    schema_info->name        = buffer.name;
+    schema_info->is_nullable = buffer.is_nullable;
+  }
 
   switch (buffer.type.id()) {
     case type_id::STRING:
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index b210452f619..3cd5c9f5593 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -6599,6 +6599,70 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
+TEST_F(ParquetWriterTest, PreserveNullability)
+{
+  constexpr auto num_rows = 100;
+
+  auto const col0_data = random_values<int32_t>(num_rows);
+  auto const col1_data = random_values<int32_t>(num_rows);
+
+  auto const col0_validity = cudf::test::iterators::no_nulls();
+  auto const col1_validity =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });
+
+  column_wrapper<int32_t> col0{col0_data.begin(), col0_data.end(), col0_validity};
+  column_wrapper<int32_t> col1{col1_data.begin(), col1_data.end(), col1_validity};
+  auto const col2 = make_parquet_list_list_col<int>(0, num_rows, 5, 8, true);
+
+  auto const expected = table_view{{col0, col1, *col2}};
+
+  cudf::io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("mandatory");
+  expected_metadata.column_metadata[0].set_nullability(false);
+  expected_metadata.column_metadata[1].set_name("optional");
+  expected_metadata.column_metadata[1].set_nullability(true);
+  expected_metadata.column_metadata[2].set_name("lists");
+  expected_metadata.column_metadata[2].set_nullability(true);
+  // offsets is a cudf thing that's not part of the parquet schema so it won't have nullability set
+  expected_metadata.column_metadata[2].child(0).set_name("offsets");
+  expected_metadata.column_metadata[2].child(1).set_name("element");
+  expected_metadata.column_metadata[2].child(1).set_nullability(false);
+  expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets");
+  expected_metadata.column_metadata[2].child(1).child(1).set_name("element");
+  expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true);
+
+  auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .metadata(expected_metadata);
+
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options const in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto const result        = cudf::io::read_parquet(in_opts);
+  auto const read_metadata = cudf::io::table_input_metadata{result.metadata};
+
+  // test that expected_metadata matches read_metadata
+  std::function<void(cudf::io::column_in_metadata, cudf::io::column_in_metadata)>
+    compare_names_and_nullability = [&](auto lhs, auto rhs) {
+      EXPECT_EQ(lhs.get_name(), rhs.get_name());
+      ASSERT_EQ(lhs.is_nullability_defined(), rhs.is_nullability_defined());
+      if (lhs.is_nullability_defined()) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); }
+      ASSERT_EQ(lhs.num_children(), rhs.num_children());
+      for (int i = 0; i < lhs.num_children(); ++i) {
+        compare_names_and_nullability(lhs.child(i), rhs.child(i));
+      }
+    };
+
+  ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size());
+
+  for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) {
+    compare_names_and_nullability(expected_metadata.column_metadata[i],
+                                  read_metadata.column_metadata[i]);
+  }
+}
+
 TEST_P(ParquetV2Test, CheckEncodings)
 {
   using cudf::io::parquet::Encoding;