From 2c6b39bdd7a37e0aa8708ed4018d1ad360a4d104 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 22 Sep 2021 15:31:43 -0700 Subject: [PATCH] Add support for struct type in ORC writer (#9025) Fixes #7830, #8443 Features: - Use the new table metadata type that matches the table hierarchy, `table_input_metadata`. - Support struct columns in the writer. Changes: - Null masks are encoded as aligned rowgroups to avoid invalid bits when the number of encoded rows is not divisible by 8 (except for the last rowgroup in each stripe). This also affects list columns. The issue is equivalent to https://github.com/rapidsai/cudf/issues/6763 (boolean columns only). - Added pushdown masks that are used to determine which child elements should not be encoded, including null mask bits. - Use pushdown masks for rowgroup alignment, null mask encoding and value encoding. - Separated the null mask encoding from value encoding - can be further moved to a separate kernel call. Breaking because the table metadata type has changed. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Robert (Bobby) Evans (https://github.com/revans2) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/9025 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/orc.hpp | 19 +- cpp/include/cudf/io/parquet.hpp | 169 ------ cpp/include/cudf/io/types.hpp | 199 ++++++- cpp/include/cudf/utilities/bit.hpp | 18 + .../cudf_test/io_metadata_utilities.hpp | 25 + cpp/src/io/orc/dict_enc.cu | 2 +- cpp/src/io/orc/orc.h | 8 +- cpp/src/io/orc/orc_gpu.h | 33 +- cpp/src/io/orc/stripe_enc.cu | 267 ++++----- cpp/src/io/orc/stripe_init.cu | 64 ++- cpp/src/io/orc/writer_impl.cu | 510 ++++++++++++++---- cpp/src/io/orc/writer_impl.hpp | 24 +- cpp/tests/io/metadata_utilities.cpp | 42 ++ cpp/tests/io/orc_test.cpp | 366 ++++++------- cpp/tests/io/parquet_test.cpp | 52 +- .../java/ai/rapids/cudf/ORCWriterOptions.java | 4 +- java/src/main/java/ai/rapids/cudf/Table.java | 7 +- java/src/main/native/src/TableJni.cpp | 57 +- .../test/java/ai/rapids/cudf/TableTest.java | 13 +- python/cudf/cudf/_lib/cpp/io/orc.pxd | 12 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 31 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 27 +- python/cudf/cudf/_lib/orc.pyx | 86 ++- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/io/orc.py | 8 +- python/cudf/cudf/tests/test_orc.py | 56 +- python/cudf/cudf/utils/ioutils.py | 6 + 29 files changed, 1313 insertions(+), 797 deletions(-) create mode 100644 cpp/include/cudf_test/io_metadata_utilities.hpp create mode 100644 cpp/tests/io/metadata_utilities.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0f05dcb4bb3..c3450fe8d88 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -238,6 +238,7 @@ test: - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp + - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c72c258fd18..2df35aa0971 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -565,6 +565,7 @@ add_library(cudftestutil STATIC tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu tests/utilities/table_utilities.cu + tests/io/metadata_utilities.cpp tests/strings/utilities.cu) set_target_properties(cudftestutil diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 4ae09b516a4..17d8e5eb7dd 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -389,7 +389,7 @@ class orc_writer_options { // Set of columns to output table_view _table; // Optional associated metadata - const table_metadata* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend orc_writer_options_builder; @@ -445,7 +445,7 @@ class orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -475,7 +475,7 @@ class orc_writer_options { * * @param meta Associated metadata. */ - void set_metadata(table_metadata* meta) { _metadata = meta; } + void set_metadata(table_input_metadata const* meta) { _metadata = meta; } }; class orc_writer_options_builder { @@ -541,7 +541,7 @@ class orc_writer_options_builder { * @param meta Associated metadata. * @return this for chaining. */ - orc_writer_options_builder& metadata(table_metadata* meta) + orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; @@ -570,6 +570,9 @@ class orc_writer_options_builder { * cudf::io::write_orc(options); * @endcode * + * Note: Support for writing tables with struct columns is currently experimental, the output may + * not be as reliable as writing for other datatypes. + * * @param options Settings for controlling reading behavior. * @param mr Device memory resource to use for device memory allocation. */ @@ -592,7 +595,7 @@ class chunked_orc_writer_options { // Enable writing column statistics bool _enable_statistics = true; // Optional associated metadata - const table_metadata_with_nullability* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend chunked_orc_writer_options_builder; @@ -638,7 +641,7 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata_with_nullability const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -661,7 +664,7 @@ class chunked_orc_writer_options { * * @param meta Associated metadata. */ - void metadata(table_metadata_with_nullability* meta) { _metadata = meta; } + void metadata(table_input_metadata const* meta) { _metadata = meta; } }; class chunked_orc_writer_options_builder { @@ -712,7 +715,7 @@ class chunked_orc_writer_options_builder { * @param meta Associated metadata. * @return this for chaining. */ - chunked_orc_writer_options_builder& metadata(table_metadata_with_nullability* meta) + chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 25cbb6fd554..bc495c61d54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -24,8 +24,6 @@ #include -#include - #include #include #include @@ -375,173 +373,6 @@ table_with_metadata read_parquet( * @{ * @file */ -class table_input_metadata; - -class column_in_metadata { - friend table_input_metadata; - std::string _name = ""; - thrust::optional _nullable; - // TODO: This isn't implemented yet - bool _list_column_is_map = false; - bool _use_int96_timestamp = false; - // bool _output_as_binary = false; - thrust::optional _decimal_precision; - std::vector children; - - public: - /** - * @brief Get the children of this column metadata - * - * @return this for chaining - */ - column_in_metadata& add_child(column_in_metadata const& child) - { - children.push_back(child); - return *this; - } - - /** - * @brief Set the name of this column - * - * @return this for chaining - */ - column_in_metadata& set_name(std::string const& name) - { - _name = name; - return *this; - } - - /** - * @brief Set the nullability of this column - * - * Only valid in case of chunked writes. In single writes, this option is ignored. - * - * @return column_in_metadata& - */ - column_in_metadata& set_nullability(bool nullable) - { - _nullable = nullable; - return *this; - } - - /** - * @brief Specify that this list column should be encoded as a map in the written parquet file - * - * The column must have the structure list>. This option is invalid otherwise - * - * @return this for chaining - */ - column_in_metadata& set_list_column_as_map() - { - _list_column_is_map = true; - return *this; - } - - /** - * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 - * physical type. Only valid for the following column types: - * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns - * - * @param req True = use int96 physical type. False = use int64 physical type - * @return this for chaining - */ - column_in_metadata& set_int96_timestamps(bool req) - { - _use_int96_timestamp = req; - return *this; - } - - /** - * @brief Set the decimal precision of this column. Only valid if this column is a decimal - * (fixed-point) type - * - * @param precision The integer precision to set for this decimal column - * @return this for chaining - */ - column_in_metadata& set_decimal_precision(uint8_t precision) - { - _decimal_precision = precision; - return *this; - } - - /** - * @brief Get reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata& child(size_type i) { return children[i]; } - - /** - * @brief Get const reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata const& child(size_type i) const { return children[i]; } - - /** - * @brief Get the name of this column - */ - std::string get_name() const { return _name; } - - /** - * @brief Get whether nullability has been explicitly set for this column. - */ - bool is_nullability_defined() const { return _nullable.has_value(); } - - /** - * @brief Gets the explicitly set nullability for this column. - * @throws If nullability is not explicitly defined for this column. - * Check using `is_nullability_defined()` first. - */ - bool nullable() const { return _nullable.value(); } - - /** - * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. - */ - bool is_map() const { return _list_column_is_map; } - - /** - * @brief Get whether to encode this timestamp column using deprecated int96 physical type - */ - bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } - - /** - * @brief Get whether precision has been set for this decimal column - */ - bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } - - /** - * @brief Get the decimal precision that was set for this column. - * @throws If decimal precision was not set for this column. - * Check using `is_decimal_precision_set()` first. - */ - uint8_t get_decimal_precision() const { return _decimal_precision.value(); } - - /** - * @brief Get the number of children of this column - */ - size_type num_children() const { return children.size(); } -}; - -class table_input_metadata { - public: - table_input_metadata() = default; // Required by cython - - /** - * @brief Construct a new table_input_metadata from a table_view. - * - * The constructed table_input_metadata has the same structure as the passed table_view - * - * @param table The table_view to construct metadata for - * @param user_data Optional Additional metadata to encode, as key-value pairs - */ - table_input_metadata(table_view const& table, std::map user_data = {}); - - std::vector column_metadata; - std::map user_data; //!< Format-dependent metadata as key-values pairs -}; /** * @brief Class to build `parquet_writer_options`. diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 661b36f10c8..ac965e2d416 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -23,6 +23,8 @@ #include +#include + #include #include #include @@ -125,34 +127,6 @@ struct table_metadata { std::map user_data; //!< Format-dependent metadata as key-values pairs }; -/** - * @brief Derived class of table_metadata which includes flattened nullability information of input. - * - * This information is used as an optimization for chunked writes. If the caller leaves - * column_nullable uninitialized, the writer code will assume the worst case : that all columns are - * nullable. - * - * If the column_nullable field is not empty, it is expected that it has a length equal to the - * number of columns in the flattened table being written. - * - * Flattening refers to the flattening of nested columns. For list columns, the number of values - * expected in the nullability vector is equal to the depth of the nesting. e.g. for a table of - * three columns of types: {int, list, float}, the nullability vector contains the values: - * - * |Index| Nullability of | - * |-----|----------------------------------------| - * | 0 | int column | - * | 1 | Level 0 of list column (list itself) | - * | 2 | Level 1 of list column (double values) | - * | 3 | float column | - * - * In the case where column nullability is known, pass `true` if the corresponding column could - * contain nulls in one or more subtables to be written, otherwise `false`. - */ -struct table_metadata_with_nullability : public table_metadata { - std::vector column_nullable; //!< Per-column nullability information. -}; - /** * @brief Table with table metadata used by io readers to return the metadata by value */ @@ -234,5 +208,174 @@ struct sink_info { } }; +class table_input_metadata; + +class column_in_metadata { + friend table_input_metadata; + std::string _name = ""; + thrust::optional _nullable; + bool _list_column_is_map = false; + bool _use_int96_timestamp = false; + // bool _output_as_binary = false; + thrust::optional _decimal_precision; + std::vector children; + + public: + column_in_metadata() = default; + column_in_metadata(std::string_view name) : _name{name} {} + /** + * @brief Get the children of this column metadata + * + * @return this for chaining + */ + column_in_metadata& add_child(column_in_metadata const& child) + { + children.push_back(child); + return *this; + } + + /** + * @brief Set the name of this column + * + * @return this for chaining + */ + column_in_metadata& set_name(std::string const& name) + { + _name = name; + return *this; + } + + /** + * @brief Set the nullability of this column + * + * Only valid in case of chunked writes. In single writes, this option is ignored. + * + * @return column_in_metadata& + */ + column_in_metadata& set_nullability(bool nullable) + { + _nullable = nullable; + return *this; + } + + /** + * @brief Specify that this list column should be encoded as a map in the written parquet file + * + * The column must have the structure list>. This option is invalid otherwise + * + * @return this for chaining + */ + column_in_metadata& set_list_column_as_map() + { + _list_column_is_map = true; + return *this; + } + + /** + * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 + * physical type. Only valid for the following column types: + * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns + * + * @param req True = use int96 physical type. False = use int64 physical type + * @return this for chaining + */ + column_in_metadata& set_int96_timestamps(bool req) + { + _use_int96_timestamp = req; + return *this; + } + + /** + * @brief Set the decimal precision of this column. Only valid if this column is a decimal + * (fixed-point) type + * + * @param precision The integer precision to set for this decimal column + * @return this for chaining + */ + column_in_metadata& set_decimal_precision(uint8_t precision) + { + _decimal_precision = precision; + return *this; + } + + /** + * @brief Get reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata& child(size_type i) { return children[i]; } + + /** + * @brief Get const reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata const& child(size_type i) const { return children[i]; } + + /** + * @brief Get the name of this column + */ + std::string get_name() const { return _name; } + + /** + * @brief Get whether nullability has been explicitly set for this column. + */ + bool is_nullability_defined() const { return _nullable.has_value(); } + + /** + * @brief Gets the explicitly set nullability for this column. + * @throws If nullability is not explicitly defined for this column. + * Check using `is_nullability_defined()` first. + */ + bool nullable() const { return _nullable.value(); } + + /** + * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. + */ + bool is_map() const { return _list_column_is_map; } + + /** + * @brief Get whether to encode this timestamp column using deprecated int96 physical type + */ + bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + + /** + * @brief Get whether precision has been set for this decimal column + */ + bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + + /** + * @brief Get the decimal precision that was set for this column. + * @throws If decimal precision was not set for this column. + * Check using `is_decimal_precision_set()` first. + */ + uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + + /** + * @brief Get the number of children of this column + */ + size_type num_children() const { return children.size(); } +}; + +class table_input_metadata { + public: + table_input_metadata() = default; // Required by cython + + /** + * @brief Construct a new table_input_metadata from a table_view. + * + * The constructed table_input_metadata has the same structure as the passed table_view + * + * @param table The table_view to construct metadata for + * @param user_data Optional Additional metadata to encode, as key-value pairs + */ + table_input_metadata(table_view const& table, std::map user_data = {}); + + std::vector column_metadata; + std::map user_data; //!< Format-dependent metadata as key-values pairs +}; + } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index 458587946f2..cbd09fa7b0d 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -104,6 +104,7 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type /** * @brief Indicates whether the specified bit is set to `1` * + * @param bitmask The bitmask containing the bit to clear * @param bit_index Index of the bit to test * @return true The specified bit is `1` * @return false The specified bit is `0` @@ -114,6 +115,23 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); } +/** + * @brief optional-like interface to check if a specified bit of a bitmask is set. + * + * @param bitmask The bitmask containing the bit to clear + * @param bit_index Index of the bit to test + * @param default_value Value to return if `bitmask` is nullptr + * @return true The specified bit is `1` + * @return false The specified bit is `0` + * @return `default_value` if `bitmask` is nullptr + */ +CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, + size_type bit_index, + bool default_value) +{ + return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value; +} + /** * @brief Returns a bitmask word with the `n` least significant bits set. * diff --git a/cpp/include/cudf_test/io_metadata_utilities.hpp b/cpp/include/cudf_test/io_metadata_utilities.hpp new file mode 100644 index 00000000000..6ca6eba6884 --- /dev/null +++ b/cpp/include/cudf_test/io_metadata_utilities.hpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta); + +} diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index eeafd959f87..c9b6c6e9f91 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -146,7 +146,7 @@ __global__ void __launch_bounds__(block_size, 2) if (t == 0) { s->chunk = chunks[group_id][str_col_idx]; - s->chunk.leaf_column = &orc_columns[col_idx].cudf_column; + s->chunk.leaf_column = &orc_columns[col_idx]; s->chunk.dict_data = dict_data[str_col_idx].data() + rowgroup_bounds[group_id][col_idx].begin; s->chunk.dict_index = dict_index[str_col_idx].data(); s->chunk.start_row = rowgroup_bounds[group_id][col_idx].begin; diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 77de0b0b286..405bf7c2ecc 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -615,9 +615,13 @@ class metadata { /** * @brief `column_device_view` and additional, ORC specific, information on the column. */ -struct orc_column_device_view { - column_device_view cudf_column; +struct orc_column_device_view : public column_device_view { + __device__ orc_column_device_view(column_device_view col, thrust::optional parent_idx) + : column_device_view{col}, parent_index{parent_idx} + { + } thrust::optional parent_index; + bitmask_type const* pushdown_mask = nullptr; }; /** diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 88d7e26b3b6..389895abc83 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -135,6 +135,8 @@ struct RowGroup { struct EncChunk { uint32_t start_row; // start row of this chunk uint32_t num_rows; // number of rows in this chunk + uint32_t null_mask_start_row; // adjusted to multiple of 8 + uint32_t null_mask_num_rows; // adjusted to multiple of 8 ColumnEncodingKind encoding_kind; // column encoding kind TypeKind type_kind; // column data type uint8_t dtype_len; // data type length @@ -142,7 +144,7 @@ struct EncChunk { uint32_t* dict_index; // dictionary index from row index uint32_t* decimal_offsets; - column_device_view const* leaf_column; + orc_column_device_view const* column; }; /** @@ -182,7 +184,7 @@ struct DictionaryChunk { uint32_t num_dict_strings; // number of strings in dictionary uint32_t dict_char_count; // size of dictionary string data for this chunk - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; /** @@ -197,7 +199,7 @@ struct StripeDictionary { uint32_t num_strings; // number of unique strings in the dictionary uint32_t dict_char_count; // total size of dictionary string data - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; constexpr uint32_t encode_block_size = 512; @@ -326,17 +328,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, device_2dspan enc_streams, rmm::cuda_stream_view stream); -/** - * @brief Set leaf column element of EncChunk - * - * @param[in] orc_columns Pre-order flattened device array of ORC column views - * @param[in,out] chunks encoder chunk device array [column][rowgroup] - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for compacting chunked column data prior to compression * @@ -440,6 +431,7 @@ void orc_init_statistics_buffersize(statistics_merge_group* groups, * @param[in,out] groups Statistics merge groups * @param[in,out] chunks Statistics data * @param[in] statistics_count Number of statistics buffers + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void orc_encode_statistics(uint8_t* blob_bfr, statistics_merge_group* groups, @@ -447,6 +439,19 @@ void orc_encode_statistics(uint8_t* blob_bfr, uint32_t statistics_count, rmm::cuda_stream_view stream); +/** + * @brief Number of set bits in pushdown masks, per rowgroup. + * + * @param[in] orc_columns Pre-order flattened device array of ORC column views + * @param[in] rowgroup_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param[out] set_counts Per rowgroup number of set bits + * @param[in] stream CUDA stream used for device memory operations and kernel launches + */ +void reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts, + rmm::cuda_stream_view stream); + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 9348d817dad..cc7e22f2042 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -265,7 +265,6 @@ static __device__ uint32_t ByteRLE( } } if (!t) { s->strm_pos[cid] = static_cast(dst - s->stream.data_ptrs[cid]); } - __syncthreads(); return out_cnt; } @@ -621,6 +620,100 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, static const __device__ __constant__ int32_t kTimeScale[10] = { 1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; +template +static __device__ void encode_null_mask(orcenc_state_s* s, + bitmask_type const* pushdown_mask, + Storage& scan_storage, + int t) +{ + if (s->stream.ids[CI_PRESENT] < 0) return; + + auto const column = *s->chunk.column; + while (s->present_rows < s->chunk.null_mask_num_rows or s->numvals > 0) { + // Number of rows read so far + auto present_rows = s->present_rows; + // valid_buf capacity is byte per thread in block + auto const buf_available_bits = encode_block_size * 8 - s->numvals; + // Number of rows for the block to process in this iteration + auto const nrows = min(s->chunk.null_mask_num_rows - present_rows, buf_available_bits); + // Number of rows for this thread to process in this iteration + auto const t_nrows = min(max(static_cast(nrows) - t * 8, 0), 8); + auto const row = s->chunk.null_mask_start_row + present_rows + t * 8; + + auto get_mask_byte = [&](bitmask_type const* mask, size_type offset) -> uint8_t { + if (t_nrows == 0) return 0; + if (mask == nullptr) return 0xff; + + auto const begin_offset = row + offset; + auto const end_offset = min(begin_offset + 8, offset + column.size()); + auto const mask_word = cudf::detail::get_mask_offset_word(mask, 0, begin_offset, end_offset); + return mask_word & 0xff; + }; + + uint8_t pd_byte = (1 << t_nrows) - 1; + uint32_t pd_set_cnt = t_nrows; + uint32_t offset = t_nrows != 0 ? t * 8 : nrows; + if (pushdown_mask != nullptr) { + pd_byte = get_mask_byte(pushdown_mask, 0) & ((1 << t_nrows) - 1); + pd_set_cnt = __popc(pd_byte); + // Scan the number of valid bits to get dst offset for each thread + cub::BlockScan(scan_storage).ExclusiveSum(pd_set_cnt, offset); + } + + auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); + auto dst_offset = offset + s->nnz; + auto vbuf_bit_idx = [](int row) { + // valid_buf is a circular buffer with validitiy of 8 rows in each element + return row % (encode_block_size * 8); + }; + if (dst_offset % 8 == 0 and pd_set_cnt == 8) { + s->valid_buf[vbuf_bit_idx(dst_offset) / 8] = mask_byte; + } else { + for (auto bit_idx = 0; bit_idx < t_nrows; ++bit_idx) { + // skip bits where pushdown mask is not set + if (not(pd_byte & (1 << bit_idx))) continue; + if (mask_byte & (1 << bit_idx)) { + set_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } else { + clear_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } + } + } + + __syncthreads(); + if (t == block_size - 1) { + // Number of loaded rows, available for encode + s->numvals += offset + pd_set_cnt; + // Number of loaded rows (different from present_rows because of pushdown masks) + s->nnz += offset + pd_set_cnt; + } + present_rows += nrows; + if (!t) { s->present_rows = present_rows; } + __syncthreads(); + + // RLE encode the present stream + if (s->numvals > ((present_rows < s->chunk.null_mask_num_rows) ? 130 * 8 : 0)) { + auto const flush = (present_rows < s->chunk.null_mask_num_rows) ? 0 : 7; + auto const nbytes_out = (s->numvals + flush) / 8; + auto const nrows_encoded = + ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; + + if (!t) { + // Number of rows enocoded so far + s->present_out += nrows_encoded; + s->numvals -= min(s->numvals, nrows_encoded); + } + __syncthreads(); + } + } + + // reset shared state + if (t == 0) { + s->nnz = 0; + s->numvals = 0; + } +} + /** * @brief Encode column data * @@ -635,6 +728,7 @@ __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) orcenc_state_s state_g; __shared__ union { + typename cub::BlockScan::TempStorage scan_u32; typename cub::BlockReduce::TempStorage i32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage u32; @@ -646,120 +740,74 @@ __global__ void __launch_bounds__(block_size) uint32_t group_id = blockIdx.y; int t = threadIdx.x; if (t == 0) { - s->chunk = chunks[col_id][group_id]; - s->stream = streams[col_id][group_id]; - } - if (t < CI_NUM_STREAMS) { s->strm_pos[t] = 0; } - __syncthreads(); - if (!t) { - s->cur_row = 0; - s->present_rows = 0; - s->present_out = 0; - s->numvals = 0; - s->numlengths = 0; - s->nnz = 0; + s->chunk = chunks[col_id][group_id]; + s->stream = streams[col_id][group_id]; + s->cur_row = 0; + s->present_rows = 0; + s->present_out = 0; + s->numvals = 0; + s->numlengths = 0; + s->nnz = 0; + s->strm_pos[CI_DATA] = 0; + s->strm_pos[CI_PRESENT] = 0; + s->strm_pos[CI_INDEX] = 0; // Dictionary data is encoded in a separate kernel - if (s->chunk.encoding_kind == DICTIONARY_V2) { - s->strm_pos[CI_DATA2] = s->stream.lengths[CI_DATA2]; - s->strm_pos[CI_DICTIONARY] = s->stream.lengths[CI_DICTIONARY]; - } + s->strm_pos[CI_DATA2] = + s->chunk.encoding_kind == DICTIONARY_V2 ? s->stream.lengths[CI_DATA2] : 0; + s->strm_pos[CI_DICTIONARY] = + s->chunk.encoding_kind == DICTIONARY_V2 ? s->stream.lengths[CI_DICTIONARY] : 0; } + __syncthreads(); - auto validity_byte = [&] __device__(int row) -> uint8_t& { - // valid_buf is a circular buffer where validitiy of 8 rows is stored in each element - return s->valid_buf[(row / 8) % encode_block_size]; - }; - - auto validity = [&] __device__(int row) -> uint32_t { - // Check if the specific bit is set in the validity buffer - return (validity_byte(row) >> (row % 8)) & 1; - }; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = s->chunk.column->parent_index; + if (!parent_index.has_value()) return nullptr; + return chunks[parent_index.value()][0].column->pushdown_mask; + }(); + encode_null_mask(s, pushdown_mask, temp_storage.scan_u32, t); __syncthreads(); + + auto const column = *s->chunk.column; while (s->cur_row < s->chunk.num_rows || s->numvals + s->numlengths != 0) { - // Encode valid map - if (s->present_rows < s->chunk.num_rows) { - uint32_t present_rows = s->present_rows; - uint32_t nrows = - min(s->chunk.num_rows - present_rows, - encode_block_size * 8 - (present_rows - (min(s->cur_row, s->present_out) & ~7))); - uint32_t nrows_out; - if (t * 8 < nrows) { - auto const row_in_group = present_rows + t * 8; - auto const row = s->chunk.start_row + row_in_group; - uint8_t valid = 0; - if (row < s->chunk.leaf_column->size()) { - if (s->chunk.leaf_column->nullable()) { - auto const current_valid_offset = row + s->chunk.leaf_column->offset(); - auto const last_offset = - min(current_valid_offset + 8, - s->chunk.leaf_column->offset() + s->chunk.leaf_column->size()); - auto const mask = cudf::detail::get_mask_offset_word( - s->chunk.leaf_column->null_mask(), 0, current_valid_offset, last_offset); - valid = 0xff & mask; - } else { - valid = 0xff; - } - if (row + 7 > s->chunk.leaf_column->size()) { - valid = valid & ((1 << (s->chunk.leaf_column->size() - row)) - 1); - } - } - validity_byte(row_in_group) = valid; - } - __syncthreads(); - present_rows += nrows; - if (!t) { s->present_rows = present_rows; } - // RLE encode the present stream - nrows_out = present_rows - s->present_out; // Should always be a multiple of 8 except at - // the end of the last row group - if (nrows_out > ((present_rows < s->chunk.num_rows) ? 130 * 8 : 0)) { - uint32_t present_out = s->present_out; - if (s->stream.ids[CI_PRESENT] >= 0) { - uint32_t flush = (present_rows < s->chunk.num_rows) ? 0 : 7; - nrows_out = (nrows_out + flush) >> 3; - nrows_out = - ByteRLE(s, s->valid_buf, present_out >> 3, nrows_out, flush, t) * 8; - } - __syncthreads(); - if (!t) { s->present_out = min(present_out + nrows_out, present_rows); } - } - __syncthreads(); - } // Fetch non-null values if (s->chunk.type_kind != LIST && !s->stream.data_ptrs[CI_DATA]) { // Pass-through __syncthreads(); if (!t) { - s->cur_row = s->present_rows; - s->strm_pos[CI_DATA] = s->cur_row * s->chunk.dtype_len; + s->cur_row = s->chunk.num_rows; + s->strm_pos[CI_DATA] = s->chunk.num_rows * s->chunk.dtype_len; } - __syncthreads(); - } else if (s->cur_row < s->present_rows) { + } else if (s->cur_row < s->chunk.num_rows) { uint32_t maxnumvals = (s->chunk.type_kind == BOOLEAN) ? 2048 : 1024; uint32_t nrows = - min(min(s->present_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), + min(min(s->chunk.num_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), encode_block_size); - auto const row_in_group = s->cur_row + t; - uint32_t const valid = (t < nrows) ? validity(row_in_group) : 0; - s->buf.u32[t] = valid; + auto const row = s->chunk.start_row + s->cur_row + t; + + auto const is_value_valid = [&]() { + if (t >= nrows) return false; + return bit_value_or(pushdown_mask, column.offset() + row, true) and + bit_value_or(column.null_mask(), column.offset() + row, true); + }(); + s->buf.u32[t] = is_value_valid ? 1u : 0u; // TODO: Could use a faster reduction relying on _popc() for the initial phase lengths_to_positions(s->buf.u32, encode_block_size, t); __syncthreads(); - auto const row = s->chunk.start_row + row_in_group; - if (valid) { + if (is_value_valid) { int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); switch (s->chunk.type_kind) { case INT: case DATE: - case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case FLOAT: s->vals.u32[nz_idx] = column.element(row); break; case DOUBLE: - case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element(row); break; - case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case LONG: s->vals.u64[nz_idx] = column.element(row); break; + case SHORT: s->vals.u32[nz_idx] = column.element(row); break; case BOOLEAN: - case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element(row); break; + case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { - int64_t ts = s->chunk.leaf_column->element(row); + int64_t ts = column.element(row); int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); @@ -796,7 +844,7 @@ __global__ void __launch_bounds__(block_size) } s->vals.u32[nz_idx] = dict_idx; } else { - string_view value = s->chunk.leaf_column->element(row); + string_view value = column.element(row); s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data(); s->lengths.u32[nz_idx] = value.size_bytes(); } @@ -805,11 +853,10 @@ __global__ void __launch_bounds__(block_size) // Note: can be written in a faster manner, given that all values are equal case DECIMAL: s->lengths.u32[nz_idx] = zigzag(s->chunk.scale); break; case LIST: { - auto const& offsets = - s->chunk.leaf_column->child(lists_column_view::offsets_column_index); + auto const& offsets = column.child(lists_column_view::offsets_column_index); // Compute list length from the offsets - s->lengths.u32[nz_idx] = - offsets.element(row + 1) - offsets.element(row); + s->lengths.u32[nz_idx] = offsets.element(row + 1 + column.offset()) - + offsets.element(row + column.offset()); } break; default: break; } @@ -897,10 +944,10 @@ __global__ void __launch_bounds__(block_size) } break; case DECIMAL: { - if (valid) { - uint64_t const zz_val = (s->chunk.leaf_column->type().id() == type_id::DECIMAL32) - ? zigzag(s->chunk.leaf_column->element(row)) - : zigzag(s->chunk.leaf_column->element(row)); + if (is_value_valid) { + uint64_t const zz_val = (column.type().id() == type_id::DECIMAL32) + ? zigzag(column.element(row)) + : zigzag(column.element(row)); auto const offset = (row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1]; StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val); @@ -942,8 +989,8 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.leaf_column->head())) + - (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; + static_cast(const_cast(column.head())) + + (column.offset() + s->chunk.start_row) * s->chunk.dtype_len; } } } @@ -1033,16 +1080,6 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; } } -__global__ void __launch_bounds__(512) - gpu_set_chunk_columns(device_span orc_columns, - device_2dspan chunks) -{ - // Set leaf_column member of EncChunk - for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) { - chunks[blockIdx.x][i].leaf_column = &orc_columns[blockIdx.x].cudf_column; - } -} - /** * @brief Merge chunked column data into a single contiguous stream * @@ -1255,16 +1292,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, <<>>(stripes, chunks, enc_streams); } -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream) -{ - dim3 dim_block(512, 1); - dim3 dim_grid(chunks.size().first, 1); - - gpu_set_chunk_columns<<>>(orc_columns, chunks); -} - void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index d6dbdbe6403..be561530459 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -19,6 +19,7 @@ #include +#include #include namespace cudf { @@ -473,6 +474,45 @@ extern "C" __global__ void __launch_bounds__(128, 8) } } +template +__global__ void __launch_bounds__(block_size) + gpu_reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + auto const column_id = blockIdx.x; + auto const rowgroup_id = blockIdx.y; + auto const column = orc_columns[column_id]; + auto const t = threadIdx.x; + + auto const use_child_rg = column.type().id() == type_id::LIST; + auto const rg = rowgroup_bounds[rowgroup_id][column_id + (use_child_rg ? 1 : 0)]; + + if (column.pushdown_mask == nullptr) { + // All elements are valid if the null mask is not present + if (t == 0) { set_counts[rowgroup_id][column_id] = rg.size(); } + return; + }; + + size_type count = 0; + static constexpr size_type bits_per_word = sizeof(bitmask_type) * 8; + for (auto row = t * bits_per_word + rg.begin; row < rg.end; row += block_size * bits_per_word) { + auto const begin_bit = row; + auto const end_bit = min(static_cast(row + bits_per_word), rg.end); + auto const mask_len = end_bit - begin_bit; + auto const mask_word = + cudf::detail::get_mask_offset_word(column.pushdown_mask, 0, row, end_bit) & + ((1 << mask_len) - 1); + count += __popc(mask_word); + } + + count = BlockReduce(temp_storage).Sum(count); + if (t == 0) { set_counts[rowgroup_id][column_id] = count; } +} + void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, @@ -495,19 +535,6 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, num_streams); } -/** - * @brief Launches kernel for constructing rowgroup from index streams - * - * @param[out] row_groups RowGroup device array [rowgroup][column] - * @param[in] strm_info List of compressed streams (or NULL if uncompressed) - * @param[in] chunks ColumnDesc device array [stripe][column] - * @param[in] num_columns Number of columns - * @param[in] num_stripes Number of stripes - * @param[in] num_rowgroups Number of row groups - * @param[in] rowidx_stride Row index stride - * @param[in] use_base_stride Whether to use base stride obtained from meta or the computed value - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ void __host__ ParseRowGroupIndex(RowGroup* row_groups, CompressedStreamInfo* strm_info, ColumnDesc* chunks, @@ -530,6 +557,17 @@ void __host__ ParseRowGroupIndex(RowGroup* row_groups, use_base_stride); } +void __host__ reduce_pushdown_masks(device_span columns, + device_2dspan rowgroups, + device_2dspan valid_counts, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(128, 1); + dim3 dim_grid(columns.size(), rowgroups.size().first); // 1 rowgroup per block + gpu_reduce_pushdown_masks<128> + <<>>(columns, rowgroups, valid_counts); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 8a0112deb76..299c8fbb730 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -99,6 +99,7 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL; case cudf::type_id::LIST: return TypeKind::LIST; + case cudf::type_id::STRUCT: return TypeKind::STRUCT; default: return TypeKind::INVALID_TYPE_KIND; } } @@ -142,30 +143,30 @@ class orc_column_view { */ explicit orc_column_view(uint32_t index, int str_idx, - int index_in_table, + orc_column_view* parent, column_view const& col, - const table_metadata* metadata) + column_in_metadata const& metadata) : cudf_column{col}, _index{index}, _str_idx{str_idx}, - _is_child{index_in_table < 0}, + _is_child{parent != nullptr}, _type_width{cudf::is_fixed_width(col.type()) ? cudf::size_of(col.type()) : 0}, _scale{(to_orc_type(col.type().id()) == TypeKind::DECIMAL) ? -col.type().scale() : to_clockscale(col.type().id())}, - _precision{orc_precision(col.type().id())}, - _type_kind{to_orc_type(col.type().id())} + _precision{metadata.is_decimal_precision_set() ? metadata.get_decimal_precision() + : orc_precision(col.type().id())}, + _type_kind{to_orc_type(col.type().id())}, + name{metadata.get_name()} { - // Don't assign names to child columns - if (index_in_table >= 0) { - if (metadata != nullptr && index_in_table < static_cast(metadata->column_names.size())) { - _name = metadata->column_names[index_in_table]; - } else { - // Generating default name if name isn't present in metadata - _name = "_col" + std::to_string(index_in_table); - } + if (metadata.is_nullability_defined()) { nullable_from_metadata = metadata.nullable(); } + if (parent != nullptr) { + parent->add_child(_index); + _parent_index = parent->index(); } } + void add_child(uint32_t child_idx) { children.emplace_back(child_idx); } + auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } auto dict_stride() const noexcept { return _dict_stride; } @@ -206,15 +207,22 @@ class orc_column_view { auto device_stripe_dict() const noexcept { return d_stripe_dict; } // Index in the table - auto index() const noexcept { return _index; } + uint32_t index() const noexcept { return _index; } // Id in the ORC file auto id() const noexcept { return _index + 1; } + auto is_child() const noexcept { return _is_child; } + auto parent_index() const noexcept { return _parent_index.value(); } + auto child_begin() const noexcept { return children.cbegin(); } + auto child_end() const noexcept { return children.cend(); } + auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } + auto null_count() const noexcept { return cudf_column.null_count(); } auto null_mask() const noexcept { return cudf_column.null_mask(); } bool nullable() const noexcept { return null_mask() != nullptr; } + auto user_defined_nullable() const noexcept { return nullable_from_metadata; } auto scale() const noexcept { return _scale; } auto precision() const noexcept { return _precision; } @@ -222,7 +230,7 @@ class orc_column_view { void set_orc_encoding(ColumnEncodingKind e) noexcept { _encoding_kind = e; } auto orc_kind() const noexcept { return _type_kind; } auto orc_encoding() const noexcept { return _encoding_kind; } - auto orc_name() const noexcept { return _name; } + std::string_view orc_name() const noexcept { return name; } private: column_view cudf_column; @@ -238,9 +246,9 @@ class orc_column_view { int32_t _precision = 0; // ORC-related members - std::string _name{}; - TypeKind _type_kind; - ColumnEncodingKind _encoding_kind; + TypeKind _type_kind = INVALID_TYPE_KIND; + ColumnEncodingKind _encoding_kind = INVALID_ENCODING_KIND; + std::string name; // String dictionary-related members size_t _dict_stride = 0; @@ -252,6 +260,10 @@ class orc_column_view { // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. uint32_t* d_decimal_offsets = nullptr; + + std::optional nullable_from_metadata; + std::vector children; + std::optional _parent_index; }; size_type orc_table_view::num_rows() const noexcept @@ -476,11 +488,13 @@ orc_streams writer::impl::create_streams(host_span columns, if (single_write_mode) { return column.nullable(); } else { - if (user_metadata_with_nullability.column_nullable.empty()) return true; - CUDF_EXPECTS(user_metadata_with_nullability.column_nullable.size() > column.index(), - "When passing values in user_metadata_with_nullability, data for all columns " - "must be specified"); - return user_metadata_with_nullability.column_nullable[column.index()]; + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + auto const chunked_nullable = column.user_defined_nullable().value_or(true); + CUDF_EXPECTS(chunked_nullable or !column.nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. " + "Metadata for nullable input column cannot prescribe nullability = false"); + return chunked_nullable; } }(); @@ -594,6 +608,9 @@ orc_streams writer::impl::create_streams(host_span columns, add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; + case TypeKind::STRUCT: + // Only has the present stream + break; default: CUDF_FAIL("Unsupported ORC type kind"); } } @@ -641,16 +658,161 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; } +std::vector> calculate_aligned_rowgroup_bounds( + orc_table_view const& orc_table, + file_segmentation const& segmentation, + rmm::cuda_stream_view stream) +{ + if (segmentation.num_rowgroups() == 0) return {}; + + auto d_pd_set_counts_data = rmm::device_uvector( + orc_table.num_columns() * segmentation.num_rowgroups(), stream); + auto const d_pd_set_counts = device_2dspan{ + d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + + auto aligned_rgs = hostdevice_2dvector( + segmentation.num_rowgroups(), orc_table.num_columns(), stream); + CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(), + segmentation.rowgroups.base_device_ptr(), + aligned_rgs.count() * sizeof(rowgroup_rows), + cudaMemcpyDefault, + stream.value())); + auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream); + + // One thread per column, per stripe + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + orc_table.num_columns() * segmentation.num_stripes(), + [columns = device_span{orc_table.d_columns}, + stripes = device_span{d_stripes}, + d_pd_set_counts, + out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + uint32_t const col_idx = idx / stripes.size(); + // No alignment needed for root columns + if (not columns[col_idx].parent_index.has_value()) return; + + auto const stripe_idx = idx % stripes.size(); + auto const stripe = stripes[stripe_idx]; + auto const parent_col_idx = columns[col_idx].parent_index.value(); + auto const parent_column = columns[parent_col_idx]; + auto const stripe_end = stripe.first + stripe.size; + + auto seek_last_borrow_rg = [&](auto rg_idx, size_type& bits_to_borrow) { + auto curr = rg_idx + 1; + auto curr_rg_size = [&]() { + return parent_column.pushdown_mask != nullptr ? d_pd_set_counts[curr][parent_col_idx] + : out_rowgroups[curr][col_idx].size(); + }; + while (curr < stripe_end and curr_rg_size() <= bits_to_borrow) { + // All bits from rowgroup borrowed, make the rowgroup empty + out_rowgroups[curr][col_idx].begin = out_rowgroups[curr][col_idx].end; + bits_to_borrow -= curr_rg_size(); + ++curr; + } + return curr; + }; + + int previously_borrowed = 0; + for (auto rg_idx = stripe.first; rg_idx + 1 < stripe_end; ++rg_idx) { + auto& rg = out_rowgroups[rg_idx][col_idx]; + + if (parent_column.pushdown_mask == nullptr) { + // No pushdown mask, all null mask bits will be encoded + // Align on rowgroup size (can be misaligned for list children) + if (rg.size() % 8) { + auto bits_to_borrow = 8 - rg.size() % 8; + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + last_borrow_rg.begin += bits_to_borrow; + rg.end = last_borrow_rg.begin; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } else { + // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded + // Use the number of set bits in pushdown mask as size + auto bits_to_borrow = + 8 - (d_pd_set_counts[rg_idx][parent_col_idx] - previously_borrowed) % 8; + if (bits_to_borrow == 0) { + // Didn't borrow any bits for this rowgroup + previously_borrowed = 0; + continue; + } + + // Find rowgroup in which we finish the search for missing bits + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + // First row that does not need to be borrowed + auto borrow_end = last_borrow_rg.begin; + + // Adjust the number of bits to borrow in the next iteration + previously_borrowed = bits_to_borrow; + + // Find word in which we finish the search for missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + auto const mask = cudf::detail::get_mask_offset_word( + parent_column.pushdown_mask, 0, borrow_end, borrow_end + 32); + auto const valid_in_word = __popc(mask); + + if (valid_in_word > bits_to_borrow) break; + bits_to_borrow -= valid_in_word; + borrow_end += 32; + } + + // Find the last of the missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + if (bit_is_set(parent_column.pushdown_mask, borrow_end)) { --bits_to_borrow; }; + ++borrow_end; + } + + last_borrow_rg.begin = borrow_end; + rg.end = borrow_end; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } + }); + + aligned_rgs.device_to_host(stream, true); + + std::vector> h_aligned_rgs; + h_aligned_rgs.reserve(segmentation.num_rowgroups()); + std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(segmentation.num_rowgroups()), + std::back_inserter(h_aligned_rgs), + [&](auto idx) -> std::vector { + return {aligned_rgs[idx].begin(), aligned_rgs[idx].end()}; + }); + + return h_aligned_rgs; +} + struct segmented_valid_cnt_input { bitmask_type const* mask; std::vector indices; }; -encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams) +encoded_data encode_columns(orc_table_view const& orc_table, + string_dictionaries&& dictionaries, + encoder_decimal_info&& dec_chunk_sizes, + file_segmentation const& segmentation, + orc_streams const& streams, + rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); @@ -658,19 +820,22 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, streams.compute_offsets(orc_table.columns, segmentation.num_rowgroups()); rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); + auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); + // Initialize column chunks' descriptions std::map validity_check_inputs; for (auto const& column : orc_table.columns) { for (auto const& stripe : segmentation.stripes) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { - auto const rg_idx = *rg_idx_it; - auto& ck = chunks[column.index()][rg_idx]; - - ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; - ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); - ck.encoding_kind = column.orc_encoding(); - ck.type_kind = column.orc_kind(); + auto const rg_idx = *rg_idx_it; + auto& ck = chunks[column.index()][rg_idx]; + ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; + ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); + ck.null_mask_start_row = aligned_rowgroups[rg_idx][column.index()].begin; + ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); + ck.encoding_kind = column.orc_encoding(); + ck.type_kind = column.orc_kind(); if (ck.type_kind == TypeKind::STRING) { ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) ? column.host_stripe_dict(stripe.id)->dict_index @@ -684,6 +849,19 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } + chunks.host_to_device(stream); + // TODO (future): pass columns separately from chunks (to skip this step) + // and remove info from chunks that is common for the entire column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + chunks.count(), + [chunks = device_2dspan{chunks}, + cols = device_span{orc_table.d_columns}] __device__(auto& idx) { + auto const col_idx = idx / chunks.size().second; + auto const rg_idx = idx % chunks.size().second; + chunks[col_idx][rg_idx].column = &cols[col_idx]; + }); auto validity_check_indices = [&](size_t col_idx) { std::vector indices; @@ -789,12 +967,8 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } - - chunks.host_to_device(stream); chunk_streams.host_to_device(stream); - gpu::set_chunk_columns(orc_table.d_columns, chunks, stream); - if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); gpu::EncodeStripeDictionaries(d_stripe_dict, @@ -856,11 +1030,10 @@ void set_stat_desc_leaf_cols(device_span columns, device_span stat_desc, rmm::cuda_stream_view stream) { - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(stat_desc.size()), - [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx].cudf_column; }); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(stat_desc.size()), + [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx]; }); } std::vector> writer::impl::gather_statistic_blobs( @@ -1101,14 +1274,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - user_metadata(options.get_metadata()), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } init_state(); } @@ -1117,18 +1292,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { - if (options.get_metadata() != nullptr) { - user_metadata_with_nullability = *options.get_metadata(); - user_metadata = &user_metadata_with_nullability; + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } @@ -1140,6 +1313,113 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } +void pushdown_lists_null_mask(orc_column_view const& col, + device_span d_columns, + bitmask_type const* parent_pd_mask, + device_span out_mask, + rmm::cuda_stream_view stream) +{ + // Set all bits - correct unless there's a mismatch between offsets and null mask + CUDA_TRY(cudaMemsetAsync(static_cast(out_mask.data()), + 255, + out_mask.size() * sizeof(bitmask_type), + stream.value())); + + // Reset bits where a null list element has rows in the child column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0u), + col.size(), + [d_columns, col_idx = col.index(), parent_pd_mask, out_mask] __device__(auto& idx) { + auto const d_col = d_columns[col_idx]; + auto const is_row_valid = d_col.is_valid(idx) and bit_value_or(parent_pd_mask, idx, true); + if (not is_row_valid) { + auto offsets = d_col.child(lists_column_view::offsets_column_index); + auto const child_rows_begin = offsets.element(idx + d_col.offset()); + auto const child_rows_end = offsets.element(idx + 1 + d_col.offset()); + for (auto child_row = child_rows_begin; child_row < child_rows_end; ++child_row) + clear_bit(out_mask.data(), child_row); + } + }); +} + +/** + * @brief All pushdown masks in a table. + * + * Pushdown masks are applied to child column(s). Only bits of the child column null mask that + * correspond to set pushdown mask bits are encoded into the output file. Similarly, rows where + * pushdown mask is 0 are treated as invalid and not included in the output. + */ +struct pushdown_null_masks { + // Owning vector for masks in device memory + std::vector> data; + // Pointers to pushdown masks in device memory. Can be same for multiple columns. + std::vector masks; +}; + +pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, + rmm::cuda_stream_view stream) +{ + std::vector mask_ptrs; + mask_ptrs.reserve(orc_table.num_columns()); + std::vector> pd_masks; + for (auto const& col : orc_table.columns) { + // Leaf columns don't need pushdown masks + if (col.orc_kind() != LIST && col.orc_kind() != STRUCT) { + mask_ptrs.emplace_back(nullptr); + continue; + } + auto const parent_pd_mask = col.is_child() ? mask_ptrs[col.parent_index()] : nullptr; + auto const null_mask = col.null_mask(); + + if (null_mask == nullptr and parent_pd_mask == nullptr) { + mask_ptrs.emplace_back(nullptr); + continue; + } + if (col.orc_kind() == STRUCT) { + if (null_mask != nullptr and parent_pd_mask == nullptr) { + // Reuse own null mask + mask_ptrs.emplace_back(null_mask); + } else if (null_mask == nullptr and parent_pd_mask != nullptr) { + // Reuse parent's pushdown mask + mask_ptrs.emplace_back(parent_pd_mask); + } else { + // Both are nullable, allocate new pushdown mask + pd_masks.emplace_back(num_bitmask_words(col.size()), stream); + mask_ptrs.emplace_back(pd_masks.back().data()); + + thrust::transform(rmm::exec_policy(stream), + null_mask, + null_mask + pd_masks.back().size(), + parent_pd_mask, + pd_masks.back().data(), + thrust::bit_and()); + } + } + if (col.orc_kind() == LIST) { + // Need a new pushdown mask unless both the parent and current colmn are not nullable + auto const child_col = orc_table.column(col.child_begin()[0]); + // pushdown mask applies to child column; use the child column size + pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); + mask_ptrs.emplace_back(pd_masks.back().data()); + pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); + } + } + + // Attach null masks to device column views (async) + auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(mask_ptrs, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + orc_table.num_columns(), + [cols = device_span{orc_table.d_columns}, + ptrs = device_span{d_mask_ptrs}] __device__(auto& idx) { + cols[idx].pushdown_mask = ptrs[idx]; + }); + + return {std::move(pd_masks), std::move(mask_ptrs)}; +} + template struct device_stack { __device__ device_stack(T* stack_storage, int capacity) @@ -1166,28 +1446,35 @@ struct device_stack { orc_table_view make_orc_table_view(table_view const& table, table_device_view const& d_table, - table_metadata const* user_metadata, + table_input_metadata const& table_meta, rmm::cuda_stream_view stream) { std::vector orc_columns; std::vector str_col_indexes; - std::function append_orc_column = [&](column_view const& col, - int index_in_table) { - int const str_idx = - (col.type().id() == type_id::STRING) ? static_cast(str_col_indexes.size()) : -1; - auto const& new_col = - orc_columns.emplace_back(orc_columns.size(), str_idx, index_in_table, col, user_metadata); - if (new_col.is_string()) { str_col_indexes.push_back(new_col.index()); } - if (col.type().id() == type_id::LIST) - append_orc_column(col.child(lists_column_view::child_column_index), -1); - if (col.type().id() == type_id::STRUCT) - for (auto child = col.child_begin(); child != col.child_end(); ++child) - append_orc_column(*child, -1); - }; + std::function + append_orc_column = + [&](column_view const& col, orc_column_view* parent_col, column_in_metadata const& col_meta) { + int const str_idx = + (col.type().id() == type_id::STRING) ? static_cast(str_col_indexes.size()) : -1; + + auto const new_col_idx = orc_columns.size(); + orc_columns.emplace_back(new_col_idx, str_idx, parent_col, col, col_meta); + if (orc_columns[new_col_idx].is_string()) { str_col_indexes.push_back(new_col_idx); } + + if (col.type().id() == type_id::LIST) { + append_orc_column(col.child(lists_column_view::child_column_index), + &orc_columns[new_col_idx], + col_meta.child(lists_column_view::child_column_index)); + } else if (col.type().id() == type_id::STRUCT) { + for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) + append_orc_column( + col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx)); + } + }; for (auto col_idx = 0; col_idx < table.num_columns(); ++col_idx) { - append_orc_column(table.column(col_idx), col_idx); + append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } rmm::device_uvector d_orc_columns(orc_columns.size(), stream); @@ -1256,19 +1543,24 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons // Root column if (!col.parent_index.has_value()) { size_type const rows_begin = rg_idx * rowgroup_size; - auto const rows_end = - thrust::min((rg_idx + 1) * rowgroup_size, col.cudf_column.size()); + auto const rows_end = thrust::min((rg_idx + 1) * rowgroup_size, col.size()); return rowgroup_rows{rows_begin, rows_end}; } else { // Child column - auto const parent_index = *col.parent_index; - column_device_view parent_col = cols[parent_index].cudf_column; - if (parent_col.type().id() != type_id::LIST) return rg_bounds[rg_idx][parent_index]; - - auto parent_offsets = parent_col.child(lists_column_view::offsets_column_index); - auto const& parent_rowgroup_rows = rg_bounds[rg_idx][parent_index]; - auto const rows_begin = parent_offsets.element(parent_rowgroup_rows.begin); - auto const rows_end = parent_offsets.element(parent_rowgroup_rows.end); + auto const parent_index = *col.parent_index; + orc_column_device_view parent_col = cols[parent_index]; + auto const parent_rg = rg_bounds[rg_idx][parent_index]; + if (parent_col.type().id() != type_id::LIST) { + auto const offset_diff = parent_col.offset() - col.offset(); + return rowgroup_rows{parent_rg.begin + offset_diff, parent_rg.end + offset_diff}; + } + + auto offsets = parent_col.child(lists_column_view::offsets_column_index); + auto const rows_begin = + offsets.element(parent_rg.begin + parent_col.offset()) - col.offset(); + auto const rows_end = + offsets.element(parent_rg.end + parent_col.offset()) - col.offset(); + return rowgroup_rows{rows_begin, rows_end}; } }); @@ -1295,8 +1587,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, col_idx = orc_col.index()] __device__(auto idx) { - auto const& col = d_cols[col_idx].cudf_column; - if (col.is_null(idx)) return 0u; + auto const& col = d_cols[col_idx]; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = d_cols[col_idx].parent_index; + if (!parent_index.has_value()) return nullptr; + return d_cols[parent_index.value()].pushdown_mask; + }(); + if (col.is_null(idx) or not bit_value_or(pushdown_mask, idx, true)) + return 0u; int64_t const element = (col.type().id() == type_id::DECIMAL32) ? col.element(idx) : col.element(idx); @@ -1418,9 +1716,25 @@ void writer::impl::write(table_view const& table) CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); auto const num_rows = table.num_rows(); + if (not table_meta) { table_meta = std::make_unique(table); } + + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "." + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } + auto const d_table = table_device_view::create(table, stream); - auto orc_table = make_orc_table_view(table, *d_table, user_metadata, stream); + auto orc_table = make_orc_table_view(table, *d_table, *table_meta, stream); + + auto const pd_masks = init_pushdown_null_masks(orc_table, stream); auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride_, stream); @@ -1458,7 +1772,7 @@ void writer::impl::write(table_view const& table) auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns( - orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams); + orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams, stream); // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); @@ -1646,6 +1960,18 @@ void writer::impl::write(table_view const& table) } // In preorder traversal the column after a list column is always the child column if (column.orc_kind() == LIST) { schema_type.subtypes.emplace_back(column.id() + 1); } + if (column.orc_kind() == STRUCT) { + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.subtypes), + [&](auto const& child_idx) { return orc_table.column(child_idx).id(); }); + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.fieldNames), + [&](auto const& child_idx) { + return std::string{orc_table.column(child_idx).orc_name()}; + }); + } } } else { // verify the user isn't passing mismatched tables @@ -1671,11 +1997,13 @@ void writer::impl::close() PostScript ps; ff.contentLength = out_sink_->bytes_written(); - if (user_metadata) { - for (auto it = user_metadata->user_data.begin(); it != user_metadata->user_data.end(); it++) { - ff.metadata.push_back({it->first, it->second}); - } - } + std::transform(table_meta->user_data.begin(), + table_meta->user_data.end(), + std::back_inserter(ff.metadata), + [&](auto const& udata) { + return UserMetadataItem{udata.first, udata.second}; + }); + // Write statistics metadata if (md.stripeStats.size() != 0) { buffer_.resize((compression_kind_ != NONE) ? 3 : 0); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 787bdeb3a4e..a8fe22a360f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -262,23 +262,6 @@ class writer::impl { file_segmentation const& segmentation, std::map const& decimal_column_sizes); - /** - * @brief Encodes the input columns into streams. - * - * @param orc_table Non-owning view of a cuDF table w/ ORC-related info - * @param dict_data Dictionary data memory - * @param dict_index Dictionary index memory - * @param dec_chunk_sizes Information about size of encoded decimal columns - * @param segmentation stripe and rowgroup ranges - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Encoded data and per-chunk stream descriptors - */ - encoded_data encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams); - /** * @brief Returns stripe information after compacting columns' individual data * chunks into contiguous data streams. @@ -375,14 +358,11 @@ class writer::impl { cudf::io::orc::Metadata md; // current write position for rowgroups/chunks size_t current_chunk_offset; - // optional user metadata - table_metadata const* user_metadata = nullptr; - // only used in the write_chunked() case. copied from the (optionally) user supplied - // argument to write_chunked_begin() - table_metadata_with_nullability user_metadata_with_nullability; // special parameter only used by detail::write() to indicate that we are guaranteeing // a single table write. this enables some internal optimizations. bool const single_write_mode; + // optional user metadata + std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp new file mode 100644 index 00000000000..39617c99690 --- /dev/null +++ b/cpp/tests/io/metadata_utilities.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta) +{ + std::function compare_names = + [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { + if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } + ASSERT_EQ(out_col.children.size(), in_col.num_children()); + for (size_t i = 0; i < out_col.children.size(); ++i) { + compare_names(out_col.children[i], in_col.child(i)); + } + }; + + ASSERT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); + + for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { + compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); + } +} + +} // namespace cudf::test diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index fbeba925f1b..cdf0a3b275b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -161,14 +162,10 @@ struct SkipRowTest { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper input_col( sequence, sequence + file_num_rows, validity); - - std::vector> input_cols; - input_cols.push_back(input_col.release()); - auto input_table = std::make_unique(std::move(input_cols)); - EXPECT_EQ(1, input_table->num_columns()); + table_view input_table({input_col}); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table); cudf_io::write_orc(out_opts); auto begin_sequence = sequence, end_sequence = sequence; @@ -180,9 +177,7 @@ struct SkipRowTest { begin_sequence, end_sequence, validity); std::vector> output_cols; output_cols.push_back(output_col.release()); - auto expected = std::make_unique
(std::move(output_cols)); - EXPECT_EQ(1, expected->num_columns()); - return expected; + return std::make_unique
(std::move(output_cols)); } void test(int skip_rows, int file_num_rows, int read_num_rows) @@ -224,22 +219,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) @@ -250,22 +241,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) @@ -277,15 +264,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestamps.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -294,7 +277,7 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) @@ -307,15 +290,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestampsWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -324,12 +303,12 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, MultiColumn) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -351,29 +330,29 @@ TEST_F(OrcWriterTest, MultiColumn) column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; column_wrapper col6{col6_data, col6_data + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; + + auto child_col = + cudf::test::fixed_width_column_wrapper{48, 27, 25, 31, 351, 351, 29, 15, -1, -99}; + auto col8 = cudf::test::structs_column_wrapper{child_col}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -381,13 +360,13 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, MultiColumnWithNulls) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -402,14 +381,14 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 2); }); auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 4 && i <= 6); }); auto col5_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 8); }); auto col6_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 3); }); @@ -420,30 +399,28 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, + col0_mask}; + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15, -1, -99}, {1, 0, 1, 1, 0, 1, 1, 1, 0, 1}}; + auto col8 = cudf::test::structs_column_wrapper{{ages_col}, {0, 1, 1, 0, 1, 1, 0, 1, 1, 0}}; + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -451,8 +428,8 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, ReadZeroRows) @@ -463,15 +440,11 @@ TEST_F(OrcWriterTest, ReadZeroRows) constexpr auto num_rows = 10; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -498,21 +471,16 @@ TEST_F(OrcWriterTest, Strings) column_wrapper col1{strings.begin(), strings.end()}; column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); + table_view expected({col0, col1, col2}); - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(3, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -520,8 +488,8 @@ TEST_F(OrcWriterTest, Strings) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, SlicedTable) @@ -545,21 +513,24 @@ TEST_F(OrcWriterTest, SlicedTable) column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; column_wrapper col3{seq_col3, seq_col3 + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); - expected_metadata.column_names.emplace_back("col_decimal"); + using lcw = cudf::test::lists_column_wrapper; + lcw col4{{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(4, expected->num_columns()); + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}}; + auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}}; - auto expected_slice = cudf::slice(expected->view(), {2, static_cast(num_rows)}); + table_view expected({col0, col1, col2, col3, col4, col5}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + expected_metadata.column_metadata[3].set_name("col_decimal"); + expected_metadata.column_metadata[4].set_name("lists"); + expected_metadata.column_metadata[5].set_name("structs"); + + auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows)}); auto filepath = temp_env->get_temp_filepath("SlicedTable.orc"); cudf_io::orc_writer_options out_opts = @@ -572,7 +543,7 @@ TEST_F(OrcWriterTest, SlicedTable) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, HostBuffer) @@ -583,17 +554,14 @@ TEST_F(OrcWriterTest, HostBuffer) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper col{seq_col.begin(), seq_col.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); + table_view expected{{col}}; - std::vector> cols; - cols.push_back(col.release()); - const auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); std::vector out_buffer; cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -602,8 +570,8 @@ TEST_F(OrcWriterTest, HostBuffer) .use_index(false); const auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, negTimestampsNano) @@ -618,15 +586,11 @@ TEST_F(OrcWriterTest, negTimestampsNano) -1530705634500000000, -1674638741932929000, }; - - std::vector> cols; - cols.push_back(timestamps_ns.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({timestamps_ns}); auto filepath = temp_env->get_temp_filepath("OrcNegTimestamp.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); @@ -634,10 +598,9 @@ TEST_F(OrcWriterTest, negTimestampsNano) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view().column(0), - result.tbl->view().column(0), - cudf::test::debug_output_level::ALL_ERRORS); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, Slice) @@ -747,21 +710,51 @@ TEST_F(OrcChunkedWriterTest, ManyTables) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } -TEST_F(OrcChunkedWriterTest, Strings) +TEST_F(OrcChunkedWriterTest, Metadata) { - std::vector> cols; + std::vector strings{ + "Monday", "Tuesday", "THURSDAY", "Wednesday", "Friday", "Sunday", "Saturday"}; + const auto num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + table_view expected({col0, col1, col2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("ChunkedMetadata.orc"); + cudf_io::chunked_orc_writer_options opts = + cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath}) + .metadata(&expected_metadata); + cudf_io::orc_chunked_writer(opts).write(expected).write(expected); + + cudf_io::orc_reader_options read_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_orc(read_opts); + + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcChunkedWriterTest, Strings) +{ bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); - cols.push_back(strings1.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({strings1}); bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); - cols.push_back(strings2.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -864,7 +857,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; int num_els = 31; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -875,9 +867,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -885,9 +875,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -911,7 +899,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; int num_els = 33; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -922,9 +909,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -932,9 +917,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -981,18 +964,12 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, valid_all); column_wrapper col5( sequence, sequence + num_rows, validity); - std::vector> cols; - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - auto expected = std::make_unique
(std::move(cols)); + table_view expected({col1, col2, col3, col4, col5}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); @@ -1056,17 +1033,14 @@ TEST_F(OrcWriterTest, SlicedValidMask) column_wrapper col{strings.begin(), strings.end(), validity}; - std::vector> cols; - cols.push_back(col.release()); - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_string"); - // Bug tested here is easiest to reproduce when column_offset % 32 is 31 std::vector indices{31, 34}; - std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + auto sliced_col = cudf::slice(static_cast(col), indices); cudf::table_view tbl{sliced_col}; + cudf_io::table_input_metadata expected_metadata(tbl); + expected_metadata.column_metadata[0].set_name("col_string"); + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) @@ -1078,7 +1052,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcReaderTest, SingleInputs) @@ -1087,9 +1061,9 @@ TEST_F(OrcReaderTest, SingleInputs) auto table1 = create_random_fixed_table(5, 5, true); auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - cudf_io::orc_chunked_writer(opts1).write(*table1); + cudf_io::orc_writer_options write_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(write_opts); cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1}}); @@ -1106,15 +1080,19 @@ TEST_F(OrcReaderTest, MultipleInputs) auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - cudf_io::orc_chunked_writer(opts1).write(*table1); + auto const filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(out_opts); + } - auto filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); - cudf_io::chunked_orc_writer_options opts2 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath2}); - cudf_io::orc_chunked_writer(opts2).write(*table2); + auto const filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath2}, table2->view()); + cudf_io::write_orc(out_opts); + } cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1, filepath2}}); @@ -1139,14 +1117,11 @@ TEST_P(OrcWriterTestDecimal, Decimal64) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto tbl = std::make_unique
(std::move(cols)); + cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl); cudf_io::write_orc(out_opts); @@ -1154,7 +1129,7 @@ TEST_P(OrcWriterTestDecimal, Decimal64) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl->view().column(0), result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl.column(0), result.tbl->view().column(0)); } INSTANTIATE_TEST_CASE_P(OrcWriterTest, @@ -1173,14 +1148,11 @@ TEST_F(OrcWriterTest, Decimal32) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); + cudf::table_view expected({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 7260aa9e686..0f59b0d5e15 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -184,25 +185,6 @@ std::unique_ptr make_parquet_list_col( offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); } -void compare_metadata_equality(cudf::io::table_input_metadata in_meta, - cudf::io::table_metadata out_meta) -{ - std::function compare_names = - [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { - if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } - EXPECT_EQ(out_col.children.size(), in_col.num_children()); - for (size_t i = 0; i < out_col.children.size(); ++i) { - compare_names(out_col.children[i], in_col.child(i)); - } - }; - - EXPECT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); - - for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { - compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); - } -} - // Base test fixture for tests struct ParquetWriterTest : public cudf::test::BaseFixture { }; @@ -444,7 +426,7 @@ TEST_F(ParquetWriterTest, MultiColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiColumnWithNulls) @@ -528,7 +510,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) // TODO: Need to be able to return metadata in tree form from reader so they can be compared. // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not // have any tests for it c++ or python. - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, Strings) @@ -568,7 +550,7 @@ TEST_F(ParquetWriterTest, Strings) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, SlicedTable) @@ -682,7 +664,7 @@ TEST_F(ParquetWriterTest, SlicedTable) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListColumn) @@ -780,7 +762,7 @@ TEST_F(ParquetWriterTest, ListColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiIndex) @@ -831,7 +813,7 @@ TEST_F(ParquetWriterTest, MultiIndex) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, HostBuffer) @@ -860,7 +842,7 @@ TEST_F(ParquetWriterTest, HostBuffer) const auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, NonNullable) @@ -989,7 +971,7 @@ TEST_F(ParquetWriterTest, StructOfList) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListOfStruct) @@ -1044,7 +1026,7 @@ TEST_F(ParquetWriterTest, ListOfStruct) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } // custom data sink that supports device writes. uses plain file io. @@ -1433,7 +1415,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) @@ -1526,7 +1508,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. auto result_parent_list = result.tbl->get_column(0); @@ -1697,7 +1679,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ForcedNullability) @@ -1830,7 +1812,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ReadRowGroups) @@ -2552,7 +2534,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting a non-leaf and expecting all hierarchy from that node onwards @@ -2581,7 +2563,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting struct children out of order @@ -2616,7 +2598,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(1).set_name("human?"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } } diff --git a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java index 238e0b61fd9..85443c3ae0f 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,6 @@ public class ORCWriterOptions extends CompressedMetadataWriterOptions { - public static ORCWriterOptions DEFAULT = new ORCWriterOptions(new Builder()); - private ORCWriterOptions(Builder builder) { super(builder); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 2744728fb44..0af02d1c926 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1147,7 +1147,11 @@ public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferCo */ @Deprecated public void writeORC(File outputFile) { - writeORC(ORCWriterOptions.DEFAULT, outputFile); + // Need to specify the number of columns but leave all column names undefined + String[] names = new String[getNumberOfColumns()]; + Arrays.fill(names, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(names).build(); + writeORC(opts, outputFile); } /** @@ -1157,6 +1161,7 @@ public void writeORC(File outputFile) { */ @Deprecated public void writeORC(ORCWriterOptions options, File outputFile) { + assert options.getColumnNames().length == getNumberOfColumns() : "must specify names for all columns"; try (TableWriter writer = Table.writeORCChunked(options, outputFile)) { writer.write(this); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 96dd02e5f2a..ee75112a2ed 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -736,6 +736,29 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam } } +cudf::io::table_input_metadata createORCTableInputMetadata(JNIEnv *env, + jobjectArray const &j_col_names, + jbooleanArray const &j_col_nullability, + jobjectArray const &j_metadata_keys, + jobjectArray const &j_metadata_values) { + cudf::jni::native_jstringArray const col_names(env, j_col_names); + cudf::jni::native_jbooleanArray const col_nullability(env, j_col_nullability); + cudf::jni::native_jstringArray const meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray const meta_values(env, j_metadata_values); + + std::vector const cpp_names = col_names.as_cpp_vector(); + std::size_t const num_columns = cpp_names.size(); + cudf::io::table_input_metadata metadata; + metadata.column_metadata.resize(cpp_names.size()); + for (std::size_t i = 0; i < num_columns; i++) { + metadata.column_metadata[i].set_name(cpp_names[i]).set_nullability(col_nullability[i]); + } + for (int i = 0; i < meta_keys.size(); ++i) { + metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); + } + return metadata; +} + // Check that window parameters are valid. bool valid_window_parameters(native_jintArray const &values, native_jpointerArray const &ops, @@ -1500,19 +1523,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); - - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1542,20 +1554,10 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jstring output_path(env, j_output_path); - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) @@ -1577,7 +1579,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_orig = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_orig); cudf::jni::native_orc_writer_handle *state = reinterpret_cast(j_state); @@ -1587,7 +1590,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b69dce57180..0e7ac15a79e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6876,7 +6876,10 @@ void testArrowIPCWriteToBufferChunked() { void testORCWriteToBufferChunked() { try (Table table0 = getExpectedFileTable(); MyBufferConsumer consumer = new MyBufferConsumer()) { - try (TableWriter writer = Table.writeORCChunked(ORCWriterOptions.DEFAULT, consumer)) { + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(colNames).build(); + try (TableWriter writer = Table.writeORCChunked(opts, consumer)) { writer.write(table0); writer.write(table0); writer.write(table0); @@ -6924,7 +6927,13 @@ void testORCWriteToFileWithColNames() throws IOException { void testORCWriteToFileUncompressed() throws IOException { File tempFileUncompressed = File.createTempFile("test-uncompressed", ".orc"); try (Table table0 = getExpectedFileTable()) { - table0.writeORC(ORCWriterOptions.builder().withCompressionType(CompressionType.NONE).build(), tempFileUncompressed.getAbsoluteFile()); + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder() + .withColumnNames(colNames) + .withCompressionType(CompressionType.NONE) + .build(); + table0.writeORC(opts, tempFileUncompressed.getAbsoluteFile()); try (Table table2 = Table.readORC(tempFileUncompressed.getAbsoluteFile())) { assertTablesAreEqual(table0, table2); } diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index d89af43028d..3036b000c5b 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -70,13 +70,13 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata *get_metadata() except+ + const cudf_io_types.table_input_metadata *get_metadata() except+ # setter void set_compression(cudf_io_types.compression_type comp) except+ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) except+ - void set_metadata(cudf_io_types.table_metadata* meta) except+ + void set_metadata(cudf_io_types.table_input_metadata* meta) except+ @staticmethod orc_writer_options_builder builder( @@ -94,7 +94,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ orc_writer_options build() except+ @@ -107,7 +107,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata_with_nullability *get_metadata( + const cudf_io_types.table_input_metadata *get_metadata( ) except+ # setter @@ -115,7 +115,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata( - cudf_io_types.table_metadata_with_nullability* meta + cudf_io_types.table_input_metadata* meta ) except+ @staticmethod @@ -133,7 +133,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ chunked_orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ chunked_orc_writer_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index e2053f8ce4f..81ca7e5836b 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -66,36 +66,17 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + - cdef cppclass column_in_metadata: - column_in_metadata& set_name(const string& name) - column_in_metadata& set_nullability(bool nullable) - column_in_metadata& set_list_column_as_map() - column_in_metadata& set_int96_timestamps(bool req) - column_in_metadata& set_decimal_precision(uint8_t precision) - column_in_metadata& child(size_type i) - - cdef cppclass table_input_metadata: - table_input_metadata() except + - table_input_metadata(const cudf_table_view.table_view& table) except + - table_input_metadata( - const cudf_table_view.table_view& table, - map[string, string] user_data - ) except + - - vector[column_in_metadata] column_metadata - map[string, string] user_data - cdef cppclass parquet_writer_options: parquet_writer_options() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + - const table_input_metadata get_metadata() except + + const cudf_io_types.table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -121,7 +102,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_ ) except + parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf @@ -147,11 +128,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info get_sink() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - table_input_metadata* get_metadata( + cudf_io_types.table_input_metadata* get_metadata( ) except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -171,7 +152,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info sink_, ) except + chunked_parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 7fa6406bd29..721d90f1f5b 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -8,7 +9,9 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile +cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.types cimport size_type cdef extern from "cudf/io/types.hpp" \ @@ -52,15 +55,29 @@ cdef extern from "cudf/io/types.hpp" \ map[string, string] user_data vector[column_name_info] schema_info - cdef cppclass table_metadata_with_nullability(table_metadata): - table_metadata_with_nullability() except + - - vector[bool] nullability - cdef cppclass table_with_metadata: unique_ptr[table] tbl table_metadata metadata + cdef cppclass column_in_metadata: + column_in_metadata& set_name(const string& name) + column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_list_column_as_map() + column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_decimal_precision(uint8_t precision) + column_in_metadata& child(size_type i) + + cdef cppclass table_input_metadata: + table_input_metadata() except + + table_input_metadata(const cudf_table_view.table_view& table) except + + table_input_metadata( + const cudf_table_view.table_view& table, + map[string, string] user_data + ) except + + + vector[column_in_metadata] column_metadata + map[string, string] user_data + cdef cppclass host_buffer: const char* data size_t size diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index bc4f4aee9cd..03d163b7638 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -23,13 +23,13 @@ from cudf._lib.cpp.io.orc_metadata cimport ( read_raw_orc_statistics as libcudf_read_raw_orc_statistics, ) from cudf._lib.cpp.io.types cimport ( + column_in_metadata, column_name_info, compression_type, data_sink, sink_info, source_info, - table_metadata, - table_metadata_with_nullability, + table_input_metadata, table_with_metadata, ) from cudf._lib.cpp.table.table_view cimport table_view @@ -50,7 +50,8 @@ import numpy as np from cudf._lib.utils cimport data_from_unique_ptr, get_column_names -from cudf._lib.utils import generate_pandas_metadata +from cudf._lib.utils import _index_level_name, generate_pandas_metadata +from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -144,19 +145,35 @@ cpdef write_orc(Table table, cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) - cdef table_metadata metadata_ = table_metadata() cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - - metadata_.column_names.reserve(len(table._column_names)) - - for col_name in table._column_names: - metadata_.column_names.push_back(str.encode(col_name)) + cdef unique_ptr[table_input_metadata] tbl_meta + + if not isinstance(table._index, cudf.RangeIndex): + tv = table_view_from_table(table) + tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + tbl_meta.get().column_metadata[level].set_name( + str.encode( + _index_level_name(idx_name, level, table._column_names) + ) + ) + num_index_cols_meta = len(table._index.names) + else: + tv = table_view_from_table(table, ignore_index=True) + tbl_meta = make_unique[table_input_metadata](tv) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, tbl_meta.get().column_metadata[i] + ) cdef orc_writer_options c_orc_writer_options = move( orc_writer_options.builder( sink_info_c, table_view_from_table(table, ignore_index=True) - ).metadata(&metadata_) + ).metadata(tbl_meta.get()) .compression(compression_) .enable_statistics( (True if enable_statistics else False)) .build() @@ -231,6 +248,7 @@ cdef class ORCWriter: cdef bool enable_stats cdef compression_type comp_type cdef object index + cdef unique_ptr[table_input_metadata] tbl_meta def __cinit__(self, object path, object index=None, object compression=None, bool enable_statistics=True): @@ -268,20 +286,46 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef unique_ptr[table_metadata_with_nullability] tbl_meta - tbl_meta = make_unique[table_metadata_with_nullability]() + cdef table_view tv # Set the table_metadata - tbl_meta.get().column_names = get_column_names(table, self.index) + num_index_cols_meta = 0 + self.tbl_meta = make_unique[table_input_metadata]( + table_view_from_table(table, ignore_index=True) + ) + if self.index is not False: + if isinstance(table._index, cudf.core.multiindex.MultiIndex): + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + self.tbl_meta.get().column_metadata[level].set_name( + (str.encode(idx_name)) + ) + num_index_cols_meta = len(table._index.names) + else: + if table._index.name is not None: + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + self.tbl_meta.get().column_metadata[0].set_name( + str.encode(table._index.name) + ) + num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, self.tbl_meta.get().column_metadata[i] + ) + pandas_metadata = generate_pandas_metadata(table, self.index) - tbl_meta.get().user_data[str.encode("pandas")] = \ + self.tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) cdef chunked_orc_writer_options args with nogil: args = move( chunked_orc_writer_options.builder(self.sink) - .metadata(tbl_meta.get()) + .metadata(self.tbl_meta.get()) .compression(self.comp_type) .enable_statistics(self.enable_stats) .build() @@ -289,3 +333,15 @@ cdef class ORCWriter: self.writer.reset(new orc_chunked_writer(args)) self.initialized = True + +cdef _set_col_children_names(Column col, column_in_metadata& col_meta): + if is_struct_dtype(col): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name.encode()) + _set_col_children_names(child_col, col_meta.child(i)) + elif is_list_dtype(col): + _set_col_children_names(col.children[1], col_meta.child(1)) + else: + return diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d9017c7d6f8..70bdb6e2e60 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -45,15 +45,14 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.io.parquet cimport ( chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - column_in_metadata, merge_rowgroup_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_reader_options, parquet_writer_options, read_parquet as parquet_reader, - table_input_metadata, write_parquet as parquet_writer, ) +from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 73fbd50c824..cc5e1909d67 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -338,11 +338,11 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): for col in df._data.columns: if isinstance(col, cudf.core.column.StructColumn): - raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Struct columns." + warnings.warn( + "Support for writing tables with struct columns is " + "currently experimental." ) - elif isinstance(col, cudf.core.column.CategoricalColumn): + if isinstance(col, cudf.core.column.CategoricalColumn): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 2d4dc55bd28..61c2ff5ed36 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -58,7 +58,6 @@ def _make_path_or_buf(src): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") @pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) @pytest.mark.parametrize("use_index", [False, True]) @pytest.mark.parametrize( @@ -221,6 +220,7 @@ def test_orc_read_statistics(datadir): assert_eq(file_statistics[0]["string1"]["minimum"], "one") +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) @pytest.mark.parametrize( "predicate,expected_len", @@ -244,6 +244,7 @@ def test_orc_read_filtered(datadir, engine, predicate, expected_len): assert len(df_filtered) == expected_len +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_read_stripes(datadir, engine): path = datadir / "TestOrcFile.testDate1900.orc" @@ -558,7 +559,6 @@ def test_orc_reader_boolean_type(datadir, orc_file): assert_eq(pdf, df) -@pytest.mark.filterwarnings("ignore:Using CPU") def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times @@ -954,7 +954,9 @@ def generate_list_struct_buff(size=100_000): return buff -list_struct_buff = generate_list_struct_buff() +@pytest.fixture(scope="module") +def list_struct_buff(): + return generate_list_struct_buff() @pytest.mark.parametrize( @@ -967,9 +969,7 @@ def generate_list_struct_buff(size=100_000): ) @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) -def test_lists_struct_nests( - columns, num_rows, use_index, -): +def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): gdf = cudf.read_orc( list_struct_buff, @@ -993,7 +993,7 @@ def test_lists_struct_nests( @pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) -def test_skip_rows_for_nested_types(columns): +def test_skip_rows_for_nested_types(columns, list_struct_buff): with pytest.raises( RuntimeError, match="skip_rows is not supported by nested column" ): @@ -1379,3 +1379,45 @@ def test_names_in_struct_dtype_nesting(datadir): edf = cudf.DataFrame(expect.to_pandas()) # test schema assert edf.dtypes.equals(got.dtypes) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +def test_writer_lists_structs(list_struct_buff): + df_in = cudf.read_orc(list_struct_buff) + + buff = BytesIO() + df_in.to_orc(buff) + + pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + + assert pyarrow_tbl.equals(df_in.to_arrow()) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +@pytest.mark.parametrize( + "data", + [ + { + "with_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else None + for i in range(21000) + ], + "no_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else [] + for i in range(21000) + ], + }, + ], +) +def test_orc_writer_lists_empty_rg(data): + pdf_in = pd.DataFrame(data) + buffer = BytesIO() + cudf_in = cudf.from_pandas(pdf_in) + + cudf_in.to_orc(buffer) + + df = cudf.read_orc(buffer) + assert_eq(df, cudf_in) + + pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + assert_eq(pdf_in, pdf_out) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 4bffd06c4cc..e23318eb999 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -391,6 +391,12 @@ enable_statistics: boolean, default True Enable writing column statistics. + +Notes +----- +Support for writing tables with struct columns is currently experimental, +the output may not be as reliable as writing for other datatypes. + See Also -------- cudf.read_orc