Skip to content

Commit

Permalink
Add support for struct type in ORC writer (#9025)
Browse files Browse the repository at this point in the history
Fixes #7830, #8443

Features:
- Use the new table metadata type that matches the table hierarchy, `table_input_metadata`.
- Support struct columns in the writer.

Changes:
- Null masks are encoded as aligned rowgroups to avoid invalid bits when the number of encoded rows is not divisible by 8 (except for the last rowgroup in each stripe). This also affects list columns. The issue is equivalent to #6763 (boolean columns only).
- Added pushdown masks that are used to determine which child elements should not be encoded, including null mask bits.
- Use pushdown masks for rowgroup alignment, null mask encoding and value encoding.
- Separated the null mask encoding from value encoding - can be further moved to a separate kernel call.

Breaking because the table metadata type has changed.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Devavret Makkar (https://github.com/devavret)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: #9025
  • Loading branch information
vuule authored Sep 22, 2021
1 parent 8dea0b1 commit 2c6b39b
Show file tree
Hide file tree
Showing 29 changed files with 1,313 additions and 797 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ test:
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
- test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp
- test -f $PREFIX/include/cudf_test/iterator_utilities.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,7 @@ add_library(cudftestutil STATIC
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
tests/io/metadata_utilities.cpp
tests/strings/utilities.cu)

set_target_properties(cudftestutil
Expand Down
19 changes: 11 additions & 8 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ class orc_writer_options {
// Set of columns to output
table_view _table;
// Optional associated metadata
const table_metadata* _metadata = nullptr;
const table_input_metadata* _metadata = nullptr;

friend orc_writer_options_builder;

Expand Down Expand Up @@ -445,7 +445,7 @@ class orc_writer_options {
/**
* @brief Returns associated metadata.
*/
table_metadata const* get_metadata() const { return _metadata; }
table_input_metadata const* get_metadata() const { return _metadata; }

// Setters

Expand Down Expand Up @@ -475,7 +475,7 @@ class orc_writer_options {
*
* @param meta Associated metadata.
*/
void set_metadata(table_metadata* meta) { _metadata = meta; }
void set_metadata(table_input_metadata const* meta) { _metadata = meta; }
};

class orc_writer_options_builder {
Expand Down Expand Up @@ -541,7 +541,7 @@ class orc_writer_options_builder {
* @param meta Associated metadata.
* @return this for chaining.
*/
orc_writer_options_builder& metadata(table_metadata* meta)
orc_writer_options_builder& metadata(table_input_metadata const* meta)
{
options._metadata = meta;
return *this;
Expand Down Expand Up @@ -570,6 +570,9 @@ class orc_writer_options_builder {
* cudf::io::write_orc(options);
* @endcode
*
* Note: Support for writing tables with struct columns is currently experimental; the output may
* not be as reliable as writing for other datatypes.
*
* @param options Settings for controlling writing behavior.
* @param mr Device memory resource to use for device memory allocation.
*/
Expand All @@ -592,7 +595,7 @@ class chunked_orc_writer_options {
// Enable writing column statistics
bool _enable_statistics = true;
// Optional associated metadata
const table_metadata_with_nullability* _metadata = nullptr;
const table_input_metadata* _metadata = nullptr;

friend chunked_orc_writer_options_builder;

Expand Down Expand Up @@ -638,7 +641,7 @@ class chunked_orc_writer_options {
/**
* @brief Returns associated metadata.
*/
table_metadata_with_nullability const* get_metadata() const { return _metadata; }
table_input_metadata const* get_metadata() const { return _metadata; }

// Setters

Expand All @@ -661,7 +664,7 @@ class chunked_orc_writer_options {
*
* @param meta Associated metadata.
*/
void metadata(table_metadata_with_nullability* meta) { _metadata = meta; }
void metadata(table_input_metadata const* meta) { _metadata = meta; }
};

class chunked_orc_writer_options_builder {
Expand Down Expand Up @@ -712,7 +715,7 @@ class chunked_orc_writer_options_builder {
* @param meta Associated metadata.
* @return this for chaining.
*/
chunked_orc_writer_options_builder& metadata(table_metadata_with_nullability* meta)
chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta)
{
options._metadata = meta;
return *this;
Expand Down
169 changes: 0 additions & 169 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

#include <rmm/mr/device/per_device_resource.hpp>

#include <thrust/optional.h>

#include <iostream>
#include <memory>
#include <string>
Expand Down Expand Up @@ -375,173 +373,6 @@ table_with_metadata read_parquet(
* @{
* @file
*/
class table_input_metadata;

/**
 * @brief Per-column settings supplied alongside a table that is being written.
 *
 * Stores the column name, an optional explicit nullability, the "list as map" and
 * "int96 timestamp" flags, an optional decimal precision, and the metadata of the
 * column's children. Every setter returns `*this` so that calls can be chained.
 */
class column_in_metadata {
  friend table_input_metadata;
  std::string _name;
  thrust::optional<bool> _nullable;
  // TODO: This isn't implemented yet
  bool _list_column_is_map  = false;
  bool _use_int96_timestamp = false;
  // bool _output_as_binary = false;
  thrust::optional<uint8_t> _decimal_precision;
  std::vector<column_in_metadata> children;

 public:
  /**
   * @brief Append the metadata of a child column to this column's metadata
   *
   * @param child Child column metadata; a copy of it is stored
   * @return this for chaining
   */
  column_in_metadata& add_child(column_in_metadata const& child)
  {
    children.push_back(child);
    return *this;
  }

  /**
   * @brief Set the name of this column
   *
   * @param name Name to assign to this column
   * @return this for chaining
   */
  column_in_metadata& set_name(std::string const& name)
  {
    _name = name;
    return *this;
  }

  /**
   * @brief Set the nullability of this column
   *
   * Only valid in case of chunked writes. In single writes, this option is ignored.
   *
   * @param nullable Whether this column is nullable
   * @return this for chaining
   */
  column_in_metadata& set_nullability(bool nullable)
  {
    _nullable = nullable;
    return *this;
  }

  /**
   * @brief Specify that this list column should be encoded as a map in the written parquet file
   *
   * The column must have the structure list<struct<key, value>>. This option is invalid otherwise
   *
   * @return this for chaining
   */
  column_in_metadata& set_list_column_as_map()
  {
    _list_column_is_map = true;
    return *this;
  }

  /**
   * @brief Specifies whether this timestamp column should be encoded using the deprecated int96
   * physical type. Only valid for the following column types:
   * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns
   *
   * @param req True = use int96 physical type. False = use int64 physical type
   * @return this for chaining
   */
  column_in_metadata& set_int96_timestamps(bool req)
  {
    _use_int96_timestamp = req;
    return *this;
  }

  /**
   * @brief Set the decimal precision of this column. Only valid if this column is a decimal
   * (fixed-point) type
   *
   * @param precision The integer precision to set for this decimal column
   * @return this for chaining
   */
  column_in_metadata& set_decimal_precision(uint8_t precision)
  {
    _decimal_precision = precision;
    return *this;
  }

  /**
   * @brief Get reference to a child of this column
   *
   * The index is not bounds-checked.
   *
   * @param i Index of the child to get
   * @return Reference to the metadata of the i-th child
   */
  column_in_metadata& child(size_type i) { return children[i]; }

  /**
   * @brief Get const reference to a child of this column
   *
   * The index is not bounds-checked.
   *
   * @param i Index of the child to get
   * @return Const reference to the metadata of the i-th child
   */
  column_in_metadata const& child(size_type i) const { return children[i]; }

  /**
   * @brief Get the name of this column
   *
   * @return Copy of the column name
   */
  std::string get_name() const { return _name; }

  /**
   * @brief Get whether nullability has been explicitly set for this column.
   *
   * @return true if `set_nullability()` has stored a value, false otherwise
   */
  bool is_nullability_defined() const { return _nullable.has_value(); }

  /**
   * @brief Gets the explicitly set nullability for this column.
   *
   * @throws If nullability is not explicitly defined for this column.
   * Check using `is_nullability_defined()` first.
   * @return The nullability that was set for this column
   */
  bool nullable() const { return _nullable.value(); }

  /**
   * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map.
   *
   * @return true if `set_list_column_as_map()` was called on this metadata
   */
  bool is_map() const { return _list_column_is_map; }

  /**
   * @brief Get whether to encode this timestamp column using deprecated int96 physical type
   *
   * @return The value last passed to `set_int96_timestamps()`, false by default
   */
  bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; }

  /**
   * @brief Get whether precision has been set for this decimal column
   *
   * @return true if `set_decimal_precision()` has stored a value, false otherwise
   */
  bool is_decimal_precision_set() const { return _decimal_precision.has_value(); }

  /**
   * @brief Get the decimal precision that was set for this column.
   *
   * @throws If decimal precision was not set for this column.
   * Check using `is_decimal_precision_set()` first.
   * @return The precision that was set for this column
   */
  uint8_t get_decimal_precision() const { return _decimal_precision.value(); }

  /**
   * @brief Get the number of children of this column
   *
   * @return Number of child metadata entries added so far
   */
  size_type num_children() const
  {
    // std::vector::size() is unsigned; make the conversion to size_type explicit.
    return static_cast<size_type>(children.size());
  }
};

/**
 * @brief Table-level metadata: a list of column metadata entries plus free-form
 * key-value user data.
 */
class table_input_metadata {
 public:
  // Cython needs this type to be default-constructible.
  table_input_metadata() = default;

  /**
   * @brief Build metadata that mirrors the structure of a table_view.
   *
   * The resulting table_input_metadata has the same structure as `table`.
   *
   * @param table The table_view whose structure the metadata should follow
   * @param user_data Optional additional key-value pairs to encode as metadata
   */
  table_input_metadata(table_view const& table, std::map<std::string, std::string> user_data = {});

  //! Metadata entries for the columns of the table
  std::vector<column_in_metadata> column_metadata;
  //! Format-dependent metadata as key-values pairs
  std::map<std::string, std::string> user_data;
};

/**
* @brief Class to build `parquet_writer_options`.
Expand Down
Loading

0 comments on commit 2c6b39b

Please sign in to comment.