Skip to content

Commit

Permalink
Include writer code and writerVersion in ORC files (#14458)
Browse files Browse the repository at this point in the history
Closes #14325
Changes some of the metadata written to ORC file:

- Include the (cuDF) writer code (5).
- include writerVersion, with the value of 7; This value means that bugs up to ORC-517 are fixed. This version (6+ required) allows us to write the nanosecond statistics.

This change can have unexpected impact, depending on how readers use these fields.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #14458
  • Loading branch information
vuule authored Dec 7, 2023
1 parent a253826 commit b02e82f
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 21 deletions.
6 changes: 5 additions & 1 deletion cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void ProtobufReader::read(PostScript& s, size_t maxlen)
field_reader(3, s.compressionBlockSize),
packed_field_reader(4, s.version),
field_reader(5, s.metadataLength),
field_reader(6, s.writerVersion),
field_reader(8000, s.magic));
function_builder(s, maxlen, op);
}
Expand All @@ -76,7 +77,8 @@ void ProtobufReader::read(FileFooter& s, size_t maxlen)
field_reader(5, s.metadata),
field_reader(6, s.numberOfRows),
raw_field_reader(7, s.statistics),
field_reader(8, s.rowIndexStride));
field_reader(8, s.rowIndexStride),
field_reader(9, s.writer));
function_builder(s, maxlen, op);
}

Expand Down Expand Up @@ -299,6 +301,7 @@ size_t ProtobufWriter::write(PostScript const& s)
if (s.compression != NONE) { w.field_uint(3, s.compressionBlockSize); }
w.field_packed_uint(4, s.version);
w.field_uint(5, s.metadataLength);
if (s.writerVersion) w.field_uint(6, *s.writerVersion);
w.field_blob(8000, s.magic);
return w.value();
}
Expand All @@ -314,6 +317,7 @@ size_t ProtobufWriter::write(FileFooter const& s)
w.field_uint(6, s.numberOfRows);
w.field_repeated_struct_blob(7, s.statistics);
w.field_uint(8, s.rowIndexStride);
if (s.writer) w.field_uint(9, *s.writer);
return w.value();
}

Expand Down
36 changes: 30 additions & 6 deletions cpp/src/io/orc/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,28 @@ namespace orc {
static constexpr uint32_t block_header_size = 3;
// Seconds from January 1st, 1970 to January 1st, 2015
static constexpr int64_t orc_utc_epoch = 1420070400;
// ORC datasets start with a 3 byte header
static constexpr char const* MAGIC = "ORC";

// Each ORC writer implementation should write its code to the file footer
// the codes are specified in the ORC specification
static constexpr int32_t cudf_writer_code = 5;
// Each ORC writer implementation should write its version to the PostScript
// The version values are based on the ORC Java writer bug fixes and features
// From https://github.com/apache/orc-format/blob/main/src/main/proto/orc_proto.proto:
// 0 = original
// 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
// string statistics use utf8 for min/max)
// 2 = HIVE-4243 fixed (use real column names from Hive tables)
// 3 = HIVE-12055 added (vectorized writer implementation)
// 4 = HIVE-13083 fixed (decimals write present stream correctly)
// 5 = ORC-101 fixed (bloom filters use utf8 consistently)
// 6 = ORC-135 fixed (timestamp statistics use utc)
// 7 = ORC-517 fixed (decimal64 min/max incorrect)
// 8 = ORC-203 added (trim very long string statistics)
// 9 = ORC-14 added (column encryption)
// Our version should be updated as we implement the features from the list above
static constexpr uint32_t cudf_writer_version = 7;

// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max
// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder,
Expand All @@ -48,12 +70,13 @@ static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999'999;

struct PostScript {
uint64_t footerLength = 0; // the length of the footer section in bytes
CompressionKind compression = NONE; // the kind of generic compression used
uint32_t compressionBlockSize{}; // the maximum size of each compression chunk
std::vector<uint32_t> version; // the version of the writer [major, minor]
uint64_t metadataLength = 0; // the length of the metadata section in bytes
std::string magic = ""; // the fixed string "ORC"
uint64_t footerLength = 0; // the length of the footer section in bytes
CompressionKind compression = NONE; // the kind of generic compression used
uint32_t compressionBlockSize{}; // the maximum size of each compression chunk
std::vector<uint32_t> version; // the version of the file format [major, minor]
uint64_t metadataLength = 0; // the length of the metadata section in bytes
std::optional<uint32_t> writerVersion; // The version of the writer that wrote the file
std::string magic = ""; // the fixed string "ORC"
};

struct StripeInformation {
Expand Down Expand Up @@ -90,6 +113,7 @@ struct FileFooter {
uint64_t numberOfRows = 0; // the total number of rows in the file
std::vector<ColStatsBlob> statistics; // Column statistics blobs
uint32_t rowIndexStride = 0; // the maximum number of rows in each index entry
std::optional<uint32_t> writer; // Writer code
};

struct Stream {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ using strings::detail::fixed_point_string_size;

// Nanosecond statistics should not be enabled until the spec version is set correctly in the output
// files. See https://github.com/rapidsai/cudf/issues/14325 for more details
constexpr bool enable_nanosecond_statistics = false;
constexpr bool enable_nanosecond_statistics = true;

constexpr unsigned int init_threads_per_group = 32;
constexpr unsigned int init_groups_per_block = 4;
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2589,6 +2589,7 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table,
if (_ffooter.headerLength == 0) {
// First call
_ffooter.headerLength = std::strlen(MAGIC);
_ffooter.writer = cudf_writer_code;
_ffooter.rowIndexStride = _row_index_stride;
_ffooter.types.resize(1 + orc_table.num_columns());
_ffooter.types[0].kind = STRUCT;
Expand Down Expand Up @@ -2702,7 +2703,8 @@ void writer::impl::close()
ps.footerLength = pbw.size();
ps.compression = _compression_kind;
ps.compressionBlockSize = _compression_blocksize;
ps.version = {0, 12};
ps.version = {0, 12}; // Hive 0.12
ps.writerVersion = cudf_writer_version;
ps.magic = MAGIC;

auto const ps_length = static_cast<uint8_t>(pbw.write(ps));
Expand Down
16 changes: 4 additions & 12 deletions cpp/tests/io/orc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1054,12 +1054,8 @@ TEST_F(OrcStatisticsTest, Basic)
EXPECT_EQ(*ts4.maximum, 3);
EXPECT_EQ(*ts4.minimum_utc, -4);
EXPECT_EQ(*ts4.maximum_utc, 3);
// nanosecond precision can't be included until we write a writer version that includes ORC-135
// see https://github.com/rapidsai/cudf/issues/14325
// EXPECT_EQ(*ts4.minimum_nanos, 999994);
EXPECT_FALSE(ts4.minimum_nanos.has_value());
// EXPECT_EQ(*ts4.maximum_nanos, 6);
EXPECT_FALSE(ts4.maximum_nanos.has_value());
EXPECT_EQ(*ts4.minimum_nanos, 999994);
EXPECT_EQ(*ts4.maximum_nanos, 6);

auto& s5 = stats[5];
EXPECT_EQ(*s5.number_of_values, 4ul);
Expand All @@ -1069,12 +1065,8 @@ TEST_F(OrcStatisticsTest, Basic)
EXPECT_EQ(*ts5.maximum, 3000);
EXPECT_EQ(*ts5.minimum_utc, -3001);
EXPECT_EQ(*ts5.maximum_utc, 3000);
// nanosecond precision can't be included until we write a writer version that includes ORC-135
// see https://github.com/rapidsai/cudf/issues/14325
// EXPECT_EQ(*ts5.minimum_nanos, 994000);
EXPECT_FALSE(ts5.minimum_nanos.has_value());
// EXPECT_EQ(*ts5.maximum_nanos, 6000);
EXPECT_FALSE(ts5.maximum_nanos.has_value());
EXPECT_EQ(*ts5.minimum_nanos, 994000);
EXPECT_EQ(*ts5.maximum_nanos, 6000);

auto& s6 = stats[6];
EXPECT_EQ(*s6.number_of_values, 4ul);
Expand Down

0 comments on commit b02e82f

Please sign in to comment.