diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index ee5fa4e8b5a..eab951efe36 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -63,6 +63,7 @@ void ProtobufReader::read(PostScript& s, size_t maxlen) field_reader(3, s.compressionBlockSize), packed_field_reader(4, s.version), field_reader(5, s.metadataLength), + field_reader(6, s.writerVersion), field_reader(8000, s.magic)); function_builder(s, maxlen, op); } @@ -76,7 +77,8 @@ void ProtobufReader::read(FileFooter& s, size_t maxlen) field_reader(5, s.metadata), field_reader(6, s.numberOfRows), raw_field_reader(7, s.statistics), - field_reader(8, s.rowIndexStride)); + field_reader(8, s.rowIndexStride), + field_reader(9, s.writer)); function_builder(s, maxlen, op); } @@ -299,6 +301,7 @@ size_t ProtobufWriter::write(PostScript const& s) if (s.compression != NONE) { w.field_uint(3, s.compressionBlockSize); } w.field_packed_uint(4, s.version); w.field_uint(5, s.metadataLength); + if (s.writerVersion) w.field_uint(6, *s.writerVersion); w.field_blob(8000, s.magic); return w.value(); } @@ -314,6 +317,7 @@ size_t ProtobufWriter::write(FileFooter const& s) w.field_uint(6, s.numberOfRows); w.field_repeated_struct_blob(7, s.statistics); w.field_uint(8, s.rowIndexStride); + if (s.writer) w.field_uint(9, *s.writer); return w.value(); } diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 783ed4206b6..4f3e0a82768 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -40,6 +40,28 @@ namespace orc { static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// ORC datasets start with a 3 byte header +static constexpr char const* MAGIC = "ORC"; + +// Each ORC writer implementation should write its code to the file footer +// the codes are specified in the ORC specification +static constexpr int32_t cudf_writer_code = 5; +// Each ORC writer implementation should write its version 
to the PostScript +// The version values are based on the ORC Java writer bug fixes and features +// From https://github.com/apache/orc-format/blob/main/src/main/proto/orc_proto.proto: +// 0 = original +// 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & +// string statistics use utf8 for min/max) +// 2 = HIVE-4243 fixed (use real column names from Hive tables) +// 3 = HIVE-12055 added (vectorized writer implementation) +// 4 = HIVE-13083 fixed (decimals write present stream correctly) +// 5 = ORC-101 fixed (bloom filters use utf8 consistently) +// 6 = ORC-135 fixed (timestamp statistics use utc) +// 7 = ORC-517 fixed (decimal64 min/max incorrect) +// 8 = ORC-203 added (trim very long string statistics) +// 9 = ORC-14 added (column encryption) +// Our version should be updated as we implement the features from the list above +static constexpr uint32_t cudf_writer_version = 7; // Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max // are not included. 
As the timestamp statistics are stored as milliseconds + nanosecond remainder, @@ -48,12 +70,13 @@ static constexpr int32_t DEFAULT_MIN_NANOS = 0; static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; struct PostScript { - uint64_t footerLength = 0; // the length of the footer section in bytes - CompressionKind compression = NONE; // the kind of generic compression used - uint32_t compressionBlockSize{}; // the maximum size of each compression chunk - std::vector version; // the version of the writer [major, minor] - uint64_t metadataLength = 0; // the length of the metadata section in bytes - std::string magic = ""; // the fixed string "ORC" + uint64_t footerLength = 0; // the length of the footer section in bytes + CompressionKind compression = NONE; // the kind of generic compression used + uint32_t compressionBlockSize{}; // the maximum size of each compression chunk + std::vector version; // the version of the file format [major, minor] + uint64_t metadataLength = 0; // the length of the metadata section in bytes + std::optional writerVersion; // The version of the writer that wrote the file + std::string magic = ""; // the fixed string "ORC" }; struct StripeInformation { @@ -90,6 +113,7 @@ struct FileFooter { uint64_t numberOfRows = 0; // the total number of rows in the file std::vector statistics; // Column statistics blobs uint32_t rowIndexStride = 0; // the maximum number of rows in each index entry + std::optional writer; // Writer code }; struct Stream { diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 429fd5b929d..7436e22e125 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -29,7 +29,7 @@ using strings::detail::fixed_point_string_size; -// Nanosecond statistics should not be enabled until the spec version is set correctly in the output -// files.
See https://github.com/rapidsai/cudf/issues/14325 for more details -constexpr bool enable_nanosecond_statistics = false; +// Nanosecond statistics are enabled because the writer version stored in the PostScript +// (cudf_writer_version) includes ORC-135. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = true; constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ac5993e764e..f84918eb9dc 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2589,6 +2589,7 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, if (_ffooter.headerLength == 0) { // First call _ffooter.headerLength = std::strlen(MAGIC); + _ffooter.writer = cudf_writer_code; _ffooter.rowIndexStride = _row_index_stride; _ffooter.types.resize(1 + orc_table.num_columns()); _ffooter.types[0].kind = STRUCT; @@ -2702,7 +2703,8 @@ void writer::impl::close() ps.footerLength = pbw.size(); ps.compression = _compression_kind; ps.compressionBlockSize = _compression_blocksize; - ps.version = {0, 12}; + ps.version = {0, 12}; // Hive 0.12 + ps.writerVersion = cudf_writer_version; ps.magic = MAGIC; auto const ps_length = static_cast(pbw.write(ps)); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index dca3886db14..234716749ff 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1054,12 +1054,8 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - // nanosecond precision can't be included until we write a writer version that includes ORC-135 - // see https://github.com/rapidsai/cudf/issues/14325 - // EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_FALSE(ts4.minimum_nanos.has_value()); - // EXPECT_EQ(*ts4.maximum_nanos, 6); - EXPECT_FALSE(ts4.maximum_nanos.has_value()); + EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_EQ(*ts4.maximum_nanos, 6); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1069,12 +1065,8 @@ TEST_F(OrcStatisticsTest, Basic)
EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - // nanosecond precision can't be included until we write a writer version that includes ORC-135 - // see https://github.com/rapidsai/cudf/issues/14325 - // EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_FALSE(ts5.minimum_nanos.has_value()); - // EXPECT_EQ(*ts5.maximum_nanos, 6000); - EXPECT_FALSE(ts5.maximum_nanos.has_value()); + EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_EQ(*ts5.maximum_nanos, 6000); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul);