rapidsai · rapids-bot · Dec 7, 2023 · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023
@@ -63,6 +63,7 @@ void ProtobufReader::read(PostScript& s, size_t maxlen)
                        field_reader(3, s.compressionBlockSize),
                        packed_field_reader(4, s.version),
                        field_reader(5, s.metadataLength),
+                       field_reader(6, s.writerVersion),
                        field_reader(8000, s.magic));
   function_builder(s, maxlen, op);
 }
@@ -76,7 +77,8 @@ void ProtobufReader::read(FileFooter& s, size_t maxlen)
                        field_reader(5, s.metadata),
                        field_reader(6, s.numberOfRows),
                        raw_field_reader(7, s.statistics),
-                       field_reader(8, s.rowIndexStride));
+                       field_reader(8, s.rowIndexStride),
+                       field_reader(9, s.writer));
   function_builder(s, maxlen, op);
 }
 
@@ -299,6 +301,7 @@ size_t ProtobufWriter::write(PostScript const& s)
   if (s.compression != NONE) { w.field_uint(3, s.compressionBlockSize); }
   w.field_packed_uint(4, s.version);
   w.field_uint(5, s.metadataLength);
+  if (s.writerVersion) w.field_uint(6, *s.writerVersion);
   w.field_blob(8000, s.magic);
   return w.value();
 }
@@ -314,6 +317,7 @@ size_t ProtobufWriter::write(FileFooter const& s)
   w.field_uint(6, s.numberOfRows);
   w.field_repeated_struct_blob(7, s.statistics);
   w.field_uint(8, s.rowIndexStride);
+  if (s.writer) w.field_uint(9, *s.writer);
   return w.value();
 }
 

@@ -40,6 +40,28 @@ namespace orc {
 static constexpr uint32_t block_header_size = 3;
 // Seconds from January 1st, 1970 to January 1st, 2015
 static constexpr int64_t orc_utc_epoch = 1420070400;
+// ORC datasets start with a 3 byte header
+static constexpr char const* MAGIC = "ORC";
+
+// Each ORC writer implementation should write its code to the file footer
+// the codes are specified in the ORC specification
+static constexpr int32_t cudf_writer_code = 5;
+// Each ORC writer implementation should write its version to the PostScript
+// The version values are based on the ORC Java writer bug fixes and features
+// From https://github.com/apache/orc/blob/main/proto/orc_proto.proto:
+//   0 = original
+//   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
+//                        string statistics use utf8 for min/max)
+//   2 = HIVE-4243 fixed (use real column names from Hive tables)
+//   3 = HIVE-12055 added (vectorized writer implementation)
+//   4 = HIVE-13083 fixed (decimals write present stream correctly)
+//   5 = ORC-101 fixed (bloom filters use utf8 consistently)
+//   6 = ORC-135 fixed (timestamp statistics use utc)
+//   7 = ORC-517 fixed (decimal64 min/max incorrect)
+//   8 = ORC-203 added (trim very long string statistics)
+//   9 = ORC-14 added (column encryption)
+// Our version should be updated as we implement the features from the list above
+static constexpr uint32_t cudf_writer_version = 7;
 
 // Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max
 // are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder,
@@ -48,12 +70,13 @@ static constexpr int32_t DEFAULT_MIN_NANOS = 0;
 static constexpr int32_t DEFAULT_MAX_NANOS = 999'999;
 
 struct PostScript {
-  uint64_t footerLength       = 0;     // the length of the footer section in bytes
-  CompressionKind compression = NONE;  // the kind of generic compression used
-  uint32_t compressionBlockSize{};     // the maximum size of each compression chunk
-  std::vector<uint32_t> version;       // the version of the writer [major, minor]
-  uint64_t metadataLength = 0;         // the length of the metadata section in bytes
-  std::string magic       = "";        // the fixed string "ORC"
+  uint64_t footerLength       = 0;        // the length of the footer section in bytes
+  CompressionKind compression = NONE;     // the kind of generic compression used
+  uint32_t compressionBlockSize{};        // the maximum size of each compression chunk
+  std::vector<uint32_t> version;          // the version of the file format [major, minor]
+  uint64_t metadataLength = 0;            // the length of the metadata section in bytes
+  std::optional<uint32_t> writerVersion;  // The version of the writer that wrote the file
+  std::string magic = "";                 // the fixed string "ORC"
 };
 
 struct StripeInformation {
@@ -90,6 +113,7 @@ struct FileFooter {
   uint64_t numberOfRows = 0;               // the total number of rows in the file
   std::vector<ColStatsBlob> statistics;    // Column statistics blobs
   uint32_t rowIndexStride = 0;             // the maximum number of rows in each index entry
+  std::optional<uint32_t> writer;          // Writer code
 };
 
 struct Stream {

@@ -29,7 +29,7 @@ using strings::detail::fixed_point_string_size;
 
 // Nanosecond statistics should not be enabled until the spec version is set correctly in the output
 // files. See https://github.com/rapidsai/cudf/issues/14325 for more details
-constexpr bool enable_nanosecond_statistics = false;
+constexpr bool enable_nanosecond_statistics = true;
 
 constexpr unsigned int init_threads_per_group = 32;
 constexpr unsigned int init_groups_per_block  = 4;

@@ -2589,6 +2589,7 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table,
   if (_ffooter.headerLength == 0) {
     // First call
     _ffooter.headerLength   = std::strlen(MAGIC);
+    _ffooter.writer         = cudf_writer_code;
     _ffooter.rowIndexStride = _row_index_stride;
     _ffooter.types.resize(1 + orc_table.num_columns());
     _ffooter.types[0].kind = STRUCT;
@@ -2702,7 +2703,8 @@ void writer::impl::close()
   ps.footerLength         = pbw.size();
   ps.compression          = _compression_kind;
   ps.compressionBlockSize = _compression_blocksize;
-  ps.version              = {0, 12};
+  ps.version              = {0, 12};  // Hive 0.12
+  ps.writerVersion        = cudf_writer_version;
   ps.magic                = MAGIC;
 
   auto const ps_length = static_cast<uint8_t>(pbw.write(ps));

@@ -1054,12 +1054,8 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts4.maximum, 3);
     EXPECT_EQ(*ts4.minimum_utc, -4);
     EXPECT_EQ(*ts4.maximum_utc, 3);
-    // nanosecond precision can't be included until we write a writer version that includes ORC-135
-    // see https://github.com/rapidsai/cudf/issues/14325
-    // EXPECT_EQ(*ts4.minimum_nanos, 999994);
-    EXPECT_FALSE(ts4.minimum_nanos.has_value());
-    // EXPECT_EQ(*ts4.maximum_nanos, 6);
-    EXPECT_FALSE(ts4.maximum_nanos.has_value());
+    EXPECT_EQ(*ts4.minimum_nanos, 999994);
+    EXPECT_EQ(*ts4.maximum_nanos, 6);
 
     auto& s5 = stats[5];
     EXPECT_EQ(*s5.number_of_values, 4ul);
@@ -1069,12 +1065,8 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts5.maximum, 3000);
     EXPECT_EQ(*ts5.minimum_utc, -3001);
     EXPECT_EQ(*ts5.maximum_utc, 3000);
-    // nanosecond precision can't be included until we write a writer version that includes ORC-135
-    // see https://github.com/rapidsai/cudf/issues/14325
-    // EXPECT_EQ(*ts5.minimum_nanos, 994000);
-    EXPECT_FALSE(ts5.minimum_nanos.has_value());
-    // EXPECT_EQ(*ts5.maximum_nanos, 6000);
-    EXPECT_FALSE(ts5.maximum_nanos.has_value());
+    EXPECT_EQ(*ts5.minimum_nanos, 994000);
+    EXPECT_EQ(*ts5.maximum_nanos, 6000);
 
     auto& s6 = stats[6];
     EXPECT_EQ(*s6.number_of_values, 4ul);