diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 479a2dfada3..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 234716749ff..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul);