Skip to content

Commit

Permalink
Fix and disable encoding for nanosecond statistics in ORC writer (#14367
Browse files Browse the repository at this point in the history
)

Issue #14325

Use uint when reading/writing nano stats because nanoseconds have int32 encoding (different from both unit32 and sint32, _obviously_), which does not use zigzag. 
sint32 uses zigzag, and unit32 does not allow negative numbers, so we can use uint since we'll never have negative nanoseconds.

Also disabled the nanoseconds because it should only be written after ORC-135; we don't write the version so readers get confused if nanoseconds are there. Planning to re-enable once we start writing the version.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14367
  • Loading branch information
vuule authored Nov 15, 2023
1 parent 8a0a08f commit ab2248e
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 17 deletions.
8 changes: 4 additions & 4 deletions cpp/include/cudf/io/orc_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics<int64_t>;
* the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC.
*/
struct timestamp_statistics : minmax_statistics<int64_t> {
std::optional<int64_t> minimum_utc; ///< minimum in milliseconds
std::optional<int64_t> maximum_utc; ///< maximum in milliseconds
std::optional<int32_t> minimum_nanos; ///< nanoseconds part of the minimum
std::optional<int32_t> maximum_nanos; ///< nanoseconds part of the maximum
std::optional<int64_t> minimum_utc; ///< minimum in milliseconds
std::optional<int64_t> maximum_utc; ///< maximum in milliseconds
std::optional<uint32_t> minimum_nanos; ///< nanoseconds part of the minimum
std::optional<uint32_t> maximum_nanos; ///< nanoseconds part of the maximum
};

namespace orc {
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen)
field_reader(5, s.minimum_nanos),
field_reader(6, s.maximum_nanos));
function_builder(s, maxlen, op);

// Adjust nanoseconds because they are encoded as (value + 1)
// Range [1, 1000'000] is translated here to [0, 999'999]
if (s.minimum_nanos.has_value()) {
auto& min_nanos = s.minimum_nanos.value();
CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds");
--min_nanos;
}
if (s.maximum_nanos.has_value()) {
auto& max_nanos = s.maximum_nanos.value();
CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds");
--max_nanos;
}
}

void ProtobufReader::read(column_statistics& s, size_t maxlen)
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/io/orc/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3;
// Seconds from January 1st, 1970 to January 1st, 2015
static constexpr int64_t orc_utc_epoch = 1420070400;

// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max
// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder,
// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1).
static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999'999;

struct PostScript {
uint64_t footerLength = 0; // the length of the footer section in bytes
CompressionKind compression = NONE; // the kind of generic compression used
Expand Down
35 changes: 26 additions & 9 deletions cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu {

using strings::detail::fixed_point_string_size;

// Nanosecond statistics should not be enabled until the spec version is set correctly in the output
// files. See https://github.com/rapidsai/cudf/issues/14325 for more details
constexpr bool enable_nanosecond_statistics = false;

constexpr unsigned int init_threads_per_group = 32;
constexpr unsigned int init_groups_per_block = 4;
constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
Expand Down Expand Up @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1)
stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64);
break;
case dtype_timestamp64:
stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) +
2 * (pb_fld_hdrlen + pb_fldlen_int32);
stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64);
if constexpr (enable_nanosecond_statistics) {
stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32);
}
break;
case dtype_float32:
case dtype_float64:
Expand Down Expand Up @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block)
// optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch
// optional sint64 maximumUtc = 4;
// optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond
// precision optional int32 maximumNanos = 6;
// precision
// optional int32 maximumNanos = 6;
// }
if (s->chunk.has_minmax) {
cur[0] = 9 * 8 + ProtofType::FIXEDLEN;
Expand All @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block)
split_nanosecond_timestamp(s->chunk.max_value.i_val);

// minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC
cur = pb_put_int(cur, 1, min_ms); // minimum
cur = pb_put_int(cur, 2, max_ms); // maximum
cur = pb_put_int(cur, 3, min_ms); // minimumUtc
cur = pb_put_int(cur, 4, max_ms); // maximumUtc
cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos
cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos
cur = pb_put_int(cur, 1, min_ms); // minimum
cur = pb_put_int(cur, 2, max_ms); // maximum
cur = pb_put_int(cur, 3, min_ms); // minimumUtc
cur = pb_put_int(cur, 4, max_ms); // maximumUtc

if constexpr (enable_nanosecond_statistics) {
if (min_ns_remainder != DEFAULT_MIN_NANOS) {
// using uint because positive values are not zigzag encoded
cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos
}
if (max_ns_remainder != DEFAULT_MAX_NANOS) {
// using uint because positive values are not zigzag encoded
cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos
}
}

fld_start[1] = cur - (fld_start + 2);
}
break;
Expand Down
16 changes: 12 additions & 4 deletions cpp/tests/io/orc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic)
EXPECT_EQ(*ts4.maximum, 3);
EXPECT_EQ(*ts4.minimum_utc, -4);
EXPECT_EQ(*ts4.maximum_utc, 3);
EXPECT_EQ(*ts4.minimum_nanos, 999994);
EXPECT_EQ(*ts4.maximum_nanos, 6);
// nanosecond precision can't be included until we write a writer version that includes ORC-135
// see https://github.com/rapidsai/cudf/issues/14325
// EXPECT_EQ(*ts4.minimum_nanos, 999994);
EXPECT_FALSE(ts4.minimum_nanos.has_value());
// EXPECT_EQ(*ts4.maximum_nanos, 6);
EXPECT_FALSE(ts4.maximum_nanos.has_value());

auto& s5 = stats[5];
EXPECT_EQ(*s5.number_of_values, 4ul);
Expand All @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic)
EXPECT_EQ(*ts5.maximum, 3000);
EXPECT_EQ(*ts5.minimum_utc, -3001);
EXPECT_EQ(*ts5.maximum_utc, 3000);
EXPECT_EQ(*ts5.minimum_nanos, 994000);
EXPECT_EQ(*ts5.maximum_nanos, 6000);
// nanosecond precision can't be included until we write a writer version that includes ORC-135
// see https://github.com/rapidsai/cudf/issues/14325
// EXPECT_EQ(*ts5.minimum_nanos, 994000);
EXPECT_FALSE(ts5.minimum_nanos.has_value());
// EXPECT_EQ(*ts5.maximum_nanos, 6000);
EXPECT_FALSE(ts5.maximum_nanos.has_value());

auto& s6 = stats[6];
EXPECT_EQ(*s6.number_of_values, 4ul);
Expand Down

0 comments on commit ab2248e

Please sign in to comment.