From c412480fc06ae5160cba9f5cdbd1153adb66dc79 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Wed, 2 Aug 2023 06:49:57 -0700 Subject: [PATCH] Cast only time of day to nanos to avoid an overflow in Parquet INT96 write (#13776) Rework extraction of nanoseconds of the last day in INT96 write call path to avoid overflow. Contributes to NVIDIA/spark-rapids#8625 Fixes #8070 Authors: - Gera Shegalov (https://github.com/gerashegalov) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13776 --- cpp/src/io/parquet/page_enc.cu | 39 ++++++++++++++++++---------------- cpp/tests/io/parquet_test.cpp | 25 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5136cba3ac0..05f8bba7477 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -933,22 +933,24 @@ constexpr auto julian_calendar_epoch_diff() } /** - * @brief Converts a timestamp_ns into a pair with nanoseconds since midnight and number of Julian - * days. Does not deal with time zones. Used by INT96 code. + * @brief Converts number `v` of periods of type `PeriodT` into a pair with nanoseconds since + * midnight and number of Julian days. Does not deal with time zones. Used by INT96 code. * - * @param ns number of nanoseconds since epoch - * @return std::pair where nanoseconds is the number of nanoseconds + * @tparam PeriodT a ratio representing the tick period in duration + * @param v count of ticks since epoch + * @return A pair of (nanoseconds, days) where nanoseconds is the number of nanoseconds * elapsed in the day and days is the number of days from Julian epoch. */ -static __device__ std::pair convert_nanoseconds(timestamp_ns const ns) +template +__device__ auto julian_days_with_time(int64_t v) { using namespace cuda::std::chrono; - auto const nanosecond_ticks = ns.time_since_epoch(); - auto const gregorian_days = floor(nanosecond_ticks); - auto const julian_days = gregorian_days + ceil(julian_calendar_epoch_diff()); - - auto const last_day_ticks = nanosecond_ticks - gregorian_days; - return {last_day_ticks, julian_days}; + auto const dur_total = duration{v}; + auto const dur_days = floor(dur_total); + auto const dur_time_of_day = dur_total - dur_days; + auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); + auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); + return std::make_pair(dur_time_of_day_nanos, julian_days); } // blockDim(128, 1, 1) @@ -1236,22 +1238,23 @@ __global__ void __launch_bounds__(128, 8) } } - auto const ret = convert_nanoseconds([&]() { + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; switch (s->col.leaf_column->type().id()) { case type_id::TIMESTAMP_SECONDS: case type_id::TIMESTAMP_MILLISECONDS: { - return timestamp_ns{duration_ms{v}}; + return julian_days_with_time(v); } break; case type_id::TIMESTAMP_MICROSECONDS: case type_id::TIMESTAMP_NANOSECONDS: { - return timestamp_ns{duration_us{v}}; + return julian_days_with_time(v); } break; } - return timestamp_ns{duration_ns{0}}; - }()); + return julian_days_with_time(0); + }(); // the 12 bytes of fixed length data. - v = ret.first.count(); + v = last_day_nanos.count(); dst[pos + 0] = v; dst[pos + 1] = v >> 8; dst[pos + 2] = v >> 16; @@ -1260,7 +1263,7 @@ __global__ void __launch_bounds__(128, 8) dst[pos + 5] = v >> 40; dst[pos + 6] = v >> 48; dst[pos + 7] = v >> 56; - uint32_t w = ret.second.count(); + uint32_t w = julian_days.count(); dst[pos + 8] = w; dst[pos + 9] = w >> 8; dst[pos + 10] = w >> 16; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0ac3f659ffe..4e28f536728 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -6411,4 +6412,28 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) CUDF_TEST_EXPECT_TABLES_EQUAL(expected1->view(), result1); } +TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) +{ + using namespace cuda::std::chrono; + using namespace cudf::io; + + column_wrapper big_ts_col{ + sys_days{year{3023} / month{7} / day{14}} + 7h + 38min + 45s + 418688us, + sys_days{year{723} / month{3} / day{21}} + 14h + 20min + 13s + microseconds{781ms}}; + + table_view expected({big_ts_col}); + auto filepath = temp_env->get_temp_filepath("BigINT96Timestamp.parquet"); + + auto const out_opts = + parquet_writer_options::builder(sink_info{filepath}, expected).int96_timestamps(true).build(); + write_parquet(out_opts); + + auto const in_opts = parquet_reader_options::builder(source_info(filepath)) + .timestamp_type(cudf::data_type(cudf::type_id::TIMESTAMP_MICROSECONDS)) + .build(); + auto const result = read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN()